import gradio as gr
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline

# --- 1. Load a Standard CPU-Friendly Model ---
# No PEFT adapter is needed; a pre-trained chat model is loaded directly.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the model and tokenizer.
# No auth token is needed for a public model, and no special settings are
# required for CPU inference.
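# The first call downloads the weights and tokenizer files from the Hugging Face
# Hub; later runs reuse the local cache.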
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"✅ Model '{model_name}' loaded successfully on CPU!")

# --- 2. Create a Pipeline for Easy Inference ---
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
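# No device argument is passed, so the pipeline stays on the CPU where the model
# was loaded; device=-1 (or device="cpu") could be passed to make that explicit.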

# --- 3. Define the Respond Function for the Chatbot ---
# This function takes the user message and history, and generates a response using the pipeline.
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the prompt using the specific chat template for TinyLlama
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate the response.
    # This runs on the CPU, so it is slow and blocks until the full response is
    # ready (see the streaming sketch below for an incremental alternative).
    outputs = pipe(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    
    # The pipeline output contains the prompt followed by the completion, so
    # slice the prompt off the front to keep only the newly generated text
    # (splitting on the prompt string breaks if the model repeats it).
    full_response = outputs[0]["generated_text"]
    new_response = full_response[len(prompt):]
    
    return new_response
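
# --- Optional: Streaming Variant (Sketch) ---
# The Thread and TextIteratorStreamer imports above are only needed for streaming.
# This is a sketch of how streaming could look, not part of the interface below:
# to try it, pass `respond_stream` instead of `respond` to gr.ChatInterface.
# TextIteratorStreamer yields text as model.generate produces it in a background
# thread, and gr.ChatInterface accepts generator functions, so the reply appears
# incrementally instead of all at once.
def respond_stream(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt")

    # skip_prompt=True makes the streamer yield only newly generated tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    # Yield the growing partial response; Gradio re-renders the chat on each yield.
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial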

# --- 4. Launch the Gradio Interface ---
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
        gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="TinyLlama 1.1B Chat",
    description="A simple chatbot running on a CPU-friendly model from Hugging Face."
)
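
# demo.launch() starts a local Gradio server (http://127.0.0.1:7860 by default);
# on a Hugging Face Space the same call serves the hosted app.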

if __name__ == "__main__":
    demo.launch()