import gradio as gr
import os
from threading import Thread
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
# --- 1. Load a Standard CPU-Friendly Model ---
# No PEFT model needed. We are loading a pre-trained chat model directly.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Load the model and tokenizer
# No need for tokens if it's a public model. No special settings for CPU.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"✅ Model '{model_name}' loaded successfully on CPU!")
# --- 2. Create a Pipeline for Easy Inference ---
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# --- 3. Define the Respond Function for the Chatbot ---
# This function takes the user message and history, and generates a response using the pipeline.
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the prompt using the specific chat template for TinyLlama
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate the response
    # This will be slow on a CPU and will wait for the full response.
    outputs = pipe(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # Extract only the generated text from the full output
    full_response = outputs[0]["generated_text"]
    # The response includes the prompt, so we split it to get only the new part
    new_response = full_response.split(prompt)[1]
    return new_response
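# Optional: streaming variant (a sketch, not part of the original app; the name respond_stream
# is an assumption). The Thread and TextIteratorStreamer imports above are only needed here.
# Instead of blocking until generation finishes, this runs model.generate in a background
# thread and yields partial text as tokens arrive; gr.ChatInterface accepts generator
# functions, so passing respond_stream instead of respond below would stream the reply.
def respond_stream(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the same chat-template prompt as respond()
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt")

    # skip_prompt=True keeps the echoed prompt out of the streamed text
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # Run generation in a background thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial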
# --- 4. Launch the Gradio Interface ---
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
        gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="TinyLlama 1.1B Chat",
    description="A simple chatbot running on a CPU-friendly model from Hugging Face.",
)
if __name__ == "__main__":
    demo.launch()
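# To try the app locally (a usage note, not part of the original file):
#   pip install gradio torch transformers
#   python app.py          # "app.py" is an assumed filename; use whatever this file is saved as
# Gradio then prints a local URL (http://127.0.0.1:7860 by default) where the chat UI is served.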