import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# --- 1. Load a Standard CPU-Friendly Model ---
# No PEFT model needed. We are loading a pre-trained chat model directly.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the model and tokenizer.
# No auth token is needed for a public model, and no special settings for CPU.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"✅ Model '{model_name}' loaded successfully on CPU!")

# --- 2. Create a Pipeline for Easy Inference ---
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# --- 3. Define the Respond Function for the Chatbot ---
# This function takes the user message and chat history and generates a response using the pipeline.
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the prompt using TinyLlama's chat template.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt = pipe.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Generate the response.
    # This runs on a CPU, so it is slow and blocks until the full response is ready.
    outputs = pipe(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # The pipeline output includes the prompt, so slice it off to keep only the newly generated text.
    full_response = outputs[0]["generated_text"]
    new_response = full_response[len(prompt):]
    return new_response

# --- 4. Launch the Gradio Interface ---
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
        gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="TinyLlama 1.1B Chat",
    description="A simple chatbot running on a CPU-friendly model from Hugging Face.",
)

if __name__ == "__main__":
    demo.launch()
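
# --- 5. Optional Streaming Variant (sketch, not used above) ---
# The original imports (TextIteratorStreamer, Thread) suggested a streaming setup,
# but the blocking respond() above never uses them. The function below is a minimal,
# untested sketch of how token-by-token streaming could work, relying on the fact
# that gr.ChatInterface accepts a generator function. To try it, move it above the
# gr.ChatInterface(...) call and pass respond_streaming in place of respond.
# The name respond_streaming and this wiring are illustrative assumptions, not part
# of the original app.
from threading import Thread

from transformers import TextIteratorStreamer


def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Build the same chat-template prompt as the blocking version.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Run model.generate() in a background thread and stream tokens through the streamer.
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    # Yield the growing partial response so Gradio can update the chat live.
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial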