import gradio as gr
from threading import Thread  # used by the optional streaming sketch below

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
# --- 1. Load a Standard CPU-Friendly Model ---
# No PEFT model needed. We are loading a pre-trained chat model directly.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Load the model and tokenizer. No auth token is needed for a public model,
# and no CPU-specific settings are required.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
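# A possible memory optimisation (an assumption on our part, not part of the
# original app): loading in bfloat16 roughly halves RAM use on CPUs that
# support it. Requires `import torch`:
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)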
print(f"βœ… Model '{model_name}' loaded successfully on CPU!")
# --- 2. Create a Pipeline for Easy Inference ---
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
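
# Optional sanity check (a hypothetical example, not in the original app):
# uncomment to verify generation works before launching the UI.
# print(pipe("Hello! How are you?", max_new_tokens=20)[0]["generated_text"])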
# --- 3. Define the Respond Function for the Chatbot ---
# This function takes the user message and history, and generates a response using the pipeline.
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the prompt using TinyLlama's specific chat template
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
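    # For reference, TinyLlama's chat template wraps each turn in role tags
    # such as <|system|>, <|user|>, and <|assistant|>; apply_chat_template
    # produces that format for us, so it never needs to be hard-coded here.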
    # Generate the response. This is a blocking call and will be slow on a
    # CPU; see the streaming sketch after this function for an alternative.
    outputs = pipe(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    # The pipeline output includes the prompt, so slice it off to keep only
    # the newly generated text. Slicing by length is safer than str.split(),
    # which fails if the prompt text happens to recur in the output.
    full_response = outputs[0]["generated_text"]
    new_response = full_response[len(prompt):]
    return new_response
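
# --- Optional: streaming variant (a sketch, not wired into the UI below) ---
# The blocking pipeline call above waits for the full response. As a hedged
# alternative, this sketch uses TextIteratorStreamer and a background thread
# to yield partial text as tokens arrive; gr.ChatInterface also accepts
# generator functions, so it could replace `respond`. Untested here, and the
# name `respond_streaming` is ours, not part of the original app.
def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    # model.generate runs in a worker thread while we stream partial output.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial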
# --- 4. Launch the Gradio Interface ---
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
        gr.Slider(minimum=10, maximum=512, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="TinyLlama 1.1B Chat",
    description="A simple chatbot running on a CPU-friendly model from Hugging Face.",
)
if __name__ == "__main__":
    demo.launch()
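    # If running locally rather than on Hugging Face Spaces (an assumption on
    # our part), Gradio can expose a temporary public URL instead:
    # demo.launch(share=True)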