Spaces:

Hrushi02
/

Root_Math

Sleeping

App Files Files Community

Hrushi02 committed on Oct 15

Commit

646b139

verified ·

1 Parent(s): 8db5361

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -50

app.py CHANGED Viewed

@@ -1,19 +1,55 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-from transformers import AutoTokenizer
-# Initialize client and tokenizer
-client = InferenceClient()
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
-def respond(message, history):
-    # Build messages list from history + new message
-    messages = []
-    for user_msg, assistant_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
     # Apply chat template
@@ -21,44 +57,50 @@ def respond(message, history):
         messages, tokenize=False, add_generation_prompt=True
     )
-    # Generate response (non-streaming; for streaming, use generator below)
-    response = client.text_generation(
-        prompt,
-        model="HuggingFaceH4/zephyr-7b-beta",
-        max_new_tokens=256,
-        temperature=0.7,
-        do_sample=True,
-        top_k=50,
-        top_p=0.95,
-        repetition_penalty=1.1,
-        return_full_text=False  # Only new tokens
-    )
-    return response
-# For streaming (if your original used stream=True), replace the generation with:
-# def respond(message, history):
-#     ... (same messages and prompt)
-#     for chunk in client.text_generation(
-#         prompt,
-#         model="HuggingFaceH4/zephyr-7b-beta",
-#         max_new_tokens=256,
-#         temperature=0.7,
-#         do_sample=True,
-#         top_k=50,
-#         top_p=0.95,
-#         repetition_penalty=1.1,
-#         return_full_text=False,
-#         stream=True
-#     ):
-#         yield chunk  # Yield chunks for Gradio streaming
-# Gradio interface with fixed chatbot format
 demo = gr.ChatInterface(
     respond,
-    chatbot=gr.Chatbot(type="messages"),  # Fixes deprecation warning
     title="Root Math Chatbot",
-    description="Ask math questions about roots and equations!"
 )
 if __name__ == "__main__":
-    demo.launch()

+```python
 import gradio as gr
+import os
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+import torch
+# Load Hugging Face API token securely
+api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+if not api_token:
+    raise ValueError("❌ ERROR: Hugging Face API token is not set. Please set it as an environment variable.")
+# Define model names
+base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
+peft_model_name = "Hrushi02/Root_Math"
+# Load base model with authentication
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    use_auth_token=api_token  # ✅ Correct
+)
+# Load fine-tuned model
+model = PeftModel.from_pretrained(base_model, peft_model_name, token=api_token)
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)
+# Ensure pad_token is set
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    # Build messages list
+    messages = [{"role": "system", "content": system_message}]
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
     messages.append({"role": "user", "content": message})
     # Apply chat template
         messages, tokenize=False, add_generation_prompt=True
     )
+    # Tokenize input
+    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+    # Generate response with streaming
+    with torch.no_grad():
+        for new_token in model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+            repetition_penalty=1.1,
+            streamer=None,  # We'll handle streaming manually
+        ):
+            # Decode the new token
+            new_token_decoded = tokenizer.decode(new_token[-1:], skip_special_tokens=True)
+            yield new_token_decoded
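+# Note: model.generate() returns the complete output tensor in a single call, so the loop above does not actually yield token-by-token.
+# For true token-by-token streaming in Gradio (matching the original's per-token yield), pass a transformers.TextIteratorStreamer as `streamer`, run generate() in a background thread, and yield the accumulated text; for full-sentence streaming, accumulate and yield periodically instead.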
+"""
+For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+"""
 demo = gr.ChatInterface(
     respond,
+    additional_inputs=[
+        gr.Textbox(value="You are a helpful math assistant specialized in solving equations and finding roots.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+    chatbot=gr.Chatbot(type="messages"),  # Modern format to avoid deprecation
     title="Root Math Chatbot",
+    description="A fine-tuned Qwen2.5-Math model for solving roots and math problems."
 )
 if __name__ == "__main__":
+    demo.launch()
+```