Spaces:

Hrushi02
/

Root_Math

Sleeping

App Files Community

Hrushi02 committed on Oct 15

Commit

3f9ded5

verified ·

1 Parent(s): e0f105d

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -46

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import os
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from peft import PeftModel
 import torch
 # Load Hugging Face API token securely
@@ -14,27 +13,22 @@ if not api_token:
 base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
 peft_model_name = "Hrushi02/Root_Math"
-# Quantization config for 4-bit loading
-quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,  # Optional: saves extra ~0.4 bits/param
 )
-# Load base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_name,
-    quantization_config=quantization_config,
-    device_map="auto",
-    token=api_token
 )
-# Load fine-tuned model (LoRA adapter)
-model = PeftModel.from_pretrained(base_model, peft_model_name, token=api_token)
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)
 # Ensure pad_token is set
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
@@ -64,7 +58,7 @@ def respond(
     )
     # Tokenize input
-    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
     # Generate full response (non-streaming for reliability)
     with torch.no_grad():
@@ -80,33 +74,6 @@ def respond(
         full_response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
         yield full_response
-# For token-by-token streaming (matching original), uncomment and use this instead:
-# from transformers import TextIteratorStreamer
-#
-# def respond(...):
-#     ... (same up to inputs)
-#
-#     response = ""
-#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-#     generation_kwargs = {
-#         **inputs,
-#         "max_new_tokens": max_tokens,
-#         "temperature": temperature,
-#         "top_p": top_p,
-#         "do_sample": True,
-#         "pad_token_id": tokenizer.eos_token_id,
-#         "repetition_penalty": 1.1,
-#         "streamer": streamer,
-#     }
-#     # Run generation in thread for async streaming
-#     import threading
-#     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-#     thread.start()
-#     for new_text in streamer:
-#         response += new_text
-#         yield response
-#     thread.join()
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """

 import gradio as gr
 import os
+from unsloth import FastLanguageModel
 import torch
 # Load Hugging Face API token securely
 base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
 peft_model_name = "Hrushi02/Root_Math"
+# Load base model with Unsloth (handles 4-bit quantization automatically)
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=base_model_name,
+    max_seq_length=2048,
+    dtype=torch.float16,
     load_in_4bit=True,
+    token=api_token,
 )
+# Load fine-tuned PEFT adapter
+model = FastLanguageModel.from_pretrained(
+    model=base_model,  # Use the loaded base
+    adapter_name=peft_model_name,
+    token=api_token,
 )
 # Ensure pad_token is set
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     )
     # Tokenize input
+    inputs = tokenizer([prompt], return_tensors="pt")
     # Generate full response (non-streaming for reliability)
     with torch.no_grad():
         full_response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
         yield full_response
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """