Hrushi02 committed · verified
Commit 3f9ded5 · 1 Parent(s): e0f105d

Update app.py

Files changed (1):
  1. app.py (+13 −46)
app.py CHANGED
```diff
@@ -1,7 +1,6 @@
 import gradio as gr
 import os
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from peft import PeftModel
+from unsloth import FastLanguageModel
 import torch
 
 # Load Hugging Face API token securely
@@ -14,27 +13,22 @@ if not api_token:
 base_model_name = "unsloth/qwen2.5-math-7b-bnb-4bit"
 peft_model_name = "Hrushi02/Root_Math"
 
-# Quantization config for 4-bit loading
-quantization_config = BitsAndBytesConfig(
+# Load base model with Unsloth (handles 4-bit quantization automatically)
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=base_model_name,
+    max_seq_length=2048,
+    dtype=torch.float16,
     load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,  # Optional: saves extra ~0.4 bits/param
+    token=api_token,
 )
 
-# Load base model with quantization
-base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_name,
-    quantization_config=quantization_config,
-    device_map="auto",
-    token=api_token
+# Load fine-tuned PEFT adapter
+model = FastLanguageModel.from_pretrained(
+    model=base_model,  # Use the loaded base
+    adapter_name=peft_model_name,
+    token=api_token,
 )
 
-# Load fine-tuned model (LoRA adapter)
-model = PeftModel.from_pretrained(base_model, peft_model_name, token=api_token)
-
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=api_token)
-
 # Ensure pad_token is set
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
```
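Reviewer note: as committed, the adapter-loading block references `base_model`, a name the new code never defines (the first `from_pretrained` call binds `model`), so it will raise a `NameError` at startup; `model=` and `adapter_name=` are also not keyword arguments I can confirm on `FastLanguageModel.from_pretrained`. A minimal sketch of two loading patterns that do match documented usage, assuming `Hrushi02/Root_Math` holds standard LoRA adapter weights:

```python
import torch
from unsloth import FastLanguageModel

# Option A (sketch): pass the adapter repo directly as model_name; Unsloth
# reads the adapter config and loads the 4-bit base model it points to.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=peft_model_name,  # "Hrushi02/Root_Math"
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
    token=api_token,
)

# Option B (sketch): load the base through Unsloth, then attach the LoRA
# adapter with peft, as the pre-commit code did.
from peft import PeftModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,  # "unsloth/qwen2.5-math-7b-bnb-4bit"
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
    token=api_token,
)
model = PeftModel.from_pretrained(model, peft_model_name, token=api_token)
```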
```diff
@@ -64,7 +58,7 @@ def respond(
     )
 
     # Tokenize input
-    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+    inputs = tokenizer([prompt], return_tensors="pt")
 
     # Generate full response (non-streaming for reliability)
     with torch.no_grad():
```
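A second reviewer note on this hunk: dropping `.to(model.device)` leaves the tokenized inputs on the CPU, while the 4-bit weights are loaded onto the GPU, and `model.generate` raises a device-mismatch error when CPU tensors are fed to a CUDA model. A sketch of the safer form:

```python
# Sketch: keep inputs on the same device as the model; harmless on CPU-only
# hosts, required when the 4-bit weights sit on a GPU.
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
```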
```diff
@@ -80,33 +74,6 @@
     full_response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
     yield full_response
 
-# For token-by-token streaming (matching original), uncomment and use this instead:
-# from transformers import TextIteratorStreamer
-#
-# def respond(...):
-#     ... (same up to inputs)
-#
-#     response = ""
-#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-#     generation_kwargs = {
-#         **inputs,
-#         "max_new_tokens": max_tokens,
-#         "temperature": temperature,
-#         "top_p": top_p,
-#         "do_sample": True,
-#         "pad_token_id": tokenizer.eos_token_id,
-#         "repetition_penalty": 1.1,
-#         "streamer": streamer,
-#     }
-#     # Run generation in thread for async streaming
-#     import threading
-#     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-#     thread.start()
-#     for new_text in streamer:
-#         response += new_text
-#         yield response
-#     thread.join()
-
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
```
 