Spaces:

docilio
/

3nhance

Sleeping

App Files Files Community

Tiago Caldeira committed on Aug 4

Commit

3d582fc

1 Parent(s): 9f37a6e

different approach using unsloth model

Browse files

Files changed (1) hide show

app.py +24 -29

app.py CHANGED Viewed

@@ -1,28 +1,27 @@
 import torch
 import gradio as gr
-from unsloth import FastModel
-from transformers import TextStreamer, AutoTokenizer
 import textwrap
-# Load model (4-bit quantized)
-model, tokenizer = FastModel.from_pretrained(
-    model_name = "unsloth/gemma-3n-E4B-it",
-    dtype = None,  # Auto-detect FP16/32
-    max_seq_length = 1024,
-    load_in_4bit = True,
-    full_finetuning = False,
-    # token = "hf_..."  # Uncomment if model is gated
 )
 model.eval()
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-# 🛠️ Format output
 def print_response(text: str) -> str:
     """Return *text* with every line wrapped to at most 100 columns.

     The input is split on newline characters, each piece is wrapped
     independently with ``textwrap.fill``, and the wrapped pieces are joined
     back together with newlines, so existing line breaks are kept.
     """
     wrapped_lines = []
     for raw_line in text.split("\n"):
         wrapped_lines.append(textwrap.fill(raw_line, 100))
     return "\n".join(wrapped_lines)
-# 🔍 Inference function for Gradio
 def predict_text(system_prompt: str, user_prompt: str) -> str:
     messages = [
         {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
@@ -34,23 +33,24 @@ def predict_text(system_prompt: str, user_prompt: str) -> str:
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
-        return_tensors="pt",
-    ).to(device)
     with torch.inference_mode():
-        outputs = model.generate(
             **inputs,
-            max_new_tokens=256,
-            temperature=1.0,
-            top_p=0.95,
-            top_k=64,
         )
-    generated = outputs[0][inputs["input_ids"].shape[-1]:]
     decoded = tokenizer.decode(generated, skip_special_tokens=True)
     return print_response(decoded)
-# 🎛️ Gradio UI
 demo = gr.Interface(
     fn=predict_text,
     inputs=[
@@ -58,10 +58,5 @@ demo = gr.Interface(
         gr.Textbox(lines=4, label="User Prompt", placeholder="Ask something..."),
     ],
     outputs=gr.Textbox(label="Gemma 3n Response"),
-    title="Gemma 3n Text-Only Chat",
-    description="Interact with the Gemma 3n language model using plain text. 4-bit quantized for efficiency.",
-)
-if __name__ == "__main__":
-    demo.launch()

 import torch
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 import textwrap
+model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Load model on CPU in FP32. NOTE(review): model_id names a "bnb-4bit" checkpoint — confirm it loads without bitsandbytes.
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="cpu",            # Force CPU
+    torch_dtype=torch.float32,   # Use FP32 to ensure CPU compatibility
 )
 model.eval()
+# Helper to format response nicely
 def print_response(text: str) -> str:
     """Return *text* with every line wrapped to at most 100 columns.

     The input is split on newline characters, each piece is wrapped
     independently with ``textwrap.fill``, and the wrapped pieces are joined
     back together with newlines, so existing line breaks are kept.
     """
     wrapped_lines = []
     for raw_line in text.split("\n"):
         wrapped_lines.append(textwrap.fill(raw_line, 100))
     return "\n".join(wrapped_lines)
+# Inference function for Gradio
 def predict_text(system_prompt: str, user_prompt: str) -> str:
     messages = [
         {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
+        return_tensors="pt"
+    ).to("cpu")
+    input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
+        output = model.generate(
             **inputs,
+            max_new_tokens=300,
+            do_sample=False,
+            use_cache=False  # Important for CPU compatibility
         )
+    generated = output[0][input_len:]
     decoded = tokenizer.decode(generated, skip_special_tokens=True)
     return print_response(decoded)
+# Gradio UI
 demo = gr.Interface(
     fn=predict_text,
     inputs=[
         gr.Textbox(lines=4, label="User Prompt", placeholder="Ask something..."),
     ],
     outputs=gr.Textbox(label="Gemma 3n Response"),
+    title="Gemma 3n Chat (CPU-friendly