Tiago Caldeira committed

Commit 9f37a6e · Parent(s): 44e14ac

different approach using unsloth
Files changed:
- _app.py  +75 -0
- app.py  +29 -36
- requirements.txt  +5 -4
    	
_app.py ADDED

@@ -0,0 +1,75 @@
+import gradio as gr
+from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+import torch
+import textwrap
+from huggingface_hub import login
+import os
+
+# Log in using the HF token (automatically read from secret)
+hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+login(token=hf_token)
+
+
+
+# Load model and processor
+model_id = "google/gemma-3n-e2b-it"
+model_id = "google/gemma-3n-E2B"
+model_id = "lmstudio-community/gemma-3n-E2B-it-MLX-4bit"
+model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
+
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = Gemma3nForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.float32,
+    device_map="cpu"
+).eval()
+
+# Helper to format output
+def print_response(text: str) -> str:
+    return "\n".join(textwrap.fill(line, 100) for line in text.split("\n"))
+
+# Inference function for text-only input
+def predict_text(system_prompt: str, user_prompt: str) -> str:
+    messages = [
+        {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
+        {"role": "user", "content": [{"type": "text", "text": user_prompt.strip()}]},
+    ]
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(model.device)
+
+    input_len = inputs["input_ids"].shape[-1]
+
+    with torch.inference_mode():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=500,
+            do_sample=False,
+            use_cache=False  # Fixes CPU bug
+        )
+
+    gen = output[0][input_len:]
+    decoded = processor.decode(gen, skip_special_tokens=True)
+    return print_response(decoded)
+
+# Gradio Interface
+demo = gr.Interface(
+    fn=predict_text,
+    inputs=[
+        gr.Textbox(lines=2, label="System Prompt", value="You are a helpful assistant."),
+        gr.Textbox(lines=4, label="User Prompt", placeholder="Ask something..."),
+    ],
+    outputs=gr.Textbox(label="Gemma 3n Response"),
+    title="Gemma 3n Text-Only Chat",
+    description="Interact with the Gemma 3n language model using plain text. Image input not required.",
+)
+
+if __name__ == "__main__":
+    demo.launch()
+
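Usage note (not part of the commit): with _app.py running as a Gradio app, its predict_text endpoint can be exercised from Python via gradio_client. A minimal sketch, assuming a public Space; "username/gemma-3n-demo" is a placeholder id, and "/predict" is the default endpoint name for a gr.Interface.

from gradio_client import Client

client = Client("username/gemma-3n-demo")  # placeholder Space id
result = client.predict(
    "You are a helpful assistant.",                 # System Prompt
    "Summarise what Gemma 3n is in one sentence.",  # User Prompt
    api_name="/predict",                            # default gr.Interface endpoint
)
print(result)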
    	
app.py CHANGED

@@ -1,63 +1,56 @@
-import gradio as gr
-from transformers import AutoProcessor, Gemma3nForConditionalGeneration
 import torch
+import gradio as gr
+from unsloth import FastModel
+from transformers import TextStreamer, AutoTokenizer
 import textwrap
-from huggingface_hub import login
-import os
-
-# Log in using the HF token (automatically read from secret)
-hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
-login(token=hf_token)
-
 
+# Load model (4-bit quantized)
+model, tokenizer = FastModel.from_pretrained(
+    model_name = "unsloth/gemma-3n-E4B-it",
+    dtype = None,  # Auto-detect FP16/32
+    max_seq_length = 1024,
+    load_in_4bit = True,
+    full_finetuning = False,
+    # token = "hf_..."  # Uncomment if model is gated
+)
 
-
-
-
-model_id = "lmstudio-community/gemma-3n-E2B-it-MLX-4bit"
-model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
-
-processor = AutoProcessor.from_pretrained(model_id)
-model = Gemma3nForConditionalGeneration.from_pretrained(
-    model_id,
-    torch_dtype=torch.float32,
-    device_map="cpu"
-).eval()
+model.eval()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
 
-# Helper to format output
+# Format output
 def print_response(text: str) -> str:
     return "\n".join(textwrap.fill(line, 100) for line in text.split("\n"))
 
-# Inference function for text-only input
+# Inference function for Gradio
 def predict_text(system_prompt: str, user_prompt: str) -> str:
     messages = [
         {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
         {"role": "user", "content": [{"type": "text", "text": user_prompt.strip()}]},
     ]
 
-    inputs = processor.apply_chat_template(
+    inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
-        return_tensors="pt"
-    ).to(model.device)
-
-    input_len = inputs["input_ids"].shape[-1]
+        return_tensors="pt",
+    ).to(device)
 
     with torch.inference_mode():
-        output = model.generate(
+        outputs = model.generate(
             **inputs,
-            max_new_tokens=500,
-            do_sample=False,
-            use_cache=False  # Fixes CPU bug
+            max_new_tokens=256,
+            temperature=1.0,
+            top_p=0.95,
+            top_k=64,
         )
 
-    gen = output[0][input_len:]
-    decoded = processor.decode(gen, skip_special_tokens=True)
+    generated = outputs[0][inputs["input_ids"].shape[-1]:]
+    decoded = tokenizer.decode(generated, skip_special_tokens=True)
     return print_response(decoded)
 
-# Gradio Interface
+# Gradio UI
 demo = gr.Interface(
     fn=predict_text,
     inputs=[

@@ -66,7 +59,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Textbox(label="Gemma 3n Response"),
     title="Gemma 3n Text-Only Chat",
-    description="Interact with the Gemma 3n language model using plain text. Image input not required.",
+    description="Interact with the Gemma 3n language model using plain text. 4-bit quantized for efficiency.",
 )
 
 if __name__ == "__main__":
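Note: the rewritten app.py imports TextStreamer and AutoTokenizer but uses neither (the tokenizer comes from FastModel.from_pretrained). If token-by-token console output is wanted later, a minimal sketch of wiring the streamer into the existing generate call; names match the new app.py, and streamer= is a standard transformers generate argument.

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=streamer,  # prints decoded tokens to stdout as they are generated
    )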
    	
requirements.txt CHANGED

@@ -1,6 +1,7 @@
-transformers>=4.42.0
-torch
+#transformers>=4.42.0
+#torch
 gradio
-accelerate
+#accelerate
 timm
-bitsandbytes
+#bitsandbytes
+unsloth
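Environment note (assumption, not part of the commit): the unsloth bnb-4bit loading path generally expects a CUDA GPU, while the new app.py still falls back to CPU for device placement. A quick sanity check before launching the Space locally:

import torch

print("CUDA available:", torch.cuda.is_available())  # 4-bit bnb loading is typically GPU-only
try:
    from unsloth import FastModel  # noqa: F401
    print("unsloth import OK")
except ImportError as exc:
    print("unsloth missing:", exc)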