Spaces:

ariG23498
/

gemma3n-image-audio

Runtime error

ariG23498 HF Staff committed on Jun 30

Commit

59ec2ed

1 Parent(s): 2395e7b

adding logic

Files changed (3) hide show

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-title: Gemma3n Image Audio
 emoji: 😻
-colorFrom: purple
 colorTo: purple
 sdk: gradio
 sdk_version: 5.35.0

 ---
+title: Gemma3n Visual (Audio) Question Answering
 emoji: 😻
+colorFrom: blue
 colorTo: purple
 sdk: gradio
 sdk_version: 5.35.0

app.py CHANGED Viewed

@@ -1,7 +1,47 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import spaces
 import gradio as gr
+import torch
+from transformers import AutoModelForImageTextToText, AutoProcessor
+# Load model and processor
+MODEL_PATH = "google/gemma-3n-E2B-it"
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH, torch_dtype="auto", device_map="auto")
+@spaces.GPU
+def process_inputs(image, audio):
+    # Prepare inputs for the model
+    inputs = processor(
+        images=image,
+        audio=audio,
+        return_tensors="pt"
+    ).to(model.device, dtype=model.dtype)
+    # Generate text output
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=256
+    )
+    # Decode and return text
+    text = processor.batch_decode(
+        outputs,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True
+    )[0]
+    return text
+# Gradio interface
+iface = gr.Interface(
+    fn=process_inputs,
+    inputs=[
+        gr.Image(label="Upload Image"),
+        gr.Audio(label="Ask Question about the Image")
+    ],
+    outputs=gr.Textbox(label="Answer"),
+    title="Image and Audio Question Answering",
+    description="Upload an image as context and ask a quesiton about the image. The model will generate a text response."
+)
+if __name__ == "__main__":
+    iface.launch()

requirements.txt ADDED Viewed

+spaces
+gradio
+transformers==4.53.0