ariG23498's picture
ariG23498 HF Staff
adding logic
59ec2ed
raw
history blame
1.29 kB
import spaces
import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
# Load model and processor
MODEL_PATH = "google/gemma-3n-E2B-it"
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH, torch_dtype="auto", device_map="auto")
@spaces.GPU
def process_inputs(image, audio):
# Prepare inputs for the model
inputs = processor(
images=image,
audio=audio,
return_tensors="pt"
).to(model.device, dtype=model.dtype)
# Generate text output
outputs = model.generate(
**inputs,
max_new_tokens=256
)
# Decode and return text
text = processor.batch_decode(
outputs,
skip_special_tokens=True,
clean_up_tokenization_spaces=True
)[0]
return text
# Gradio interface
iface = gr.Interface(
fn=process_inputs,
inputs=[
gr.Image(label="Upload Image"),
gr.Audio(label="Ask Question about the Image")
],
outputs=gr.Textbox(label="Answer"),
title="Image and Audio Question Answering",
description="Upload an image as context and ask a quesiton about the image. The model will generate a text response."
)
if __name__ == "__main__":
iface.launch()