Spaces:
Runtime error
Runtime error
| import spaces | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForImageTextToText, AutoProcessor | |
| # Load model and processor | |
| MODEL_PATH = "google/gemma-3n-E2B-it" | |
| processor = AutoProcessor.from_pretrained(MODEL_PATH) | |
| model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH, torch_dtype="auto", device_map="auto") | |
| def process_inputs(image, audio): | |
| # Prepare inputs for the model | |
| inputs = processor( | |
| images=image, | |
| audio=audio, | |
| return_tensors="pt" | |
| ).to(model.device, dtype=model.dtype) | |
| # Generate text output | |
| outputs = model.generate( | |
| **inputs, | |
| max_new_tokens=256 | |
| ) | |
| # Decode and return text | |
| text = processor.batch_decode( | |
| outputs, | |
| skip_special_tokens=True, | |
| clean_up_tokenization_spaces=True | |
| )[0] | |
| return text | |
| # Gradio interface | |
| iface = gr.Interface( | |
| fn=process_inputs, | |
| inputs=[ | |
| gr.Image(label="Upload Image"), | |
| gr.Audio(label="Ask Question about the Image") | |
| ], | |
| outputs=gr.Textbox(label="Answer"), | |
| title="Image and Audio Question Answering", | |
| description="Upload an image as context and ask a quesiton about the image. The model will generate a text response." | |
| ) | |
| if __name__ == "__main__": | |
| iface.launch() |