Spaces: Running on Zero
```python
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests
from io import BytesIO
import spaces  # Import spaces for ZeroGPU support
# Model repository and loading arguments shared by the processor and the model
repo_name = "allenai/Molmo-7B-D-0924"
arguments = {
    "device_map": "auto",       # Device will be set automatically
    "torch_dtype": "auto",      # Use appropriate precision
    "trust_remote_code": True   # Allow loading the model's remote code
}

# Load the processor (this part doesn't need the GPU yet)
processor = AutoProcessor.from_pretrained(repo_name, **arguments)
# Define the function for image description.
# The @spaces.GPU decorator ensures the function gets GPU access when it is called.
@spaces.GPU
def describe_image(image, question):
    # Load the model inside the function and move it to the GPU
    model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments).to('cuda')
    # Process the uploaded image along with the user's question
    inputs = processor.process(
        images=[image],
        text=question if question else "Describe this image in great detail without missing any piece of information"
    )

    # Move inputs to the model device (GPU) and add a batch dimension
    inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}

    # Generate output using the model on the GPU
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=2048, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Decode only the newly generated tokens (everything after the prompt)
    generated_tokens = output[0, inputs["input_ids"].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text
# Gradio interface
def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown("# Long Image Description with Molmo-7B-D-0924\n### Upload an image and ask a question about it!")

        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(placeholder="Ask a question about the image (e.g., 'What is happening in this image?')", label="Question (Optional)")

        output_text = gr.Textbox(label="Image Description", interactive=False)

        # Submit button to generate the description
        submit_btn = gr.Button("Generate Description")

        # Callback to run when the submit button is clicked
        submit_btn.click(
            fn=describe_image,
            inputs=[image_input, question_input],
            outputs=output_text
        )

    # Launch the Gradio interface
    demo.launch()

# Launch the Gradio app
gradio_app()
```
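A Space built around this `app.py` also needs a `requirements.txt` so the runtime can install its Python dependencies. Below is a minimal sketch, assuming the stock ZeroGPU (Gradio SDK) environment; the `einops` and `torchvision` entries follow the Molmo model card's install note, and exact version pins are left to you:

```
# requirements.txt (sketch; versions unpinned, adjust to your Space)
gradio
torch
transformers
accelerate     # needed for device_map="auto"
einops         # used by Molmo's remote modeling code
torchvision    # used by Molmo's image preprocessing
Pillow
spaces         # ZeroGPU helper; preinstalled on Spaces, harmless to list
```

If a single generation tends to outlast the default ZeroGPU time slice, the decorator also accepts a duration hint, for example `@spaces.GPU(duration=120)`.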