import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from PIL import Image
import numpy as np
import spaces
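# The skills used below (query / caption / point / detect) come from the model's
# trust_remote_code implementation; loading targets CUDA, so a GPU is assumed
# to be available (this Space runs on ZeroGPU).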

# Initialize the model once, globally, on first use
model = None

def load_model():
    global model
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            "moondream/moondream3-preview",
            trust_remote_code=True,
            dtype=torch.bfloat16,
            device_map={"": "cuda"},
        )
        # Compile once after loading to speed up later calls
        model.compile()
    return model
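
# Sketch of direct (non-UI) use of the skills, mirroring the handlers below.
# Call names and return keys ("answer", "caption") are taken from process_image;
# "example.jpg" is a hypothetical local file, not part of this Space:
#
#   m = load_model()
#   img = Image.open("example.jpg")
#   print(m.query(image=img, question="What is shown?", reasoning=True)["answer"])
#   print(m.caption(img, length="short")["caption"])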

# On ZeroGPU Spaces a GPU is allocated per call; this decorator is why
# `spaces` is imported above
@spaces.GPU
def process_image(image, task, question, caption_length, object_query, reasoning, temperature, top_p, max_tokens):
    model = load_model()
    # Sampling settings shared by the skill calls below
    settings = {
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens
    }
    if task == "Query":
        if image is not None:
            result = model.query(
                image=Image.fromarray(image),
                question=question,
                reasoning=reasoning,
                settings=settings
            )
        else:
            # Text-only query: omit the image argument
            result = model.query(
                question=question,
                reasoning=reasoning,
                settings=settings
            )
        return result["answer"], None, None
    elif task == "Caption":
        if image is None:
            return "Please upload an image for captioning", None, None
        result = model.caption(
            Image.fromarray(image),
            length=caption_length.lower(),
            settings=settings
        )
        return result["caption"], None, None
    elif task == "Point":
        if image is None:
            return "Please upload an image for point detection", None, None
        result = model.point(Image.fromarray(image), object_query)
        # Visualize points on the image
        img_with_points = image.copy()
        h, w = img_with_points.shape[:2]
        # Import cv2 once, outside the loop; fall back to NumPy drawing without it
        try:
            import cv2
        except ImportError:
            cv2 = None
        points_text = "Points found:\n"
        for i, point in enumerate(result.get("points", [])):
            # Coordinates are normalized to [0, 1]; scale to pixel positions
            x = int(point['x'] * w)
            y = int(point['y'] * h)
            if cv2 is not None:
                # Draw a filled red circle at each point
                cv2.circle(img_with_points, (x, y), 10, (255, 0, 0), -1)
            else:
                # NumPy fallback: fill a disc of radius 5 around (x, y)
                for dx in range(-5, 6):
                    for dy in range(-5, 6):
                        if dx * dx + dy * dy <= 25:
                            px, py = x + dx, y + dy
                            if 0 <= px < w and 0 <= py < h:
                                img_with_points[py, px] = [255, 0, 0]
            points_text += f"Point {i+1}: x={point['x']:.3f}, y={point['y']:.3f}\n"
        return points_text, img_with_points, None
    elif task == "Detect":
        if image is None:
            return "Please upload an image for object detection", None, None
        detect_settings = settings.copy()
        detect_settings["max_objects"] = 10
        result = model.detect(Image.fromarray(image), object_query, settings=detect_settings)
        # Visualize bounding boxes
        img_with_boxes = image.copy()
        h, w = img_with_boxes.shape[:2]
        boxes_text = "Objects detected:\n"
        for i, obj in enumerate(result.get("objects", [])):
            # Box coordinates are normalized to [0, 1]; scale to pixel positions
            x_min = int(obj['x_min'] * w)
            y_min = int(obj['y_min'] * h)
            x_max = int(obj['x_max'] * w)
            y_max = int(obj['y_max'] * h)
            # Draw a green bounding box directly into the array
            thickness = 3
            # Top and bottom borders
            img_with_boxes[y_min:y_min+thickness, x_min:x_max] = [0, 255, 0]
            img_with_boxes[y_max-thickness:y_max, x_min:x_max] = [0, 255, 0]
            # Left and right borders
            img_with_boxes[y_min:y_max, x_min:x_min+thickness] = [0, 255, 0]
            img_with_boxes[y_min:y_max, x_max-thickness:x_max] = [0, 255, 0]
            boxes_text += f"Object {i+1}: x_min={obj['x_min']:.3f}, y_min={obj['y_min']:.3f}, x_max={obj['x_max']:.3f}, y_max={obj['y_max']:.3f}\n"
        return boxes_text, None, img_with_boxes
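
# Sketch: because detections are normalized, cropping one out of the original
# NumPy image reuses the same pixel scaling as above (illustration only;
# assumes result["objects"] is non-empty):
#
#   obj = result["objects"][0]
#   crop = image[int(obj["y_min"] * h):int(obj["y_max"] * h),
#                int(obj["x_min"] * w):int(obj["x_max"] * w)]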

with gr.Blocks(title="Moondream 3 Preview", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌙 Moondream 3 Preview - Vision Language Model

        Experience Moondream 3, a state-of-the-art vision language model with a
        mixture-of-experts architecture. This demo showcases all four skills:
        Query, Caption, Point, and Detect.

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image (optional for Query)", type="numpy")
            task_type = gr.Radio(
                choices=["Query", "Caption", "Point", "Detect"],
                value="Query",
                label="Select Task"
            )

            with gr.Column(visible=True) as query_options:
                question_input = gr.Textbox(
                    label="Question",
                    placeholder="Ask anything about the image or enter a text-only question",
                    lines=2
                )
                reasoning_toggle = gr.Checkbox(
                    label="Enable Reasoning (better for complex questions)",
                    value=True
                )

            with gr.Column(visible=False) as caption_options:
                caption_length = gr.Radio(
                    choices=["Short", "Normal", "Long"],
                    value="Normal",
                    label="Caption Length"
                )

            with gr.Column(visible=False) as point_detect_options:
                object_query_input = gr.Textbox(
                    label="Object to Find",
                    placeholder="e.g., 'person wearing red shirt', 'car', 'dog'",
                    lines=1
                )
            gr.Markdown("### Advanced Settings")
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p"
                )
                max_tokens = gr.Slider(
                    minimum=50,
                    maximum=2048,
                    value=512,
                    step=50,
                    label="Max Tokens"
                )

            submit_btn = gr.Button("🚀 Process", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Output",
                lines=10,
                show_copy_button=True
            )
            output_image_points = gr.Image(
                label="Visualization (Points)",
                visible=False
            )
            output_image_boxes = gr.Image(
                label="Visualization (Bounding Boxes)",
                visible=False
            )
    def update_interface(task):
        return {
            query_options: gr.Column(visible=(task == "Query")),
            caption_options: gr.Column(visible=(task == "Caption")),
            point_detect_options: gr.Column(visible=(task in ["Point", "Detect"])),
            output_image_points: gr.Image(visible=False),
            output_image_boxes: gr.Image(visible=False)
        }

    def process_and_update_visibility(image, task, question, caption_length, object_query, reasoning, temperature, top_p, max_tokens):
        text_output, points_img, boxes_img = process_image(
            image, task, question, caption_length, object_query, reasoning,
            temperature, top_p, max_tokens
        )
        return {
            output_text: text_output,
            output_image_points: gr.Image(value=points_img, visible=(points_img is not None)),
            output_image_boxes: gr.Image(value=boxes_img, visible=(boxes_img is not None))
        }
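    # Gradio matches the dict keys returned above against the `outputs` lists
    # below, so one callback can update values and visibility in a single pass.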
    task_type.change(
        update_interface,
        inputs=[task_type],
        outputs=[query_options, caption_options, point_detect_options, output_image_points, output_image_boxes]
    )

    submit_btn.click(
        process_and_update_visibility,
        inputs=[
            input_image, task_type, question_input, caption_length,
            object_query_input, reasoning_toggle, temperature, top_p, max_tokens
        ],
        outputs=[output_text, output_image_points, output_image_boxes]
    )
    gr.Examples(
        examples=[
            [None, "Query", "Explain the concept of neural networks", "Normal", "", True, 0.7, 0.95, 512],
            [None, "Query", "What is the capital of France?", "Normal", "", False, 0.3, 0.95, 256],
        ],
        inputs=[
            input_image, task_type, question_input, caption_length,
            object_query_input, reasoning_toggle, temperature, top_p, max_tokens
        ],
        label="Example Queries"
    )
    gr.Markdown(
        """
        ### About Moondream 3
        - **Architecture**: 9B total parameters (2B active) with mixture-of-experts
        - **Skills**: Query (Q&A), Caption, Point detection, Object detection
        - **Features**: 32K context length, multi-crop high-resolution processing
        - **Model**: [moondream/moondream3-preview](https://huggingface.co/moondream/moondream3-preview)

        ### Tips
        - **Query**: Ask open-ended questions about images, or run text-only questions without an image
        - **Caption**: Generate short, normal, or long descriptions of images
        - **Point**: Find specific objects and get their coordinates
        - **Detect**: Get bounding boxes for objects in images
        - Enable reasoning for complex visual understanding tasks
        """
    )

demo.launch()
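
# To run locally (a sketch; exact package versions are an assumption, not pinned here):
#   pip install gradio torch transformers pillow numpy spaces
#   python app.py
# On a ZeroGPU Space, @spaces.GPU requests a GPU for each call; outside Spaces
# the decorator is designed to be a no-op.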