import base64
import re
from io import BytesIO
from typing import List, Tuple, Optional

import gradio as gr
import requests
from PIL import Image
from huggingface_hub import InferenceClient

# Hugging Face Inference Client (uses the free Inference API)
client = InferenceClient(model="Qwen/Qwen2.5-VL-32B-Instruct", provider="hf-inference")
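
# Regex for the <box>(x1,y1,x2,y2):label</box> tags the prompt asks the model to emit.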
BOX_TAG_PATTERN = r"<box>\((\d+),(\d+),(\d+),(\d+)\):([^<]+)</box>"


def parse_bounding_boxes(text: str) -> List[Tuple[Tuple[int, int, int, int], str]]:
    matches = re.findall(BOX_TAG_PATTERN, text)
    return [((int(x1), int(y1), int(x2), int(y2)), label.strip()) for x1, y1, x2, y2, label in matches]
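
# Download an image from a URL and normalize it to RGB for PIL/Gradio.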
def fetch_image_from_url(url: str) -> Image.Image:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content)).convert("RGB")
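
# Encode an uploaded PIL image as a base64 PNG data URI for the chat API.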
def pil_to_data_uri(img: Image.Image) -> str:
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode()
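
# Send the image and detection prompt to the model, stream the reply, and parse it
# into (box, label) annotations for gr.AnnotatedImage.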
def predict(image: Optional[Image.Image], image_url: str):
    if image is None and not image_url:
        return None, "❌ Please provide an image or URL."

    if image is None:
        try:
            image = fetch_image_from_url(image_url)
            data_uri = image_url
        except Exception as e:
            return None, f"❌ {e}"
    else:
        image = image.convert("RGB")
        data_uri = pil_to_data_uri(image)

    prompt = (
        "Detect all objects in the provided image and output their bounding box "
        "coordinates and class labels in the format <box>(x1,y1,x2,y2):class_label</box>. "
        "If multiple objects are detected, list each bounding box and class label in a new <box> tag. "
        "Do not include any other text or descriptions."
    )

    stream = client.chat.completions.create(
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ]}
        ],
        stream=True,
    )
    response_text = "".join(chunk.choices[0].delta.content or "" for chunk in stream)

    bboxes = parse_bounding_boxes(response_text)
    if not bboxes:
        return None, "⚠️ No objects detected."
    return (image, bboxes), "✅ Detection complete."
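
# Build the Gradio UI: upload/URL tabs with examples, a detect button, and an annotated-image output.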
def build_demo():
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
    with gr.Blocks(theme=theme, title="Qwen Object Detection Demo") as demo:
        gr.Markdown("## Qwen2.5-VL Object Detection Demo")
        gr.Markdown("Upload an image **or** paste an image URL, then click **Detect Objects**.")
        gr.Markdown("[Check out the model](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct)")

        with gr.Tabs():
            with gr.TabItem("Upload Image"):
                img_input = gr.Image(type="pil", label="Upload Image", height=300)
                gr.Examples(
                    examples=[
                        ["./example_images/example_1.png"],
                        ["./example_images/example_2.jpg"],
                    ],
                    inputs=[img_input],
                    label="Click an example to try",
                )
            with gr.TabItem("Image URL"):
                url_input = gr.Textbox(label="Image URL", placeholder="https://example.com/img.jpg")
                gr.Examples(
                    examples=[
                        [None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/google-cloud/model-card.png"],
                        [None, "http://images.cocodataset.org/val2017/000000039769.jpg"],
                    ],
                    inputs=[img_input, url_input],
                    label="Click an example to try",
                )

        detect_btn = gr.Button("Detect Objects")
        output_img = gr.AnnotatedImage(label="Detections", height=600)
        status = gr.Markdown()

        detect_btn.click(predict, inputs=[img_input, url_input], outputs=[output_img, status])
    return demo

def main():
    demo = build_demo()
    demo.launch()


if __name__ == "__main__":
    main()