# Hugging Face Space app (ZeroGPU): Qwen2-VL multimodal chat
import spaces
import gradio as gr
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from typing import List, Dict, Any, Optional, Tuple
# Initialize the model and processor
model_id = "Qwen/Qwen2-VL-2B-Instruct"  # Using the 2B variant for better performance on Spaces

# Load model with optimizations for inference
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
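# Optional (not used by this Space): a lower-memory load via 4-bit
# quantization. A minimal sketch, assuming bitsandbytes is installed:
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   model = Qwen2VLForConditionalGeneration.from_pretrained(
#       model_id,
#       quantization_config=bnb_config,
#       device_map="auto",
#   )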
@spaces.GPU
def process_chat_message(
    message: str,
    image: Optional[Image.Image],
    history: List[Dict[str, Any]]
) -> str:
    """
    Process a chat message with optional image input using the Qwen2-VL model.
    Args:
        message: The user's text message
        image: Optional PIL Image
        history: Chat history as role/content dicts
    Returns:
        The model's response
    """
    # Prepare the message content
    content = []
    # Add image if provided
    if image is not None:
        content.append({"type": "image", "image": image})
    # Add text message
    if message:
        content.append({"type": "text", "text": message})

    # Build the messages list for the chat template
    messages = []
    # Add prior turns if any exist (text only for simplicity)
    for hist_item in history:
        if hist_item["role"] in ("user", "assistant"):
            messages.append({
                "role": hist_item["role"],
                "content": hist_item.get("content", "")
            })
    # Add the current message
    if content:
        messages.append({
            "role": "user",
            "content": content
        })

    # Render the chat template, then tokenize text (and image, if any)
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    if image is not None:
        inputs = processor(
            text=[text],
            images=[image],
            return_tensors="pt"
        ).to(model.device)
    else:
        inputs = processor(
            text=[text],
            return_tensors="pt"
        ).to(model.device)

    # Generate a response (sampling keeps answers varied but coherent)
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )
    # Strip the prompt tokens, keeping only the newly generated ones
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return response
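# Standalone usage sketch (hypothetical file name), e.g. for local testing
# outside of Gradio:
#
#   img = Image.open("example.jpg").convert("RGB")
#   print(process_chat_message("What is in this picture?", img, []))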
def chat_fn(message: Dict[str, Any], history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Main chat function that processes user input and returns the response.
    Args:
        message: Dict from gr.MultimodalTextbox with "text" and optional "files"
        history: Chat history in Gradio "messages" format (role/content dicts)
    Returns:
        An empty string (to clear the textbox) and the updated history
    """
    text = message.get("text", "")
    files = message.get("files", [])
    # Load the image if one was uploaded
    image = None
    if files:
        try:
            image = Image.open(files[0])
            # Flatten RGBA onto a white background, since the model expects RGB
            if image.mode == "RGBA":
                background = Image.new("RGB", image.size, (255, 255, 255))
                background.paste(image, mask=image.split()[3])
                image = background
        except Exception as e:
            print(f"Error loading image: {e}")
            image = None
    # Pass prior turns to the model (text-only turns, for simplicity)
    model_history = []
    for item in history:
        content = item.get("content", "")
        if item.get("role") in ("user", "assistant") and isinstance(content, str):
            model_history.append({"role": item["role"], "content": content})
    # Get a response from the model
    try:
        response = process_chat_message(text, image, model_history)
    except Exception as e:
        response = f"Sorry, I encountered an error: {str(e)}"
    # Update history; type="messages" expects role/content dicts
    user_text = f"{text}\n[Image uploaded]" if image is not None else text
    history.append({"role": "user", "content": user_text})
    history.append({"role": "assistant", "content": response})
    return "", history
def retry_fn(history: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """Retry the last exchange by regenerating the assistant's response."""
    if not history or len(history) < 2:
        return "", history
    # Drop the last user/assistant pair, then resubmit the user message
    last_user_msg = history[-2]
    history = history[:-2]
    message = {"text": last_user_msg.get("content", ""), "files": []}
    return chat_fn(message, history)
def undo_fn(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Undo the last exchange."""
    if history:
        return history[:-2] if len(history) >= 2 else []
    return history
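# With gr.Chatbot(type="messages"), history is a flat list of role/content
# dicts, e.g.:
#   [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]
# so undoing one exchange means dropping the trailing two entries.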
def clear_fn() -> Tuple[None, List]:
    """Clear the chat."""
    return None, []
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
    gr.Markdown(
        """
        # 🚀 Qwen2-VL Multimodal Chat
        Chat with Qwen2-VL - a powerful vision-language model that can understand and discuss images!

        **Features:**
        - 📝 Text conversations
        - 🖼️ Image understanding and analysis
        - 🎨 Visual question answering
        - 📊 Detailed image descriptions

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 💡 Tips:
                - Upload an image and ask questions about it
                - Try asking for detailed descriptions
                - Ask about objects, colors, text in images
                - Compare elements within the image
                """
            )
            gr.Markdown(
                """
                ### 📸 Example Prompts:
                - "What's in this image?"
                - "Describe this scene in detail"
                - "What text can you see?"
                - "Count the objects in the image"
                - "What's the mood of this image?"
                """
            )
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                avatar_images=[None, "🤖"],
                value=[]
            )
            with gr.Row():
                msg = gr.MultimodalTextbox(
                    label="Message",
                    placeholder="Type a message or upload an image...",
                    file_types=["image"],
                    submit_btn=True,
                    stop_btn=False
                )
            with gr.Row():
                retry_btn = gr.Button("🔄 Retry", variant="secondary", size="sm")
                undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
    with gr.Accordion("⚙️ Advanced Settings", open=False):
        gr.Markdown(
            """
            **Model Information:**
            - Model: Qwen2-VL-2B-Instruct
            - Optimized for vision-language tasks
            - Supports multiple languages
            - Best performance with clear, well-lit images
            """
        )
    # Set up event handlers
    msg.submit(
        chat_fn,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True
    )
    retry_btn.click(
        retry_fn,
        inputs=[chatbot],
        outputs=[msg, chatbot],
        queue=True
    )
    undo_btn.click(
        undo_fn,
        inputs=[chatbot],
        outputs=[chatbot],
        queue=False
    )
    clear_btn.click(
        clear_fn,
        outputs=[msg, chatbot],
        queue=False
    )
    # Add examples
    gr.Examples(
        examples=[
            {"text": "Hello! What can you help me with today?"},
            {"text": "Can you describe an image if I upload one?"},
            {"text": "What are your capabilities?"},
        ],
        inputs=msg,
        label="Example Messages"
    )
if __name__ == "__main__":
    demo.launch(
        show_error=True,
        share=False,
        debug=True
    )
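# A requirements.txt for this Space would likely include (an assumption based
# on the imports above; versions unpinned):
#   gradio, transformers, torch, accelerate, Pillow, spaces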