import spaces
import gradio as gr
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import io
import base64

# Initialize the model and processor
model_id = "Qwen/Qwen2-VL-2B-Instruct"  # Using 2B version for better performance on Spaces

# Load model with optimizations for inference
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
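# Optional: bound the visual token budget so very large uploads don't exhaust
# GPU memory on Spaces. min_pixels/max_pixels are documented kwargs of the
# Qwen2-VL processor; the values below are illustrative, not tuned.
#
# processor = AutoProcessor.from_pretrained(
#     model_id,
#     min_pixels=256 * 28 * 28,
#     max_pixels=1280 * 28 * 28,
# )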
@spaces.GPU(duration=60)
def process_chat_message(
    message: str,
    image: Optional[Image.Image],
    history: List[Dict[str, Any]],
) -> str:
    """
    Process a chat message with optional image input using the Qwen2-VL model.

    Args:
        message: The user's text message
        image: Optional PIL Image
        history: Chat history in messages format ({"role": ..., "content": ...})

    Returns:
        The model's response
    """
    # Build the content list for the current user turn
    content = []

    # Add image if provided; the processor accepts PIL images directly
    if image is not None:
        content.append({"type": "image", "image": image})

    # Add text message
    if message:
        content.append({"type": "text", "text": message})

    # Create the messages list for the model
    messages = []

    # Replay prior turns (text only, for simplicity)
    for hist_item in history:
        if hist_item["role"] in ("user", "assistant"):
            messages.append({
                "role": hist_item["role"],
                "content": hist_item.get("content", ""),
            })

    # Add the current message
    if content:
        messages.append({"role": "user", "content": content})
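    # For reference, `messages` now follows the Qwen2-VL chat-template schema:
    # prior turns are plain strings, the current turn is a content list.
    #
    # [
    #     {"role": "user", "content": "earlier question"},
    #     {"role": "assistant", "content": "earlier answer"},
    #     {"role": "user", "content": [
    #         {"type": "image", "image": <PIL.Image>},
    #         {"type": "text", "text": "current question"},
    #     ]},
    # ]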
    # Prepare inputs for the model
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if image is not None:
        inputs = processor(
            text=[text],
            images=[image],
            return_tensors="pt",
        ).to(model.device)
    else:
        inputs = processor(
            text=[text],
            return_tensors="pt",
        ).to(model.device)
    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
        )

    # Decode only the newly generated tokens, trimming the prompt
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return response
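# A streaming variant is possible with transformers' TextIteratorStreamer:
# run generate() in a background thread and yield partial text as it decodes.
# Sketch only - not wired into the UI below:
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def stream_generate(inputs):
#     streamer = TextIteratorStreamer(
#         processor.tokenizer, skip_prompt=True, skip_special_tokens=True
#     )
#     thread = Thread(
#         target=model.generate,
#         kwargs=dict(**inputs, max_new_tokens=512, streamer=streamer),
#     )
#     thread.start()
#     for chunk in streamer:
#         yield chunk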
def chat_fn(
    message: Dict[str, Any],
    history: List[Dict[str, Any]],
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Main chat function that processes user input and returns a response.

    Args:
        message: MultimodalTextbox value containing text and optional files
        history: Chat history in messages format, as produced by
            gr.Chatbot(type="messages")

    Returns:
        A cleared textbox value and the updated history
    """
    text = message.get("text", "")
    files = message.get("files", [])

    # Process image if provided
    image = None
    if files:
        try:
            image = Image.open(files[0])
            # Flatten RGBA onto a white background; the model expects RGB
            if image.mode == "RGBA":
                background = Image.new("RGB", image.size, (255, 255, 255))
                background.paste(image, mask=image.split()[3])
                image = background
        except Exception as e:
            print(f"Error loading image: {e}")
            image = None

    # The chatbot already stores history in messages format; forward the
    # text-only turns to the model
    model_history = [
        {"role": item["role"], "content": item["content"]}
        for item in history
        if isinstance(item.get("content"), str)
    ]

    # Get response from model
    try:
        response = process_chat_message(text, image, model_history)
    except Exception as e:
        response = f"Sorry, I encountered an error: {str(e)}"

    # Append the new turns in messages format
    user_content = f"{text}\n[Image uploaded]" if image is not None else text
    history.append({"role": "user", "content": user_content})
    history.append({"role": "assistant", "content": response})

    return {"text": "", "files": []}, history
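# One round trip through chat_fn, for reference (assumes an empty history;
# the reply text is whatever the model returns):
#
#     _, history = chat_fn({"text": "Hello!", "files": []}, [])
#     # history == [
#     #     {"role": "user", "content": "Hello!"},
#     #     {"role": "assistant", "content": "<model reply>"},
#     # ]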
def retry_fn(
    history: List[Dict[str, Any]],
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Regenerate the last assistant response."""
    if not history or len(history) < 2:
        return {"text": "", "files": []}, history

    # Drop both the last user turn and the assistant reply before resubmitting,
    # otherwise chat_fn would append a duplicate user turn
    last_user_msg = history[-2]
    history = history[:-2]

    # Recreate the message dict
    content = last_user_msg.get("content", "")
    if isinstance(content, dict):
        message = {"text": content.get("text", "")}
    else:
        message = {"text": content}

    return chat_fn(message, history)
def undo_fn(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Undo the last exchange (user turn plus assistant reply)."""
    if history:
        return history[:-2] if len(history) >= 2 else []
    return history


def clear_fn() -> Tuple[None, List]:
    """Clear the chat."""
    return None, []
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
    gr.Markdown(
        """
        # 🌟 Qwen2-VL Multimodal Chat

        Chat with Qwen2-VL - a vision-language model that can understand and discuss images!

        **Features:**
        - πŸ“ Text conversations
        - πŸ–ΌοΈ Image understanding and analysis
        - 🎨 Visual question answering
        - πŸ” Detailed image descriptions

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### πŸ’‘ Tips:
                - Upload an image and ask questions about it
                - Try asking for detailed descriptions
                - Ask about objects, colors, text in images
                - Compare elements within the image
                """
            )
            gr.Markdown(
                """
                ### πŸ“Έ Example Prompts:
                - "What's in this image?"
                - "Describe this scene in detail"
                - "What text can you see?"
                - "Count the objects in the image"
                - "What's the mood of this image?"
                """
            )

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=500,
                show_copy_button=True,
                value=[],
            )

            with gr.Row():
                msg = gr.MultimodalTextbox(
                    label="Message",
                    placeholder="Type a message or upload an image...",
                    file_types=["image"],
                    submit_btn=True,
                    stop_btn=False,
                )

            with gr.Row():
                retry_btn = gr.Button("πŸ”„ Retry", variant="secondary", size="sm")
                undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
                clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary", size="sm")

    with gr.Accordion("βš™οΈ Advanced Settings", open=False):
        gr.Markdown(
            """
            **Model Information:**
            - Model: Qwen2-VL-2B-Instruct
            - Optimized for vision-language tasks
            - Supports multiple languages
            - Best performance with clear, well-lit images
            """
        )
    # Set up event handlers
    msg.submit(
        chat_fn,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True,
    )

    retry_btn.click(
        retry_fn,
        inputs=[chatbot],
        outputs=[msg, chatbot],
        queue=True,
    )

    undo_btn.click(
        undo_fn,
        inputs=[chatbot],
        outputs=[chatbot],
        queue=False,
    )

    clear_btn.click(
        clear_fn,
        outputs=[msg, chatbot],
        queue=False,
    )

    # Add examples
    gr.Examples(
        examples=[
            {"text": "Hello! What can you help me with today?"},
            {"text": "Can you describe an image if I upload one?"},
            {"text": "What are your capabilities?"},
        ],
        inputs=msg,
        label="Example Messages",
    )
if __name__ == "__main__":
    demo.launch(
        show_error=True,
        share=False,
        debug=True,
    )