Spaces:

Slait
/

Caption_Qwen-vl

Running

File size: 29,208 Bytes

import gradio as gr
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import random
import os
import warnings
from typing import List, Tuple, Optional

# Suppress specific warnings
warnings.filterwarnings('ignore', message='.*meta device.*')

# Multi-language support
TRANSLATIONS = {
    "en": {
        "title": "Qwen VL Image Description Generator",
        "header": "🖼️ Image Description Generator based on Qwen Vision Language Models",
        "subtitle": "Upload an image and enter a prompt to generate a description using Qwen VL models.",
        "language": "Language",
        "language_info": "Select language",
        "model_selection": "Model Selection",
        "model_info": "Select a model for generating descriptions",
        "advanced_params": "⚙️ Advanced Parameters",
        "max_tokens": "Max New Tokens",
        "max_tokens_info": "Maximum number of tokens to generate",
        "temperature": "Temperature",
        "temperature_info": "Controls randomness of generation",
        "top_p": "Top-p (nucleus sampling)",
        "top_p_info": "Probability threshold for token sampling",
        "top_k": "Top-k",
        "top_k_info": "Number of most probable tokens to consider",
        "seed": "Seed",
        "seed_info": "Seed for reproducibility (-1 for random)",
        "random_seed_btn": "🎲 Random Seed",
        "single_processing": "📄 Single Processing",
        "batch_processing": "📚 Batch Processing",
        "upload_image": "Upload Image",
        "prompt": "Prompt",
        "prompt_placeholder": "For example: Create a product description for online store",
        "generate_btn": "🚀 Generate Description",
        "result": "Result",
        "upload_images": "Upload Images",
        "prompts_multiline": "Prompts (one per line)",
        "prompts_placeholder": "Create a product description for online store\nCreate SEO Description for product\n...",
        "prompts_info": "Specify one prompt for all images or one prompt per image",
        "process_batch_btn": "🚀 Process Batch",
        "results": "Results",
        "examples_title": "💡 Example Prompts:",
        "example_1": "Create a product description for online store",
        "example_2": "Create an SEO description for a product with a maximum of 160 characters.",
        "example_3": "Create an attractive product description for marketplace",
        "example_4": "Describe image in detail for product catalog",
        "error_no_image": "Please upload an image",
        "error_no_prompt": "Please enter a prompt",
        "error_no_images": "Please upload images",
        "error_no_prompts": "Please enter prompts (one per line)",
        "error_prompt_mismatch": "Number of prompts ({}) does not match number of images ({}). Specify either one prompt for all images or one prompt per image.",
        "error_generation": "Error generating description: {}",
        "loading_model": "Loading model: {}",
        "model_loaded": "Model {} successfully loaded on {}",
        "image_label": "=== Image {}: {} ===",
        "prompt_label": "Prompt: {}",
        "result_label": "Result: {}",
        "model_size_warning": "⚠️ Note: Large models (8B+) may use CPU offloading if GPU memory is insufficient, which can slow down generation."
    },
    "ru": {
        "title": "Генератор описаний изображений Qwen VL",
        "header": "🖼️ Генератор описаний изображений на основе Qwen Vision Language Models",
        "subtitle": "Загрузите изображение и введите промт для генерации описания с помощью моделей Qwen VL.",
        "language": "Язык",
        "language_info": "Выберите язык",
        "model_selection": "Выбор модели",
        "model_info": "Выберите модель для генерации описаний",
        "advanced_params": "⚙️ Расширенные параметры",
        "max_tokens": "Макс. количество новых токенов",
        "max_tokens_info": "Максимальное количество токенов для генерации",
        "temperature": "Температура",
        "temperature_info": "Контролирует случайность генерации",
        "top_p": "Top-p (nucleus sampling)",
        "top_p_info": "Вероятностный порог для выборки токенов",
        "top_k": "Top-k",
        "top_k_info": "Количество наиболее вероятных токенов для рассмотрения",
        "seed": "Seed",
        "seed_info": "Seed для воспроизводимости (-1 для случайного)",
        "random_seed_btn": "🎲 Случайный seed",
        "single_processing": "📄 Одиночная обработка",
        "batch_processing": "📚 Пакетная обработка",
        "upload_image": "Загрузите изображение",
        "prompt": "Промт",
        "prompt_placeholder": "Например: Создать описание товара для онлайн магазина",
        "generate_btn": "🚀 Генерировать описание",
        "result": "Результат",
        "upload_images": "Загрузите изображения",
        "prompts_multiline": "Промты (по одному на строку)",
        "prompts_placeholder": "Создать описание товара для онлайн магазина\nСоздать SEO Description для товара\n...",
        "prompts_info": "Укажите один промт для всех изображений или по одному промту на каждое изображение",
        "process_batch_btn": "🚀 Обработать пакет",
        "results": "Результаты",
        "examples_title": "💡 Примеры промтов:",
        "example_1": "Создать описание товара ''  на русском языке",
        "example_2": "Создать SEO Description для товара максимум 160 символов на русском языке",
        "example_3": "Создать привлекательное описание продукта для маркетплейса на русском языке",
        "example_4": "Детально описать изображение для каталога товаров на русском языке",
        "error_no_image": "Пожалуйста, загрузите изображение",
        "error_no_prompt": "Пожалуйста, введите промт",
        "error_no_images": "Пожалуйста, загрузите изображения",
        "error_no_prompts": "Пожалуйста, введите промты (по одному на строку)",
        "error_prompt_mismatch": "Количество промтов ({}) не совпадает с количеством изображений ({}). Укажите либо один промт для всех изображений, либо по одному промту на каждое изображение.",
        "error_generation": "Ошибка при генерации описания: {}",
        "loading_model": "Загрузка модели: {}",
        "model_loaded": "Модель {} успешно загружена на {}",
        "image_label": "=== Изображение {}: {} ===",
        "prompt_label": "Промт: {}",
        "result_label": "Результат: {}",
        "model_size_warning": "⚠️ Примечание: Большие модели (8B+) могут использовать выгрузку на CPU при недостатке памяти GPU, что может замедлить генерацию."
    },
    "zh": {
        "title": "Qwen VL 图像描述生成器",
        "header": "🖼️ 基于 Qwen Vision Language Models 的图像描述生成器",
        "subtitle": "上传图像并输入提示词，使用 Qwen VL 模型生成描述。",
        "language": "语言",
        "language_info": "选择语言",
        "model_selection": "模型选择",
        "model_info": "选择用于生成描述的模型",
        "advanced_params": "⚙️ 高级参数",
        "max_tokens": "最大新令牌数",
        "max_tokens_info": "生成的最大令牌数",
        "temperature": "温度",
        "temperature_info": "控制生成的随机性",
        "top_p": "Top-p（核采样）",
        "top_p_info": "令牌采样的概率阈值",
        "top_k": "Top-k",
        "top_k_info": "考虑的最可能令牌数",
        "seed": "随机种子",
        "seed_info": "用于可重现性的种子（-1 表示随机）",
        "random_seed_btn": "🎲 随机种子",
        "single_processing": "📄 单张处理",
        "batch_processing": "📚 批量处理",
        "upload_image": "上传图像",
        "prompt": "提示词",
        "prompt_placeholder": "例如：为在线商店创建产品描述",
        "generate_btn": "🚀 生成描述",
        "result": "结果",
        "upload_images": "上传图像",
        "prompts_multiline": "提示词（每行一个）",
        "prompts_placeholder": "为在线商店创建产品描述\n为产品创建SEO描述\n...",
        "prompts_info": "为所有图像指定一个提示词，或为每个图像指定一个提示词",
        "process_batch_btn": "🚀 处理批次",
        "results": "结果",
        "examples_title": "💡 示例提示词：",
        "example_1": "为在线商店创建产品描述",
        "example_2": "为产品创建SEO描述最多 160 个字符",
        "example_3": "为市场创建有吸引力的产品描述",
        "example_4": "详细描述产品目录的图像",
        "error_no_image": "请上传图像",
        "error_no_prompt": "请输入提示词",
        "error_no_images": "请上传图像",
        "error_no_prompts": "请输入提示词（每行一个）",
        "error_prompt_mismatch": "提示词数量（{}）与图像数量（{}）不匹配。请为所有图像指定一个提示词，或为每个图像指定一个提示词。",
        "error_generation": "生成描述时出错：{}",
        "loading_model": "正在加载模型：{}",
        "model_loaded": "模型 {} 已成功加载到 {}",
        "image_label": "=== 图像 {}: {} ===",
        "prompt_label": "提示词：{}",
        "result_label": "结果：{}",
        "model_size_warning": "⚠️ 注意：如果 GPU 内存不足，大型模型（8B+）可能会使用 CPU 卸载，这可能会减慢生成速度。"
    }
}

# Default language
current_language = "en"

def get_text(key: str) -> str:
    """Get translated text for the current language"""
    return TRANSLATIONS[current_language].get(key, key)

class ImageDescriptionGenerator:
    def __init__(self):
        self.model = None
        self.processor = None
        self.current_model_name = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
    def load_model(self, model_name: str):
        """Загрузка модели только если она еще не загружена или изменилась"""
        if self.current_model_name == model_name and self.model is not None:
            return
        
        print(get_text("loading_model").format(model_name))
        
        # Предупреждение о больших моделях
        if "8B" in model_name or "4B" in model_name:
            print(get_text("model_size_warning"))
        
        # Освобождаем память от предыдущей модели
        if self.model is not None:
            del self.model
            del self.processor
            torch.cuda.empty_cache() if torch.cuda.is_available() else None
        
        # Загружаем новую модель с подавлением предупреждений
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            self.model = Qwen3VLForConditionalGeneration.from_pretrained(
                model_name,
                dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                device_map="auto"
            )
            self.processor = AutoProcessor.from_pretrained(model_name)
        
        self.current_model_name = model_name
        print(get_text("model_loaded").format(model_name, self.device))
    
    def generate_description(
        self,
        image_path: str,
        prompt: str,
        model_name: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.6,
        top_p: float = 0.9,
        top_k: int = 50,
        seed: int = -1
    ) -> str:
        """Генерация описания для одного изображения"""
        try:
            # Загружаем модель если необходимо
            self.load_model(model_name)
            
            # Устанавливаем seed если указан
            if seed != -1:
                torch.manual_seed(seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(seed)
            
            # Подготавливаем сообщения для модели
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": image_path,
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            
            # Подготавливаем текст для модели
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            
            # Обрабатываем изображение и текст
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(self.device)
            
            # Генерируем ответ
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k,
                    do_sample=True if temperature > 0 else False
                )
            
            # Декодируем результат
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
            
            return output_text[0]
            
        except Exception as e:
            return get_text("error_generation").format(str(e))

# Создаем глобальный экземпляр генератора
generator = ImageDescriptionGenerator()

def process_single_image(
    image,
    prompt: str,
    model_name: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    seed: int
) -> str:
    """Обработка одного изображения"""
    if image is None:
        return get_text("error_no_image")
    
    if not prompt.strip():
        return get_text("error_no_prompt")
    
    # Сохраняем временное изображение если это numpy array
    if hasattr(image, 'shape'):
        temp_path = "temp_image.jpg"
        Image.fromarray(image).save(temp_path)
        image_path = temp_path
    else:
        image_path = image
    
    result = generator.generate_description(
        image_path=image_path,
        prompt=prompt,
        model_name=model_name,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        seed=seed
    )
    
    # Удаляем временный файл
    if hasattr(image, 'shape') and os.path.exists(temp_path):
        os.remove(temp_path)
    
    return result

def process_batch_images(
    files: List,
    prompts_text: str,
    model_name: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    seed: int
) -> str:
    """Обработка пакета изображений"""
    if not files:
        return get_text("error_no_images")
    
    if not prompts_text.strip():
        return get_text("error_no_prompts")
    
    # Разбиваем промты по строкам
    prompts = [p.strip() for p in prompts_text.split('\n') if p.strip()]
    
    if len(prompts) == 1:
        # Если один промт, используем его для всех изображений
        prompts = prompts * len(files)
    elif len(prompts) != len(files):
        return get_text("error_prompt_mismatch").format(len(prompts), len(files))
    
    results = []
    for idx, (file, prompt) in enumerate(zip(files, prompts), 1):
        image_path = file.name if hasattr(file, 'name') else file
        result = generator.generate_description(
            image_path=image_path,
            prompt=prompt,
            model_name=model_name,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            seed=seed if seed == -1 else seed + idx - 1  # Разный seed для каждого изображения
        )
        results.append(get_text("image_label").format(idx, os.path.basename(image_path)) + "\n")
        results.append(get_text("prompt_label").format(prompt) + "\n")
        results.append(get_text("result_label").format(result) + "\n\n")
    
    return "".join(results)

def random_seed() -> int:
    """Генерация случайного seed"""
    return random.randint(0, 2**32 - 1)

def update_examples():
    return [
        [get_text("example_1")],
        [get_text("example_2")],
        [get_text("example_3")],
        [get_text("example_4")]
    ]

def create_interface():
    """Create Gradio interface with current language"""
    with gr.Blocks(title=get_text("title"), theme=gr.themes.Soft()) as demo:
        # Header that will be updated
        header_md = gr.Markdown(f"""
        # {get_text("header")}
        
        {get_text("subtitle")}
        """)
        
        # Общие настройки - модель и язык в одном ряду
        with gr.Row():
            model_dropdown = gr.Dropdown(
                choices=[
                    "Qwen/Qwen3-VL-2B-Instruct",
                    "Qwen/Qwen3-VL-4B-Instruct",
                    "Qwen/Qwen3-VL-8B-Instruct",
                    "Qwen/Qwen3-VL-32B-Instruct",
                    "Qwen/Qwen3-VL-2B-Thinking",
                    "Qwen/Qwen3-VL-4B-Thinking",
                    "Qwen/Qwen3-VL-8B-Thinking",
                    "Qwen/Qwen3-VL-32B-Thinking",
                ],
                value="Qwen/Qwen3-VL-2B-Instruct",
                label=get_text("model_selection"),
                info=get_text("model_info"),
                scale=3
            )
            language_dropdown = gr.Dropdown(
                choices=[("English", "en"), ("Русский", "ru"), ("中文", "zh")],
                value=current_language,
                label=get_text("language"),
                info=get_text("language_info"),
                scale=1
            )
        
        # Расширенные параметры
        advanced_accordion = gr.Accordion(get_text("advanced_params"), open=False)
        with advanced_accordion:
            with gr.Row():
                max_tokens_slider = gr.Slider(
                    minimum=1,
                    maximum=4096,
                    value=1024,
                    step=1,
                    label=get_text("max_tokens"),
                    info=get_text("max_tokens_info")
                )
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.6,
                    step=0.1,
                    label=get_text("temperature"),
                    info=get_text("temperature_info")
                )
            
            with gr.Row():
                top_p_slider = gr.Slider(
                    minimum=0.05,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label=get_text("top_p"),
                    info=get_text("top_p_info")
                )
                top_k_slider = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=50,
                    step=1,
                    label=get_text("top_k"),
                    info=get_text("top_k_info")
                )
            
            with gr.Row():
                seed_number = gr.Number(
                    value=-1,
                    label=get_text("seed"),
                    info=get_text("seed_info"),
                    precision=0
                )
                random_seed_btn = gr.Button(get_text("random_seed_btn"), size="sm")
        
        # Вкладки для одиночной и пакетной обработки
        tabs = gr.Tabs()
        with tabs:
            # Вкладка одиночной обработки
            single_tab = gr.TabItem(get_text("single_processing"))
            with single_tab:
                with gr.Row():
                    with gr.Column(scale=1):
                        single_image = gr.Image(
                            type="numpy",
                            label=get_text("upload_image"),
                            height=350
                        )
                        single_prompt = gr.Textbox(
                            label=get_text("prompt"),
                            placeholder=get_text("prompt_placeholder"),
                            lines=3
                        )
                        single_submit_btn = gr.Button(get_text("generate_btn"), variant="primary")
                    
                    with gr.Column(scale=1):
                        single_output = gr.Textbox(
                            label=get_text("result"),
                            lines=15,
                            show_copy_button=True
                        )
                
                # Кликабельные примеры промтов
                examples_title = gr.Markdown(f"### {get_text('examples_title')}")
                single_examples = gr.Dataset(
                    components=[single_prompt],
                    samples=update_examples(),
                    type="values"
                )
            
            # Вкладка пакетной обработки
            batch_tab = gr.TabItem(get_text("batch_processing"))
            with batch_tab:
                with gr.Row():
                    with gr.Column(scale=1):
                        batch_images = gr.File(
                            file_count="multiple",
                            label=get_text("upload_images"),
                            file_types=["image"]
                        )
                        batch_prompts = gr.Textbox(
                            label=get_text("prompts_multiline"),
                            placeholder=get_text("prompts_placeholder"),
                            lines=5,
                            info=get_text("prompts_info")
                        )
                        batch_submit_btn = gr.Button(get_text("process_batch_btn"), variant="primary")
                    
                    with gr.Column(scale=1):
                        batch_output = gr.Textbox(
                            label=get_text("results"),
                            lines=20,
                            show_copy_button=True
                        )
        
        # Обработчики событий
        def change_language(lang):
            global current_language
            current_language = lang
            
            # Return updated text for all components
            return [
                f"""
        # {get_text("header")}
        
        {get_text("subtitle")}
        """,  # header_md
                gr.update(label=get_text("model_selection"), info=get_text("model_info")),  # model_dropdown
                gr.update(label=get_text("language"), info=get_text("language_info")),  # language_dropdown
                gr.update(label=get_text("advanced_params")),  # advanced_accordion
                gr.update(label=get_text("max_tokens"), info=get_text("max_tokens_info")),  # max_tokens_slider
                gr.update(label=get_text("temperature"), info=get_text("temperature_info")),  # temperature_slider
                gr.update(label=get_text("top_p"), info=get_text("top_p_info")),  # top_p_slider
                gr.update(label=get_text("top_k"), info=get_text("top_k_info")),  # top_k_slider
                gr.update(label=get_text("seed"), info=get_text("seed_info")),  # seed_number
                gr.update(value=get_text("random_seed_btn")),  # random_seed_btn
                gr.update(label=get_text("single_processing")),  # single_tab
                gr.update(label=get_text("upload_image")),  # single_image
                gr.update(label=get_text("prompt"), placeholder=get_text("prompt_placeholder")),  # single_prompt
                gr.update(value=get_text("generate_btn")),  # single_submit_btn
                gr.update(label=get_text("result")),  # single_output
                f"### {get_text('examples_title')}",  # examples_title
                gr.update(samples=update_examples()),  # single_examples
                gr.update(label=get_text("batch_processing")),  # batch_tab
                gr.update(label=get_text("upload_images")),  # batch_images
                gr.update(label=get_text("prompts_multiline"), placeholder=get_text("prompts_placeholder"), info=get_text("prompts_info")),  # batch_prompts
                gr.update(value=get_text("process_batch_btn")),  # batch_submit_btn
                gr.update(label=get_text("results")),  # batch_output
            ]
        
        language_dropdown.change(
            fn=change_language,
            inputs=language_dropdown,
            outputs=[
                header_md,
                model_dropdown,
                language_dropdown,
                advanced_accordion,
                max_tokens_slider,
                temperature_slider,
                top_p_slider,
                top_k_slider,
                seed_number,
                random_seed_btn,
                single_tab,
                single_image,
                single_prompt,
                single_submit_btn,
                single_output,
                examples_title,
                single_examples,
                batch_tab,
                batch_images,
                batch_prompts,
                batch_submit_btn,
                batch_output,
            ]
        )

        single_examples.click(
            fn=lambda x: x[0] if x else "",
            inputs=[single_examples],
            outputs=[single_prompt]
        )
        
        random_seed_btn.click(
            fn=random_seed,
            outputs=seed_number
        )
        
        single_submit_btn.click(
            fn=process_single_image,
            inputs=[
                single_image,
                single_prompt,
                model_dropdown,
                max_tokens_slider,
                temperature_slider,
                top_p_slider,
                top_k_slider,
                seed_number
            ],
            outputs=single_output
        )
        
        batch_submit_btn.click(
            fn=process_batch_images,
            inputs=[
                batch_images,
                batch_prompts,
                model_dropdown,
                max_tokens_slider,
                temperature_slider,
                top_p_slider,
                top_k_slider,
                seed_number
            ],
            outputs=batch_output
        )
        
        return demo

# Создаем интерфейс Gradio
demo = create_interface()

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )