tc-mb committed
Commit 7997f38 · verified · 1 Parent(s): 165abc3

Upload 5 files

Files changed (5)
  1. app.py +2035 -0
  2. logging_util.py +34 -0
  3. models/__init__.py +1 -0
  4. models/minicpmv4_5.py +158 -0
  5. requirements.txt +37 -0
app.py ADDED
@@ -0,0 +1,2035 @@
+ import os
+ import json
+ import uuid
+ import time
+ import copy
+ import base64
+ import logging
+ import argparse
+ import math
+ import multiprocessing as mp
+ from io import BytesIO
+ from typing import Generator, Any, Dict, Optional
+
+ import spaces
+ import torch
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from decord import VideoReader, cpu
+ from scipy.spatial import cKDTree
+ # import modelscope_studio as mgr
+
+ # Import model-related modules
+ try:
+     from models import ModelMiniCPMV4_5
+ except ImportError:
+     print("Warning: models module not found. Please ensure models.py is available.")
+     class ModelMiniCPMV4_5:
+         def __init__(self, model_path):
+             self.model_path = model_path
+             self.model = None
+
+         def __call__(self, query):
+             return "Model not loaded", 0
+
+ # Global configuration
+ ERROR_MSG = "Error, please retry"
+ model_name = 'MiniCPM-V 4.5'
+ disable_text_only = False  # allow text-only messages, for easier testing
+ DOUBLE_FRAME_DURATION = 30
+ MAX_NUM_FRAMES = 180
+ MAX_NUM_PACKING = 3
+ TIME_SCALE = 0.1
+ IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
+ VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
+
+ ENABLE_PARALLEL_ENCODING = True
+ PARALLEL_PROCESSES = None
+
+ # Global model instance
+ global_model = None
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ # Global model configuration
+ model_config = {
+     'model_path': None,
+     'model_type': None,
+     'instance_id': 0
+ }
+
+ # Global model cache (lives in the GPU process)
+ _gpu_model_cache = None
+
+ def _initialize_gpu_model():
+     """Fetch the model in the GPU process and move it to the GPU."""
+     global _gpu_model_cache
+     if _gpu_model_cache is None:
+         logger.info(f"Initializing model in GPU process: {model_config['model_type']}")
+
+         match model_config['model_type'].lower():
+             case 'minicpmv4_5':
+                 _gpu_model_cache = ModelMiniCPMV4_5(model_config['model_path'])
+             case _:
+                 raise ValueError(f"Unsupported model type: {model_config['model_type']}")
+
+         logger.info("Model initialized on CPU")
+
+     # Move the model to the GPU on every inference call
+     if hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'to'):
+         logger.info("Moving model to GPU...")
+         _gpu_model_cache.model.to('cuda')
+     elif hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'model') and hasattr(_gpu_model_cache.model.model, 'to'):
+         logger.info("Moving model to GPU (nested model)...")
+         _gpu_model_cache.model.model.to('cuda')
+
+     return _gpu_model_cache
+
+ @spaces.GPU
+ def gpu_handler(query):
+     """GPU inference handler."""
+     model = _initialize_gpu_model()
+
+     res, output_tokens = model({
+         "image": query["image"],
+         "question": query["question"],
+         "params": query.get("params", "{}"),
+         "temporal_ids": query.get("temporal_ids", None)
+     })
+     return {
+         "result": res,
+         "usage": {"output_tokens": output_tokens}
+     }
+
+ @spaces.GPU
+ def gpu_stream_handler(query):
+     """GPU streaming inference handler."""
+     model = _initialize_gpu_model()
+
+     params = json.loads(query.get("params", "{}"))
+     params["stream"] = True
+     query["params"] = json.dumps(params)
+
+     try:
+         generator = model({
+             "image": query["image"],
+             "question": query["question"],
+             "params": query["params"],
+             "temporal_ids": query.get("temporal_ids", None)
+         })
+
+         # Collect all generator output to avoid serialization issues
+         full_response = ""
+         for chunk in generator:
+             full_response += chunk
+
+         return full_response
+     except Exception as e:
+         logger.error(f"GPU stream handler error: {e}")
+         return f"Stream error: {str(e)}"
+
+ class Model:
+     """Model wrapper class; does not hold the actual model object."""
+
+     def __init__(self, model_path: str, model_type: str, instance_id: int = 0):
+         self.instance_id = instance_id
+         self.model_path = model_path
+         self.model_type = model_type
+
+         # Set the global configuration
+         model_config['model_path'] = model_path
+         model_config['model_type'] = model_type
+         model_config['instance_id'] = instance_id
+
+         logger.info(f"Instance {instance_id}: configured model type {model_type}")
+         logger.info(f"Instance {instance_id}: model path {model_path}")
+
+     def handler(self, query):
+         """Non-streaming inference handler."""
+         return gpu_handler(query)
+
+     def stream_handler(self, query):
+         """Streaming inference handler."""
+         return gpu_stream_handler(query)
+
+
+ def initialize_model():
+     """Initialize the global model."""
+     global global_model, _gpu_model_cache
+
+     # Default configuration
+     model_path = os.getenv('MODEL_PATH', 'openbmb/MiniCPM-V-4_5')
+     model_type = os.getenv('MODEL_TYPE', 'minicpmv4_5')
+
+     logger.info("=" * 50)
+     logger.info("Starting the MiniCPM-V service")
+     logger.info(f"Model path: {model_path}")
+     logger.info(f"Model type: {model_type}")
+     logger.info("=" * 50)
+
+     # Create the model wrapper
+     global_model = Model(model_path, model_type, 0)
+
+     # Preload the model on CPU in the main process (optional, for faster first inference)
+     try:
+         logger.info("Preloading model on CPU in the main process...")
+         match model_type.lower():
+             case 'minicpmv4_5':
+                 _gpu_model_cache = ModelMiniCPMV4_5(model_path)
+             case _:
+                 raise ValueError(f"Unsupported model type: {model_type}")
+
+         logger.info("Model preloaded on CPU in the main process")
+     except Exception as e:
+         logger.warning(f"Preloading in the main process failed; the model will be loaded in the GPU process: {e}")
+         _gpu_model_cache = None
+
+     return global_model
+
+
+ # Utility functions
+ def get_file_extension(filename):
+     return os.path.splitext(filename)[1].lower()
+
+
+ def is_image(filename):
+     return get_file_extension(filename) in IMAGE_EXTENSIONS
+
+
+ def is_video(filename):
+     return get_file_extension(filename) in VIDEO_EXTENSIONS
+
+
+ def map_to_nearest_scale(values, scale):
+     tree = cKDTree(np.asarray(scale)[:, None])
+     _, indices = tree.query(np.asarray(values)[:, None])
+     return np.asarray(scale)[indices]
+
+
+ def group_array(arr, size):
+     return [arr[i:i+size] for i in range(0, len(arr), size)]
+
+
+ def encode_image(image):
+     """Encode a single image."""
+     if not isinstance(image, Image.Image):
+         if hasattr(image, 'path'):
+             image = Image.open(image.path)
+         elif hasattr(image, 'file') and hasattr(image.file, 'path'):
+             image = Image.open(image.file.path)
+         elif hasattr(image, 'name'):
+             image = Image.open(image.name)
+         else:
+             image_path = getattr(image, 'url', getattr(image, 'orig_name', str(image)))
+             image = Image.open(image_path)
+
+     # Resize the image
+     max_size = 448 * 16
+     if max(image.size) > max_size:
+         w, h = image.size
+         if w > h:
+             new_w = max_size
+             new_h = int(h * max_size / w)
+         else:
+             new_h = max_size
+             new_w = int(w * max_size / h)
+         image = image.resize((new_w, new_h), resample=Image.BICUBIC)
+
+     # Convert to base64
+     buffered = BytesIO()
+     image.save(buffered, format="png")
+     im_b64 = base64.b64encode(buffered.getvalue()).decode()
+     return [{"type": "image", "pairs": im_b64}]
+
+
+ def encode_image_parallel(image_data):
+     """Wrapper for parallel image encoding."""
+     try:
+         return encode_image(image_data)
+     except Exception as e:
+         print(f"[Parallel encoding error] Image encoding failed: {e}")
+         return None
+
+
+ def encode_images_parallel(frames, num_processes=None):
+     """Multi-process parallel image encoding."""
+     if not ENABLE_PARALLEL_ENCODING:
+         print("[Parallel encoding] Parallel encoding disabled, using serial processing")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+         return encoded_frames
+
+     if num_processes is None:
+         cpu_cores = mp.cpu_count()
+         if PARALLEL_PROCESSES:
+             num_processes = PARALLEL_PROCESSES
+         else:
+             if len(frames) >= 50:
+                 num_processes = min(cpu_cores, len(frames), 32)
+             elif len(frames) >= 20:
+                 num_processes = min(cpu_cores, len(frames), 16)
+             else:
+                 num_processes = min(cpu_cores, len(frames), 8)
+
+     print(f"[Parallel encoding] Starting parallel encoding of {len(frames)} frame images, using {num_processes} processes")
+
+     if len(frames) <= 2:
+         print(f"[Parallel encoding] Few images ({len(frames)} frames), using serial processing")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+         return encoded_frames
+
+     start_time = time.time()
+     try:
+         with mp.Pool(processes=num_processes) as pool:
+             results = pool.map(encode_image_parallel, frames)
+
+         encoded_frames = []
+         for result in results:
+             if result:
+                 encoded_frames.extend(result)
+
+         total_time = time.time() - start_time
+         print(f"[Parallel encoding] Parallel encoding completed, total time: {total_time:.3f}s, encoded {len(encoded_frames)} images")
+
+         return encoded_frames
+
+     except Exception as e:
+         print(f"[Parallel encoding] Parallel processing failed, falling back to serial processing: {e}")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+         return encoded_frames
+
+
+ def encode_video(video, choose_fps=None):
+     """Encode a video file."""
+     def uniform_sample(l, n):
+         gap = len(l) / n
+         idxs = [int(i * gap + gap / 2) for i in range(n)]
+         return [l[i] for i in idxs]
+
+     if hasattr(video, 'path'):
+         video_path = video.path
+     elif hasattr(video, 'file') and hasattr(video.file, 'path'):
+         video_path = video.file.path
+     elif hasattr(video, 'name'):
+         video_path = video.name
+     else:
+         video_path = getattr(video, 'url', getattr(video, 'orig_name', str(video)))
+
+     vr = VideoReader(video_path, ctx=cpu(0))
+     fps = vr.get_avg_fps()
+     video_duration = len(vr) / fps
+
+     frame_idx = [i for i in range(0, len(vr))]
+
+     effective_fps = choose_fps if choose_fps else 1
+
+     if video_duration < DOUBLE_FRAME_DURATION and effective_fps <= 5:
+         effective_fps = effective_fps * 2
+         packing_nums = 2
+         choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
+     elif effective_fps * int(video_duration) <= MAX_NUM_FRAMES:
+         packing_nums = 1
+         choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
+     else:
+         packing_size = math.ceil(video_duration * effective_fps / MAX_NUM_FRAMES)
+         if packing_size <= MAX_NUM_PACKING:
+             choose_frames = round(video_duration * effective_fps)
+             packing_nums = packing_size
+         else:
+             choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
+             packing_nums = MAX_NUM_PACKING
+
+     choose_idx = choose_frames
+
+     frame_idx = np.array(uniform_sample(frame_idx, choose_idx))
+     frames = vr.get_batch(frame_idx).asnumpy()
+
+     frame_idx_ts = frame_idx / fps
+     scale = np.arange(0, video_duration, TIME_SCALE)
+
+     frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
+     frame_ts_id = frame_ts_id.astype(np.int32)
+
+     assert len(frames) == len(frame_ts_id)
+
+     frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
+     frame_ts_id_group = group_array(frame_ts_id.tolist(), packing_nums)
+
+     print(f"[Performance] Starting image encoding, total {len(frames)} frames")
+
+     if ENABLE_PARALLEL_ENCODING:
+         print(f"[Image encoding] Using multi-process parallel encoding, CPU cores: {mp.cpu_count()}")
+         encoded_frames = encode_images_parallel(frames, PARALLEL_PROCESSES)
+     else:
+         print("[Warning] Parallel encoding disabled, using serial processing")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+
+     return encoded_frames, frame_ts_id_group
+
+
+ # Response processing functions
+ def parse_thinking_response(response_text):
+     """Parse response text containing <think> tags; supports streaming parsing."""
+     import re
+
+     # Match complete thinking tags
+     complete_think_pattern = r'<think>(.*?)</think>'
+     thinking_matches = re.findall(complete_think_pattern, response_text, re.DOTALL)
+
+     if thinking_matches:
+         # Complete thinking tags found
+         thinking_content = "\n\n".join(thinking_matches).strip()
+         print("thinking_content---:", thinking_content)
+         formal_answer = re.sub(complete_think_pattern, '', response_text, flags=re.DOTALL).strip()
+         return thinking_content, formal_answer
+     else:
+         # Check for an unfinished thinking tag
+         partial_think_match = re.search(r'<think>(.*?)$', response_text, re.DOTALL)
+         if partial_think_match:
+             # Opening tag without a closing tag: the thinking content is still being emitted.
+             # Return a special marker indicating thinking is in progress.
+             return "STREAMING", ""
+         else:
+             # No thinking tags; return the original text as the formal answer
+             return "", response_text.strip()
+
+
+ def parse_thinking_response_for_final(response_text):
+     """Final parse of the thinking response, used for formatting on completion."""
+     import re
+
+     # First try to match complete thinking tags
+     think_pattern = r'<think>(.*?)</think>'
+     thinking_matches = re.findall(think_pattern, response_text, re.DOTALL)
+
+     if thinking_matches:
+         thinking_content = "\n\n".join(thinking_matches).strip()
+         formal_answer = re.sub(think_pattern, '', response_text, flags=re.DOTALL).strip()
+         print(f"[parse_final] Found complete thinking tags, thinking length: {len(thinking_content)}, answer length: {len(formal_answer)}")
+     else:
+         # No complete tags; check for an unclosed <think> tag
+         if '<think>' in response_text:
+             think_start = response_text.find('<think>')
+             if think_start != -1:
+                 # Extract the thinking content (from after <think> to the end of the string)
+                 thinking_content = response_text[think_start + 7:].strip()  # skip past "<think>"
+                 # formal_answer is the content before <think>
+                 formal_answer = response_text[:think_start].strip()
+
+                 # If formal_answer is empty, the whole response is thinking content
+                 if not formal_answer:
+                     formal_answer = ""  # no formal answer
+
+                 print("[parse_final] Found an unclosed thinking tag")
+                 print(f"[parse_final] thinking content: '{thinking_content[:100]}...'")
+                 print(f"[parse_final] formal_answer: '{formal_answer[:100]}...'")
+             else:
+                 thinking_content = ""
+                 formal_answer = response_text.strip()
+                 print(f"[parse_final] No thinking tags, answer length: {len(formal_answer)}")
+         else:
+             thinking_content = ""
+             formal_answer = response_text.strip()
+             print(f"[parse_final] No thinking tags, answer length: {len(formal_answer)}")
+
+     return thinking_content, formal_answer
+
+
+ def normalize_text_for_html(text):
+     """Lightweight text normalization."""
+     import re
+
+     if not text:
+         return ""
+
+     text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
+     lines = [line.strip() for line in text.split("\n")]
+     text = "\n".join(lines)
+     text = text.strip()
+     return text
+
+
+ def format_response_with_thinking(thinking_content, formal_answer):
+     """Format a response that includes a thinking process."""
+     print(f"[format_thinking] thinking_content length: {len(thinking_content) if thinking_content else 0}")
+     print(f"[format_thinking] formal_answer length: {len(formal_answer) if formal_answer else 0}")
+     print(f"[format_thinking] First 100 chars of thinking_content: '{thinking_content[:100] if thinking_content else 'None'}...'")
+     print(f"[format_thinking] First 100 chars of formal_answer: '{formal_answer[:100] if formal_answer else 'None'}...'")
+
+     # Check whether the content is empty
+     if not thinking_content and not formal_answer:
+         print("[format_thinking] Warning: both thinking_content and formal_answer are empty!")
+     elif not formal_answer:
+         print("[format_thinking] Warning: formal_answer is empty!")
+     elif not thinking_content:
+         print("[format_thinking] Note: thinking_content is empty; using the simplified format")
+
+     # Add a unique ID to force the frontend to re-render
+     import uuid
+     unique_id = uuid.uuid4().hex[:8]
+
+     # If there is thinking content, show the full thinking layout
+     if thinking_content and thinking_content.strip():
+         formatted_response = f"""
+ <div class="response-container" id="response-{unique_id}">
+     <div class="thinking-section">
+         <div class="thinking-header">🤔 think</div>
+         <div class="thinking-content">{thinking_content}</div>
+     </div>
+     <div class="formal-section">
+         <div class="formal-header">💡 answer</div>
+         <div class="formal-content">{formal_answer if formal_answer else '(no formal answer)'}</div>
+     </div>
+ </div>
+ """
+     else:
+         # No thinking content; show the answer directly
+         content_to_show = formal_answer if formal_answer and formal_answer.strip() else "(empty answer)"
+         formatted_response = f"""
+ <div class="response-container" id="response-{unique_id}">
+     <div class="formal-section">
+         <div class="formal-content">{content_to_show}</div>
+     </div>
+ </div>
+ """
+
+     return "\n" + formatted_response.strip() + "\n"
+
+
+ def check_mm_type(mm_file):
+     """Check the multimedia file type."""
+     if hasattr(mm_file, 'path'):
+         path = mm_file.path
+     elif hasattr(mm_file, 'file') and hasattr(mm_file.file, 'path'):
+         path = mm_file.file.path
+     elif hasattr(mm_file, 'name'):
+         path = mm_file.name
+     else:
+         path = getattr(mm_file, 'url', getattr(mm_file, 'orig_name', str(mm_file)))
+
+     if is_image(path):
+         return "image"
+     if is_video(path):
+         return "video"
+     return None
+
+
+ def encode_mm_file(mm_file, choose_fps=None):
+     """Encode a multimedia file."""
+     if check_mm_type(mm_file) == 'image':
+         return encode_image(mm_file), None
+     if check_mm_type(mm_file) == 'video':
+         encoded_frames, frame_ts_id_group = encode_video(mm_file, choose_fps)
+         return encoded_frames, frame_ts_id_group
+     return None, None
+
+
+ def encode_message(_question, choose_fps=None):
+     """Encode a message."""
+     import re
+
+     files = _question.files if _question.files else []
+     question = _question.text if _question.text else ""
+     message = []
+     temporal_ids = []
+
+     # Check whether the legacy placeholder format is used
+     pattern = r"\[mm_media\]\d+\[/mm_media\]"
+     if re.search(pattern, question):
+         # Legacy format: placeholders
+         matches = re.split(pattern, question)
+
+         if len(matches) != len(files) + 1:
+             gr.Warning("Number of images does not match the placeholders in the text; please refresh the page to restart!")
+             # Handle the mismatch instead of asserting
+             if len(matches) > len(files) + 1:
+                 matches = matches[:len(files) + 1]
+             else:
+                 while len(matches) < len(files) + 1:
+                     matches.append("")
+
+         text = matches[0].strip()
+         if text:
+             message.append({"type": "text", "pairs": text})
+
+         for i in range(len(files)):
+             encoded_content, frame_ts_id_group = encode_mm_file(files[i], choose_fps)
+             if encoded_content:
+                 message += encoded_content
+             if frame_ts_id_group:
+                 temporal_ids.extend(frame_ts_id_group)
+
+             if i + 1 < len(matches):
+                 text = matches[i + 1].strip()
+                 if text:
+                     message.append({"type": "text", "pairs": text})
+     else:
+         # New format: plain text + file list
+         if question.strip():
+             message.append({"type": "text", "pairs": question.strip()})
+
+         for file in files:
+             encoded_content, frame_ts_id_group = encode_mm_file(file, choose_fps)
+             if encoded_content:
+                 message += encoded_content
+             if frame_ts_id_group:
+                 temporal_ids.extend(frame_ts_id_group)
+
+     return message, temporal_ids if temporal_ids else None
+
+
+ def check_has_videos(_question):
+     """Count the images and videos in the message."""
+     images_cnt = 0
+     videos_cnt = 0
+     files = _question.files if _question.files else []
+     for file in files:
+         if check_mm_type(file) == "image":
+             images_cnt += 1
+         else:
+             videos_cnt += 1
+     return images_cnt, videos_cnt
+
+
+ def save_media_to_persistent_cache(_question, session_id):
+     """Save images and videos to the persistent cache and return the saved path info."""
+     import os
+     import shutil
+     import uuid
+     from pathlib import Path
+
+     files = _question.files if _question.files else []
+     saved_media = []
+
+     # Create a session-specific media cache directory
+     cache_dir = Path("./media_cache") / session_id
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     for file in files:
+         file_type = check_mm_type(file)
+         if file_type in ["image", "video"]:
+             try:
+                 # Get the original file path
+                 original_path = None
+                 if hasattr(file, 'name'):
+                     original_path = file.name
+                 elif hasattr(file, 'path'):
+                     original_path = file.path
+                 elif hasattr(file, 'file') and hasattr(file.file, 'path'):
+                     original_path = file.file.path
+                 else:
+                     continue
+
+                 if original_path and os.path.exists(original_path):
+                     # Generate a unique filename
+                     file_ext = os.path.splitext(original_path)[1]
+                     prefix = "img" if file_type == "image" else "vid"
+                     unique_filename = f"{prefix}_{uuid.uuid4().hex[:8]}{file_ext}"
+                     cached_path = cache_dir / unique_filename
+
+                     # Copy the file into the cache directory
+                     shutil.copy2(original_path, cached_path)
+
+                     saved_media.append({
+                         'type': file_type,
+                         'original_path': original_path,
+                         'cached_path': str(cached_path),
+                         'filename': unique_filename
+                     })
+                     print(f"[save_media_to_persistent_cache] {file_type} saved: {cached_path}")
+             except Exception as e:
+                 print(f"[save_media_to_persistent_cache] Failed to save {file_type}: {e}")
+                 continue
+
+     return saved_media
+
+
+ def format_user_message_with_files(_question, session_id=None):
+     """Format a user message with attached files, supporting image and video display."""
+     user_text = _question.text if _question.text else ""
+     files = _question.files if _question.files else []
+
+     if not files:
+         return user_text, []
+
+     # Save media files to the persistent cache
+     saved_media = []
+     if session_id:
+         saved_media = save_media_to_persistent_cache(_question, session_id)
+
+     if len(files) == 1:
+         file = files[0]
+         file_type = check_mm_type(file)
+
+         # If it is an image or video and was saved to the cache
+         if file_type in ["image", "video"] and saved_media:
+             media_info = saved_media[0]
+             if file_type == "image":
+                 if user_text:
+                     return f"🖼️ {user_text}", saved_media
+                 else:
+                     return "🖼️ Image", saved_media
+             elif file_type == "video":
+                 if user_text:
+                     return f"🎬 {user_text}", saved_media
+                 else:
+                     return "🎬 Video", saved_media
+         else:
+             # Other file types: use a text description
+             return f"[1 file uploaded] {user_text}", saved_media
+     else:
+         # Multiple files: count each type
+         image_count = len([m for m in saved_media if m['type'] == 'image'])
+         video_count = len([m for m in saved_media if m['type'] == 'video'])
+         other_count = len(files) - image_count - video_count
+
+         # Build the description text
+         parts = []
+         if image_count > 0:
+             parts.append(f"{image_count} image{'s' if image_count > 1 else ''}")
+         if video_count > 0:
+             parts.append(f"{video_count} video{'s' if video_count > 1 else ''}")
+         if other_count > 0:
+             parts.append(f"{other_count} other file{'s' if other_count > 1 else ''}")
+
+         if parts:
+             files_desc = ", ".join(parts)
+             return f"[{files_desc} uploaded] {user_text}", saved_media
+         else:
+             return f"[{len(files)} files uploaded] {user_text}", saved_media
+
+
+ def update_media_gallery(app_session):
+     """Update the media gallery display (images and videos)."""
+     import os
+     media_cache = app_session.get('media_cache', [])
+
+     if not media_cache:
+         return gr.update(value=[], visible=False)
+
+     # Collect the paths of all cached media files (both images and videos are supported)
+     media_paths = [media_info['cached_path'] for media_info in media_cache if os.path.exists(media_info['cached_path'])]
+
+     if media_paths:
+         return gr.update(value=media_paths, visible=True)
+     else:
+         return gr.update(value=[], visible=False)
+
+
+ def format_fewshot_user_message(image_path, user_text):
+     """Format a FewShot user message, supporting image display."""
+     if image_path and user_text:
+         return (user_text, image_path)
+     elif image_path:
+         return ("", image_path)
+     else:
+         return user_text
+
+
+ # Main chat functions
+ def chat_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
+     """Call the model directly for chat (non-streaming)."""
+     default_params = {"num_beams": 3, "repetition_penalty": 1.2, "max_new_tokens": 16284}
+     if params is None:
+         params = default_params
+
+     use_streaming = params.get('stream', False)
+
+     if use_streaming:
+         return chat_stream_direct(img_b64, msgs, ctx, params, vision_hidden_states, temporal_ids, session_id)
+     else:
+         # Build the request payload
+         query = {
+             "image": img_b64,
+             "question": json.dumps(msgs, ensure_ascii=True),
+             "params": json.dumps(params, ensure_ascii=True),
+         }
+
+         if temporal_ids:
+             query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
+
+         if session_id:
+             query["session_id"] = session_id
+
+         try:
+             # Call the model directly
+             result = global_model.handler(query)
+             raw_result = result['result']
+
+             # Clean up the result
+             import re
+             cleaned_result = re.sub(r'(<box>.*</box>)', '', raw_result)
+             cleaned_result = cleaned_result.replace('<ref>', '')
+             cleaned_result = cleaned_result.replace('</ref>', '')
+             cleaned_result = cleaned_result.replace('<box>', '')
+             cleaned_result = cleaned_result.replace('</box>', '')
+
+             # Parse the thinking process
+             thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
+             thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+             formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+             formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+
+             context_result = formal_answer_raw if formal_answer_raw else cleaned_result
+             return 0, formatted_result, context_result, None
+
+         except Exception as e:
+             print(f"Chat error: {e}")
+             import traceback
+             traceback.print_exc()
+             return -1, ERROR_MSG, None, None
+
+
+ def chat_stream_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
+     """Call the model directly for streaming chat."""
+     try:
+         # Build the request payload
+         query = {
+             "image": img_b64,
+             "question": json.dumps(msgs, ensure_ascii=True),
+             "params": json.dumps(params, ensure_ascii=True),
+         }
+
+         if temporal_ids:
+             query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
+
+         if session_id:
+             query["session_id"] = session_id
+
+         # Call the streaming handler directly
+         generator = global_model.stream_handler(query)
+
+         full_response = ""
+         for chunk in generator:
+             full_response += chunk
+
+         if not full_response:
+             return -1, ERROR_MSG, None, None
+
+         # Clean up the result
+         import re
+         cleaned_result = re.sub(r'(<box>.*</box>)', '', full_response)
+         cleaned_result = cleaned_result.replace('<ref>', '')
+         cleaned_result = cleaned_result.replace('</ref>', '')
+         cleaned_result = cleaned_result.replace('<box>', '')
+         cleaned_result = cleaned_result.replace('</box>', '')
+
+         # Parse the thinking process
+         thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
+         thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+         formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+         formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+
+         context_result = formal_answer_raw if formal_answer_raw else cleaned_result
+         return 0, formatted_result, context_result, None
+
+     except Exception as e:
+         print(f"Stream chat error: {e}")
+         import traceback
+         traceback.print_exc()
+         return -1, ERROR_MSG, None, None
+
+
+ def chat_stream_character_generator(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, stop_control=None, session_id=None):
+     """Character-level streaming generator."""
+     print("[chat_stream_character_generator] Starting character-level streaming")
+     print(f"[chat_stream_character_generator] stop_control: {stop_control}")
+
+     try:
+         # Build the request payload
+         query = {
+             "image": img_b64,
+             "question": json.dumps(msgs, ensure_ascii=True),
+             "params": json.dumps(params, ensure_ascii=True),
+         }
+
+         if temporal_ids:
+             query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
+
+         if session_id:
+             query["session_id"] = session_id
+
+         # Call the streaming handler - it now returns the full response rather than a generator
+         full_response = global_model.stream_handler(query)
+
+         # Clean up the response
+         import re
+         clean_response = re.sub(r'(<box>.*</box>)', '', full_response)
+         clean_response = clean_response.replace('<ref>', '')
+         clean_response = clean_response.replace('</ref>', '')
+         clean_response = clean_response.replace('<box>', '')
+         clean_response = clean_response.replace('</box>', '')
+
+         # Yield character by character to simulate streaming output
+         char_count = 0
+         for char in clean_response:
+             # Check the stop flag
+             if stop_control and stop_control.get('stop_streaming', False):
+                 print(f"[chat_stream_character_generator] *** Stop signal received at character {char_count} ***")
+                 break
+
+             char_count += 1
+             if char_count % 10 == 0:
+                 print(f"[chat_stream_character_generator] {char_count} characters emitted, stop_flag: {stop_control.get('stop_streaming', False) if stop_control else 'None'}")
+
+             yield char
+
+             # Small delay to simulate a streaming effect
+             import time
+             time.sleep(0.01)
+
+         print(f"[chat_stream_character_generator] Streaming finished, {char_count} characters emitted in total")
+
+     except Exception as e:
+         print(f"[chat_stream_character_generator] Exception: {e}")
+         error_msg = f"Stream error: {str(e)}"
+         for char in error_msg:
+             yield char
+
+
+ # UI component factory functions
+ def create_component(params, comp='Slider'):
+     if comp == 'Slider':
+         return gr.Slider(
+             minimum=params['minimum'],
+             maximum=params['maximum'],
+             value=params['value'],
+             step=params['step'],
+             interactive=params['interactive'],
+             label=params['label']
+         )
+     elif comp == 'Radio':
+         return gr.Radio(
+             choices=params['choices'],
+             value=params['value'],
+             interactive=params['interactive'],
+             label=params['label']
+         )
+     elif comp == 'Button':
+         return gr.Button(
+             value=params['value'],
+             interactive=True
+         )
+     elif comp == 'Checkbox':
+         return gr.Checkbox(
+             value=params['value'],
+             interactive=params['interactive'],
+             label=params['label'],
+             info=params.get('info', None)
+         )
+
+
+ def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
+     # Use standard Gradio components instead of MultimodalInput, with preview support
+     return gr.File(
+         file_count="multiple",
+         file_types=["image", "video"],
+         label="Upload Images/Videos",
+         interactive=not (upload_image_disabled and upload_video_disabled),
+         show_label=True,
+         height=200  # set a height so previews are visible
+     )
+
+
+ # UI control functions
+ def update_streaming_mode_state(params_form):
+     """Update the streaming-mode state based on the decode type."""
+     if params_form == 'Beam Search':
+         return gr.update(value=False, interactive=False, info="Beam Search mode does not support streaming output")
+     else:
+         return gr.update(value=True, interactive=True, info="Enable real-time streaming response")
+
+
+ def stop_streaming(_app_cfg):
+     """Stop streaming output."""
+     _app_cfg['stop_streaming'] = True
+     print("[stop_streaming] Set stop flag to True")
+     return _app_cfg
+
+
+ def reset_stop_flag(_app_cfg):
+     """Reset the stop flag."""
+     _app_cfg['stop_streaming'] = False
+     print("[reset_stop_flag] Reset stop flag to False")
+     return _app_cfg
+
+
+ def check_and_handle_stop(_app_cfg, context="unknown"):
+     """Check the stop flag."""
+     should_stop = _app_cfg.get('stop_streaming', False)
+     is_streaming = _app_cfg.get('is_streaming', False)
+
+     if should_stop:
+         print(f"[check_and_handle_stop] *** Stop signal detected at {context} ***")
+         print(f"[check_and_handle_stop] stop_streaming: {should_stop}, is_streaming: {is_streaming}")
+         return True
+     return False
+
+
+ def stop_button_clicked(_app_cfg):
+     """Handle a stop-button click."""
+     print("[stop_button_clicked] *** Stop button clicked ***")
+     print(f"[stop_button_clicked] Current state - is_streaming: {_app_cfg.get('is_streaming', False)}")
+     print(f"[stop_button_clicked] Current state - stop_streaming: {_app_cfg.get('stop_streaming', False)}")
+
+     _app_cfg['stop_streaming'] = True
+     _app_cfg['is_streaming'] = False
+     print("[stop_button_clicked] Set stop_streaming = True, is_streaming = False")
+
+     return _app_cfg, gr.update(visible=False)
+
+
+ # Main response functions
+ def respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+     """Streaming response generator."""
+     print(f"[respond_stream] Called with streaming_mode: {streaming_mode}, fps_setting: {fps_setting}")
+
+     _app_cfg['is_streaming'] = True
+     _app_cfg['stop_streaming'] = False
+
+     if params_form == 'Beam Search':
+         streaming_mode = False
+         print("[respond_stream] Beam Search mode; streaming forcibly disabled")
+         _app_cfg['is_streaming'] = False
+
+     _context = _app_cfg['ctx'].copy()
+     encoded_message, temporal_ids = encode_message(_question, fps_setting)
+     _context.append({'role': 'user', 'contents': encoded_message})
+
+     images_cnt = _app_cfg['images_cnt']
+     videos_cnt = _app_cfg['videos_cnt']
+     files_cnts = check_has_videos(_question)
+
+     if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
+         gr.Warning("Only a single video file is supported right now!")
+         yield create_multimodal_input(True, True), _chat_bot, _app_cfg, gr.update(visible=False)
+         return
+
+     if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
+         gr.Warning("Please chat with at least one image or video.")
+         yield create_multimodal_input(False, False), _chat_bot, _app_cfg, gr.update(visible=False)
+         return
+
+     if params_form == 'Beam Search':
+         params = {
+             'sampling': False,
+             'num_beams': 3,
+             'repetition_penalty': 1.2,
+             "max_new_tokens": 16284,
+             "enable_thinking": thinking_mode,
+             "stream": False
+         }
+     else:
+         params = {
+             'sampling': True,
+             'top_p': 0.8,
+             'top_k': 100,
+             'temperature': 0.7,
+             'repetition_penalty': 1.03,
+             "max_new_tokens": 16284,
+             "enable_thinking": thinking_mode,
+             "stream": streaming_mode
+         }
+
+     if files_cnts[1] + videos_cnt > 0:
+         params["max_inp_length"] = 2048 * 10
+         params["use_image_id"] = False
+         params["max_slice_nums"] = 1
+
+     images_cnt += files_cnts[0]
+     videos_cnt += files_cnts[1]
+
+     # Build the user-message display (streaming mode)
+     user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
+
+     # Save media info into the session state
+     if saved_images:
+         if 'media_cache' not in _app_cfg:
+             _app_cfg['media_cache'] = []
+         _app_cfg['media_cache'].extend(saved_images)
+
+     _chat_bot.append((user_message, ""))
+     _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})
+
+     gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])
+
+     upload_image_disabled = videos_cnt > 0
+     upload_video_disabled = videos_cnt > 0 or images_cnt > 0
+
+     yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
+
+     print("[respond_stream] Starting the character-level streaming loop")
+     char_count = 0
+     accumulated_content = ""
+
+     for _char in gen:
+         char_count += 1
+
+         if check_and_handle_stop(_app_cfg, f"character {char_count}"):
+             break
+
+         accumulated_content += _char
+         _context[-1]["contents"][0]["pairs"] += _char
+
+         # Show content in real time (thinking mode is also shown live)
+         if thinking_mode:
+             # Try to parse the accumulated content so far
+             thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)
+
+             # If complete thinking content was parsed, use the formatted display
+             if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
+                 thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+                 formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+                 formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+                 _chat_bot[-1] = (user_message, formatted_display)
+             else:
+                 # Thinking in progress or no complete tags yet; show the raw content directly (live streaming)
+                 _chat_bot[-1] = (user_message, accumulated_content)
+         else:
+             # Non-thinking mode: show the accumulated content directly
+             _chat_bot[-1] = (user_message, accumulated_content)
+
+         if char_count % 5 == 0:  # update more frequently for a smoother streaming experience
+             print(f"[respond_stream] {char_count} characters processed, stop_flag: {_app_cfg.get('stop_streaming', False)}")
+             yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
+             time.sleep(0.02)  # slightly longer delay to avoid overly frequent updates
+         else:
+             yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
+
+     if _app_cfg.get('stop_streaming', False):
+         print("[respond_stream] Streaming stopped")
+
+     # Final thinking formatting
+     final_content = accumulated_content
+     if thinking_mode:
+         thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(final_content)
+         thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+         formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+         formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+
+         _chat_bot[-1] = (user_message, formatted_result)
+         _context[-1]["contents"][0]["pairs"] = formal_answer_raw if formal_answer_raw else final_content
+     else:
+         _chat_bot[-1] = (user_message, final_content)
+         _context[-1]["contents"][0]["pairs"] = final_content
+
+     _app_cfg['ctx'] = _context
+     _app_cfg['images_cnt'] = images_cnt
+     _app_cfg['videos_cnt'] = videos_cnt
+     _app_cfg['is_streaming'] = False
+
+     upload_image_disabled = videos_cnt > 0
+     upload_video_disabled = videos_cnt > 0 or images_cnt > 0
+     yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
+
+
1145
+ def respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
1146
+ """主响应函数"""
1147
+ if 'session_id' not in _app_cfg:
1148
+ _app_cfg['session_id'] = uuid.uuid4().hex[:16]
1149
+ print(f"[会话] 为现有会话生成session_id: {_app_cfg['session_id']}")
1150
+
1151
+ # 记录thinking模式状态变化
1152
+ prev_thinking_mode = _app_cfg.get('last_thinking_mode', False)
1153
+ _app_cfg['last_thinking_mode'] = thinking_mode
1154
+
1155
+ if prev_thinking_mode != thinking_mode:
1156
+ print(f"[respond] Thinking模式切换: {prev_thinking_mode} -> {thinking_mode}")
1157
+ # 强制清理可能的缓存状态
1158
+ if hasattr(_app_cfg, 'thinking_cache'):
1159
+ del _app_cfg['thinking_cache']
1160
+ # 添加额外的状态重置
1161
+ if thinking_mode and not prev_thinking_mode:
1162
+ print("[respond] 启用thinking模式,重置相关状态")
1163
+ _app_cfg['thinking_enabled'] = True
1164
+ elif not thinking_mode and prev_thinking_mode:
1165
+ print("[respond] 禁用thinking模式")
1166
+ _app_cfg['thinking_enabled'] = False
1167
+
1168
+ if params_form == 'Beam Search':
1169
+ streaming_mode = False
1170
+ print(f"[respond] Beam Search模式,强制禁用流式模式")
1171
+
1172
+ if streaming_mode:
1173
+ print("[respond] 选择流式模式")
1174
+ yield from respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting)
1175
+ return
1176
+
1177
+ # 非流式模式
1178
+ _context = _app_cfg['ctx'].copy()
1179
+ encoded_message, temporal_ids = encode_message(_question, fps_setting)
1180
+ _context.append({'role': 'user', 'contents': encoded_message})
1181
+
1182
+ images_cnt = _app_cfg['images_cnt']
1183
+ videos_cnt = _app_cfg['videos_cnt']
1184
+ files_cnts = check_has_videos(_question)
1185
+ if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
1186
+ gr.Warning("Only supports single video file input right now!")
1187
+ upload_image_disabled = videos_cnt > 0
1188
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
1189
+ yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1190
+ return
1191
+ if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
1192
+ gr.Warning("Please chat with at least one image or video.")
1193
+ upload_image_disabled = videos_cnt > 0
1194
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
1195
+ yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1196
+ return
1197
+
1198
+ if params_form == 'Beam Search':
1199
+ params = {
1200
+ 'sampling': False,
1201
+ 'num_beams': 3,
1202
+ 'repetition_penalty': 1.2,
1203
+ "max_new_tokens": 16284,
1204
+ "enable_thinking": thinking_mode,
1205
+ "stream": False
1206
+ }
1207
+ else:
1208
+ params = {
1209
+ 'sampling': True,
1210
+ 'top_p': 0.8,
1211
+ 'top_k': 100,
1212
+ 'temperature': 0.7,
1213
+ 'repetition_penalty': 1.03,
1214
+ "max_new_tokens": 16284,
1215
+ "enable_thinking": thinking_mode,
1216
+ "stream": False
1217
+ }
1218
+
1219
+ if files_cnts[1] + videos_cnt > 0:
1220
+ params["max_inp_length"] = 2048 * 10
1221
+ params["use_image_id"] = False
1222
+ params["max_slice_nums"] = 1
1223
+
1224
+ # 调用聊天函数
1225
+ code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])
1226
+
1227
+ images_cnt += files_cnts[0]
1228
+ videos_cnt += files_cnts[1]
1229
+
1230
+ if code == 0:
1231
+ context_content = _context_answer if _context_answer else _answer
1232
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})
1233
+
1234
+ # 根据thinking_mode决定是否应用thinking格式化
1235
+ if thinking_mode:
1236
+ thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(_answer)
1237
+ thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
1238
+ formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
1239
+ print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, thinking_content: '{thinking_content_raw[:50]}...'")
1240
+ formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
1241
+ else:
1242
+ print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, 使用原始回答")
1243
+ formatted_result = _answer
1244
+
1245
+ # 构建用户消息显示
1246
+ user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
1247
+
1248
+ # 将媒体信息保存到会话状态中
1249
+ if saved_images:
1250
+ if 'media_cache' not in _app_cfg:
1251
+ _app_cfg['media_cache'] = []
1252
+ _app_cfg['media_cache'].extend(saved_images)
1253
+
1254
+ _chat_bot.append((user_message, formatted_result))
1255
+
1256
+ _app_cfg['ctx'] = _context
1257
+ _app_cfg['sts'] = sts
1258
+ else:
1259
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": "Error occurred during processing"}]})
1260
+ # 构建用户消息显示(错误情况)
1261
+ user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
1262
+
1263
+ # 将媒体信息保存到会话状态中
1264
+ if saved_images:
1265
+ if 'media_cache' not in _app_cfg:
1266
+ _app_cfg['media_cache'] = []
1267
+ _app_cfg['media_cache'].extend(saved_images)
1268
+
1269
+ _chat_bot.append((user_message, "Error occurred during processing"))
1270
+
1271
+ _app_cfg['images_cnt'] = images_cnt
1272
+ _app_cfg['videos_cnt'] = videos_cnt
1273
+ _app_cfg['is_streaming'] = False
1274
+
1275
+ upload_image_disabled = videos_cnt > 0
1276
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
1277
+
1278
+ # 统一使用yield返回结果,确保与流式模式兼容
1279
+ yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1280
+
1281
+
1282
+ # FewShot相关函数
1283
+ def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
1284
+ if 'session_id' not in _app_cfg:
1285
+ _app_cfg['session_id'] = uuid.uuid4().hex[:16]
1286
+ print(f"[会话] 为FewShot示例生成session_id: {_app_cfg['session_id']}")
1287
+
1288
+ ctx = _app_cfg["ctx"]
1289
+
1290
+ # 构建用户消息
1291
+ user_msg = ""
1292
+ if _image is not None:
1293
+ image = Image.open(_image).convert("RGB")
1294
+ ctx.append({"role": "user", "contents": [
1295
+ *encode_image(image),
1296
+ {"type": "text", "pairs": _user_message}
1297
+ ]})
1298
+ user_msg = f"[Image uploaded] {_user_message}"
1299
+ else:
1300
+ if _user_message:
1301
+ ctx.append({"role": "user", "contents": [{"type": "text", "pairs": _user_message}]})
1302
+ user_msg = _user_message
1303
+
1304
+ # 构建助手消息
1305
+ if _assistant_message:
1306
+ ctx.append({"role": "assistant", "contents": [{"type": "text", "pairs": _assistant_message}]})
1307
+
1308
+ # 只有当用户消息和助手消息都存在时才添加到聊天记录
1309
+ if user_msg and _assistant_message:
1310
+ formatted_user_msg = format_fewshot_user_message(_image, _user_message) if _image else user_msg
1311
+ _chat_bot.append([formatted_user_msg, _assistant_message])
1312
+
1313
+ return None, "", "", _chat_bot, _app_cfg
1314
+
1315
+
1316
+ def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
1317
+ """FewShot响应函数"""
1318
+ print(f"[fewshot_respond] Called with streaming_mode: {streaming_mode}")
1319
+
1320
+ if 'session_id' not in _app_cfg:
1321
+ _app_cfg['session_id'] = uuid.uuid4().hex[:16]
1322
+ print(f"[会话] 为FewShot会话生成session_id: {_app_cfg['session_id']}")
1323
+
1324
+ if params_form == 'Beam Search':
1325
+ streaming_mode = False
1326
+ print(f"[fewshot_respond] Beam Search模式,强制禁用流式模式")
1327
+
1328
+ user_message_contents = []
1329
+ _context = _app_cfg["ctx"].copy()
1330
+ images_cnt = _app_cfg["images_cnt"]
1331
+ temporal_ids = None
1332
+
1333
+ if _image:
1334
+ image = Image.open(_image).convert("RGB")
1335
+ user_message_contents += encode_image(image)
1336
+ images_cnt += 1
1337
+ if _user_message:
1338
+ user_message_contents += [{"type": "text", "pairs": _user_message}]
1339
+ if user_message_contents:
1340
+ _context.append({"role": "user", "contents": user_message_contents})
1341
+
1342
+ if params_form == 'Beam Search':
1343
+ params = {
1344
+ 'sampling': False,
1345
+ 'num_beams': 3,
1346
+ 'repetition_penalty': 1.2,
1347
+ "max_new_tokens": 16284,
1348
+ "enable_thinking": thinking_mode,
1349
+ "stream": False
1350
+ }
1351
+ else:
1352
+ params = {
1353
+ 'sampling': True,
1354
+ 'top_p': 0.8,
1355
+ 'top_k': 100,
1356
+ 'temperature': 0.7,
1357
+ 'repetition_penalty': 1.03,
1358
+ "max_new_tokens": 16284,
1359
+ "enable_thinking": thinking_mode,
1360
+ "stream": streaming_mode
1361
+ }
1362
+
1363
+ if disable_text_only and images_cnt == 0:
1364
+ gr.Warning("Please chat with at least one image or video.")
1365
+ yield _image, _user_message, '', _chat_bot, _app_cfg
1366
+ return
1367
+
1368
+ if streaming_mode:
1369
+ print(f"[fewshot_respond] Using streaming mode")
1370
+ _app_cfg['is_streaming'] = True
1371
+ _app_cfg['stop_streaming'] = False
1372
+
1373
+ if _image:
1374
+ user_msg = format_fewshot_user_message(_image, _user_message)
1375
+ _chat_bot.append([user_msg, ""])
1376
+ else:
1377
+ _chat_bot.append([_user_message, ""])
1378
+
1379
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})
1380
+
1381
+ _app_cfg['stop_streaming'] = False
1382
+
1383
+ gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])
1384
+
1385
+ yield _image, _user_message, '', _chat_bot, _app_cfg
1386
+
1387
+ accumulated_content = ""
1388
+ for _char in gen:
1389
+ if _app_cfg.get('stop_streaming', False):
1390
+ print("[fewshot_respond] 收到停止信号,中断流式响应")
1391
+ break
1392
+
1393
+ accumulated_content += _char
1394
+ _context[-1]["contents"][0]["pairs"] += _char
1395
+
1396
+ # 实时解析和格式化thinking内容
1397
+ if thinking_mode:
1398
+ # 尝试解析当前累积的内容
1399
+ thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)
1400
+
1401
+ # 如果解析出了完整的thinking内容,使用格式化显示
1402
+ if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
1403
+ thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
1404
+ formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
1405
+ formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
1406
+ _chat_bot[-1] = (_chat_bot[-1][0], formatted_display)
1407
+ else:
1408
+ # 正在thinking过程中或者还没有完整标签,直接显示原始内容(实时流式)
1409
+ _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
1410
+ else:
1411
+ # 非thinking模式,直接显示累积内容
1412
+ _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
1413
+
1414
+ yield _image, _user_message, '', _chat_bot, _app_cfg
1415
+
1416
+ final_content = _context[-1]["contents"][0]["pairs"]
1417
+
1418
+ _app_cfg['ctx'] = _context
1419
+ _app_cfg['images_cnt'] = images_cnt
1420
+ _app_cfg['is_streaming'] = False
1421
+
1422
+ yield _image, '', '', _chat_bot, _app_cfg
1423
+
1424
+ else:
1425
+ # 非流式模式
1426
+ code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])
1427
+
1428
+ context_content = _context_answer if _context_answer else _answer
1429
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})
1430
+
1431
+ if _image:
1432
+ user_msg = format_fewshot_user_message(_image, _user_message)
1433
+ _chat_bot.append([user_msg, _answer])
1434
+ else:
1435
+ _chat_bot.append([_user_message, _answer])
1436
+
1437
+ if code == 0:
1438
+ _app_cfg['ctx'] = _context
1439
+ _app_cfg['sts'] = sts
1440
+ _app_cfg['images_cnt'] = images_cnt
1441
+
1442
+ _app_cfg['is_streaming'] = False
1443
+ yield None, '', '', _chat_bot, _app_cfg
1444
+
1445
+
+ # Other UI functions
+ def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+     print(f"[regenerate] streaming_mode: {streaming_mode}")
+     print(f"[regenerate] thinking_mode: {thinking_mode}")
+     print(f"[regenerate] chat_type: {_app_cfg.get('chat_type', 'unknown')}")
+ 
+     if params_form == 'Beam Search':
+         streaming_mode = False
+         print("[regenerate] Beam Search mode: streaming is force-disabled")
+ 
+     if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
+         gr.Warning('No question for regeneration.')
+         yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+         return
+ 
+     if _app_cfg["chat_type"] == "Chat":
+         images_cnt = _app_cfg['images_cnt']
+         videos_cnt = _app_cfg['videos_cnt']
+         _question = _chat_bot[-1][0]
+         _chat_bot = _chat_bot[:-1]
+         _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
+         files_cnts = check_has_videos(_question)
+         images_cnt -= files_cnts[0]
+         videos_cnt -= files_cnts[1]
+         _app_cfg['images_cnt'] = images_cnt
+         _app_cfg['videos_cnt'] = videos_cnt
+ 
+         print(f"[regenerate] About to call respond with streaming_mode: {streaming_mode}")
+         for result in respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+             new_input, _chat_bot, _app_cfg, _stop_button = result
+             _question = new_input
+             yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+     else:
+         # With the tuples format, _chat_bot[-1][0] is a plain string
+         last_user_message = _chat_bot[-1][0]
+         last_image = None
+ 
+         # Check whether the message carries an image marker
+         if "[Image uploaded]" in last_user_message:
+             # Extract the actual user message from the text
+             last_user_message = last_user_message.replace("[Image uploaded] ", "")
+             # Note: with the simplified tuples format the original image file
+             # cannot be recovered here; handle this as needed
+         _chat_bot = _chat_bot[:-1]
+         _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
+ 
+         print(f"[regenerate] About to call fewshot_respond with streaming_mode: {streaming_mode}")
+         for result in fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+             _image, _user_message, _assistant_message, _chat_bot, _app_cfg = result
+             yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+ 
+ 
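The regeneration flow above only pops the last exchange and replays it: one chat turn is one `[user, assistant]` pair in the chatbot list but two entries in the context list, hence `[:-1]` versus `[:-2]`. A stand-alone sketch with illustrative data:

    def pop_last_exchange(chat_bot, ctx):
        # One turn = one [user, assistant] pair, but two context entries.
        question = chat_bot[-1][0]
        return question, chat_bot[:-1], ctx[:-2]

    chat = [["hi", "hello"], ["describe this", "a cat"]]
    ctx = [{"role": "user"}, {"role": "assistant"}] * 2
    q, chat, ctx = pop_last_exchange(chat, ctx)
    print(q, len(chat), len(ctx))  # describe this 1 2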
+ def flushed():
+     return gr.update(interactive=True)
+ 
+ 
+ def clear_media_cache(session_id):
+     """Clean up the media cache for the given session."""
+     import shutil
+     from pathlib import Path
+ 
+     try:
+         cache_dir = Path("./media_cache") / session_id
+         if cache_dir.exists():
+             shutil.rmtree(cache_dir)
+             print(f"[clear_media_cache] Cleared media cache for session {session_id}")
+     except Exception as e:
+         print(f"[clear_media_cache] Failed to clear media cache: {e}")
+ 
+ 
+ def clear(txt_input, file_upload, chat_bot, app_session):
+     # Clean up the previous session's media cache
+     if 'session_id' in app_session:
+         clear_media_cache(app_session['session_id'])
+ 
+     chat_bot = copy.deepcopy(init_conversation)
+     app_session['sts'] = None
+     app_session['ctx'] = []
+     app_session['images_cnt'] = 0
+     app_session['videos_cnt'] = 0
+     app_session['stop_streaming'] = False
+     app_session['is_streaming'] = False
+     app_session['media_cache'] = []  # reset cached media info
+     app_session['last_thinking_mode'] = False  # reset thinking-mode state
+     app_session['session_id'] = uuid.uuid4().hex[:16]
+     print(f"[session] Generated new session ID: {app_session['session_id']}")
+     return "", None, gr.update(value=[], visible=False), gr.update(value=[], visible=False), chat_bot, app_session, None, '', ''
+ 
+ 
+ def select_chat_type(_tab, _app_cfg):
+     _app_cfg["chat_type"] = _tab
+     return _app_cfg
+ 
+ 
+ # UI configuration
+ form_radio = {
+     'choices': ['Beam Search', 'Sampling'],
+     'value': 'Sampling',
+     'interactive': True,
+     'label': 'Decode Type'
+ }
+ 
+ thinking_checkbox = {
+     'value': False,
+     'interactive': True,
+     'label': 'Enable Thinking Mode',
+ }
+ 
+ streaming_checkbox = {
+     'value': True,
+     'interactive': True,
+     'label': 'Enable Streaming Mode',
+ }
+ 
+ fps_slider = {
+     'minimum': 1,
+     'maximum': 20,
+     'value': 3,
+     'step': 1,
+     'interactive': True,
+     'label': 'Custom FPS for Video Processing'
+ }
+ 
+ init_conversation = [
+     ["", "You can talk to me now"]
+ ]
+ 
+ css = """
+ video { height: auto !important; }
+ .example label { font-size: 16px;}
+ 
+ /* Current Media Gallery scrollbar styles - class selectors are safer here */
+ .current-media-gallery {
+     overflow-y: auto !important;
+     max-height: 600px !important;
+     position: relative !important;
+ }
+ 
+ /* Only affect the inside of this specific Gallery container */
+ .current-media-gallery > div,
+ .current-media-gallery .gallery-container {
+     overflow-y: auto !important;
+     max-height: 580px !important;
+ }
+ 
+ .current-media-gallery .gallery-item {
+     margin-bottom: 10px !important;
+ }
+ 
+ /* Custom scrollbar styles for the Current Media Gallery only */
+ .current-media-gallery::-webkit-scrollbar,
+ .current-media-gallery > div::-webkit-scrollbar,
+ .current-media-gallery .gallery-container::-webkit-scrollbar {
+     width: 8px !important;
+ }
+ 
+ .current-media-gallery::-webkit-scrollbar-track,
+ .current-media-gallery > div::-webkit-scrollbar-track,
+ .current-media-gallery .gallery-container::-webkit-scrollbar-track {
+     background: #f1f1f1 !important;
+     border-radius: 4px !important;
+ }
+ 
+ .current-media-gallery::-webkit-scrollbar-thumb,
+ .current-media-gallery > div::-webkit-scrollbar-thumb,
+ .current-media-gallery .gallery-container::-webkit-scrollbar-thumb {
+     background: #c1c1c1 !important;
+     border-radius: 4px !important;
+ }
+ 
+ .current-media-gallery::-webkit-scrollbar-thumb:hover,
+ .current-media-gallery > div::-webkit-scrollbar-thumb:hover,
+ .current-media-gallery .gallery-container::-webkit-scrollbar-thumb:hover {
+     background: #a8a8a8 !important;
+ }
+ 
+ /* Hide unnecessary Current Media elements */
+ .current-media-gallery .upload-container,
+ .current-media-gallery .drop-zone,
+ .current-media-gallery .file-upload,
+ .current-media-gallery .upload-text,
+ .current-media-gallery .drop-text {
+     display: none !important;
+ }
+ 
+ .current-media-gallery .clear-button,
+ .current-media-gallery .delete-button,
+ .current-media-gallery .remove-button {
+     display: none !important;
+ }
+ 
+ /* Hide the label and placeholder text when the Gallery is empty */
+ .current-media-gallery:not([style*="display: none"]) .gallery-container:empty::after {
+     content: "";
+     display: none;
+ }
+ 
+ .current-media-gallery .empty-gallery-text,
+ .current-media-gallery .placeholder-text {
+     display: none !important;
+ }
+ 
+ /* Keep the custom scrollbar from affecting other components */
+ .current-media-gallery {
+     isolation: isolate !important;
+ }
+ 
+ /* Reset scrollbar styles on other Gallery components so they are not polluted */
+ .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar {
+     width: initial !important;
+ }
+ 
+ .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-track {
+     background: initial !important;
+     border-radius: initial !important;
+ }
+ 
+ .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-thumb {
+     background: initial !important;
+     border-radius: initial !important;
+ }
+ 
+ /* Keep the chatbot unaffected */
+ .thinking-chatbot::-webkit-scrollbar {
+     width: initial !important;
+ }
+ 
+ .thinking-chatbot::-webkit-scrollbar-track {
+     background: initial !important;
+ }
+ 
+ .thinking-chatbot::-webkit-scrollbar-thumb {
+     background: initial !important;
+ }
+ 
+ /* Styles for the thinking process and the formal answer */
+ .response-container {
+     margin: 10px 0;
+ }
+ 
+ .thinking-section {
+     background: linear-gradient(135deg, #f8f9ff 0%, #f0f4ff 100%);
+     border: 1px solid #d1d9ff;
+     border-radius: 12px;
+     padding: 16px;
+     margin-bottom: 0px;
+     box-shadow: 0 2px 8px rgba(67, 90, 235, 0.1);
+ }
+ 
+ .thinking-header {
+     font-weight: 600;
+     color: #4c5aa3;
+     font-size: 14px;
+     margin-bottom: 12px;
+     display: flex;
+     align-items: center;
+     gap: 8px;
+ }
+ 
+ .thinking-content {
+     color: #5a6ba8;
+     font-size: 13px;
+     line-height: 1;
+     font-style: italic;
+     background: rgba(255, 255, 255, 0.6);
+     padding: 12px;
+     border-radius: 8px;
+     border-left: 3px solid #4c5aa3;
+     white-space: pre-wrap;
+ }
+ 
+ .formal-section {
+     background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%);
+     border: 1px solid #e9ecef;
+     border-radius: 12px;
+     padding: 16px;
+     box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
+ }
+ 
+ .formal-header {
+     font-weight: 600;
+     color: #28a745;
+     font-size: 14px;
+     margin-bottom: 12px;
+     display: flex;
+     align-items: center;
+     gap: 8px;
+ }
+ 
+ .formal-content {
+     color: #333;
+     font-size: 14px;
+     line-height: 1;
+     white-space: pre-wrap;
+ }
+ 
+ /* Chatbot container styles */
+ .thinking-chatbot .message {
+     border-radius: 12px;
+     overflow: visible;
+     margin-top: 0 !important;
+     margin-bottom: 0 !important;
+ }
+ 
+ .thinking-chatbot .message-wrap {
+     margin-top: 0 !important;
+     margin-bottom: 0 !important;
+ }
+ 
+ .thinking-chatbot .message.bot {
+     background: transparent !important;
+     border: none !important;
+     padding: 8px !important;
+ }
+ 
+ .thinking-chatbot .message.bot .content {
+     background: transparent !important;
+ }
+ """
+ 
+ introduction = """
+ ## Features:
+ 1. Chat with a single image
+ 2. Chat with multiple images
+ 3. Chat with video
+ 4. Streaming Mode: real-time response streaming
+ 5. Thinking Mode: show the model's reasoning process
+ 
+ Click the `How to use` tab to see examples.
+ """
+ 
+ 
+ # Main application
+ def create_app():
+     with gr.Blocks(css=css) as demo:
+         with gr.Tab(model_name):
+             with gr.Row():
+                 with gr.Column(scale=1, min_width=300):
+                     gr.Markdown(value=introduction)
+                     params_form = create_component(form_radio, comp='Radio')
+                     thinking_mode = create_component(thinking_checkbox, comp='Checkbox')
+                     streaming_mode = create_component(streaming_checkbox, comp='Checkbox')
+ 
+                     fps_setting = create_component(fps_slider, comp='Slider')
+                     regenerate = create_component({'value': 'Regenerate'}, comp='Button')
+                     clear_button = create_component({'value': 'Clear History'}, comp='Button')
+ 
+                     stop_button = gr.Button("Stop", visible=False)
+ 
+                 with gr.Column(scale=3, min_width=500):
+                     initial_session_id = uuid.uuid4().hex[:16]
+                     print(f"[session] Initialized session, generated session_id: {initial_session_id}")
+                     app_session = gr.State({
+                         'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0,
+                         'chat_type': 'Chat', 'stop_streaming': False, 'is_streaming': False,
+                         'session_id': initial_session_id, 'media_cache': [], 'last_thinking_mode': False
+                     })
+                     with gr.Row():
+                         with gr.Column(scale=4):
+                             chat_bot = gr.Chatbot(
+                                 label=f"Chat with {model_name}",
+                                 value=copy.deepcopy(init_conversation),
+                                 height=600,
+                                 elem_classes="thinking-chatbot"
+                             )
+                         with gr.Column(scale=1, min_width=200):
+                             current_images = gr.Gallery(
+                                 label="Current Media",
+                                 show_label=True,
+                                 elem_id="current_media",
+                                 elem_classes="current-media-gallery",
+                                 columns=1,
+                                 rows=1,  # a single row so the content scrolls vertically
+                                 height=600,
+                                 visible=False,
+                                 container=True,  # enable container mode
+                                 allow_preview=True,  # allow preview
+                                 show_download_button=False,  # hide the download button
+                                 interactive=False,  # non-interactive, so users cannot upload/delete
+                                 show_share_button=False  # hide the share button
+                             )
+ 
+                     with gr.Tab("Chat") as chat_tab:
+                         chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
+ 
+                         with gr.Row():
+                             with gr.Column(scale=4):
+                                 txt_input = gr.Textbox(
+                                     placeholder="Type your message here...",
+                                     label="Message",
+                                     lines=2
+                                 )
+                             with gr.Column(scale=1):
+                                 submit_btn = gr.Button("Submit", variant="primary")
+ 
+                         with gr.Row():
+                             with gr.Column():
+                                 file_upload = create_multimodal_input()
+                                 # Image preview component
+                                 file_preview = gr.Gallery(
+                                     label="Uploaded Files Preview",
+                                     show_label=True,
+                                     elem_id="file_preview",
+                                     columns=3,
+                                     rows=2,
+                                     height="auto",
+                                     visible=False
+                                 )
+ 
+                         # Update the preview when files are uploaded
+                         def update_file_preview(files):
+                             if files:
+                                 # Keep only image files for the preview
+                                 image_files = []
+                                 for file in files:
+                                     if hasattr(file, 'name'):
+                                         file_path = file.name
+                                     else:
+                                         file_path = str(file)
+ 
+                                     # Check whether this is an image file
+                                     if any(file_path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']):
+                                         image_files.append(file_path)
+ 
+                                 if image_files:
+                                     return gr.update(value=image_files, visible=True)
+ 
+                             return gr.update(value=[], visible=False)
+ 
+                         file_upload.change(
+                             update_file_preview,
+                             inputs=[file_upload],
+                             outputs=[file_preview]
+                         )
+ 
+                         # Wrapper that adapts the new input format to respond()
+                         def handle_submit(message, files, chat_bot, current_images_gallery, app_session, params_form, thinking_mode, streaming_mode, fps_setting):
+                             print(f"[handle_submit] Received input: message='{message}', files={files}, chat_bot length={len(chat_bot)}")
+ 
+                             # Return immediately if both the message and the files are empty
+                             if not message and not files:
+                                 print("[handle_submit] Message and files are both empty; returning")
+                                 return message, files, chat_bot, current_images_gallery, app_session, gr.update(visible=False)
+ 
+                             # Mimic the original MultimodalInput format
+                             class MockInput:
+                                 def __init__(self, text, files):
+                                     self.text = text
+                                     self.files = files if files else []
+ 
+                             mock_question = MockInput(message, files)
+                             print(f"[handle_submit] Created MockInput: text='{mock_question.text}', files={len(mock_question.files)}")
+ 
+                             # respond() returns a generator; yield its results step by step
+                             result_generator = respond(mock_question, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting)
+ 
+                             # If it is a generator, yield incrementally
+                             if hasattr(result_generator, '__iter__') and not isinstance(result_generator, (str, bytes, tuple)):
+                                 print("[handle_submit] Using generator mode")
+                                 for result in result_generator:
+                                     new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result
+                                     print(f"[handle_submit] Yielding result: chat_bot length={len(updated_chat_bot)}")
+ 
+                                     # Update the media display
+                                     media_gallery_update = update_media_gallery(updated_app_session)
+ 
+                                     # Return in the expected output format
+                                     yield "", None, updated_chat_bot, media_gallery_update, updated_app_session, stop_btn_update
+                             else:
+                                 print("[handle_submit] Using non-generator mode")
+                                 # Not a generator; yield the single result
+                                 new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result_generator
+                                 print(f"[handle_submit] Returning result directly: chat_bot length={len(updated_chat_bot)}")
+ 
+                                 # Update the image display
+                                 image_gallery_update = update_image_gallery(updated_app_session)
+ 
+                                 yield "", None, updated_chat_bot, image_gallery_update, updated_app_session, stop_btn_update
+ 
+                         submit_btn.click(
+                             handle_submit,
+                             [txt_input, file_upload, chat_bot, current_images, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
+                             [txt_input, file_upload, chat_bot, current_images, app_session, stop_button]
+                         )
+ 
+                     with gr.Tab("Few Shot", visible=False) as fewshot_tab:
+                         fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
+                         with gr.Row():
+                             with gr.Column(scale=1):
+                                 image_input = gr.Image(type="filepath", sources=["upload"])
+                             with gr.Column(scale=3):
+                                 user_message = gr.Textbox(label="User")
+                                 assistant_message = gr.Textbox(label="Assistant")
+                         with gr.Row():
+                             add_demonstration_button = gr.Button("Add Example")
+                             generate_button = gr.Button(value="Generate", variant="primary")
+ 
+                         add_demonstration_button.click(
+                             fewshot_add_demonstration,
+                             [image_input, user_message, assistant_message, chat_bot, app_session],
+                             [image_input, user_message, assistant_message, chat_bot, app_session]
+                         )
+                         generate_button.click(
+                             fewshot_respond,
+                             [image_input, user_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
+                             [image_input, user_message, assistant_message, chat_bot, app_session]
+                         )
+ 
+                     chat_tab.select(
+                         select_chat_type,
+                         [chat_tab_label, app_session],
+                         [app_session]
+                     )
+                     chat_tab.select(
+                         clear,
+                         [txt_input, file_upload, chat_bot, app_session],
+                         # clear() returns nine values, so current_images must be in the outputs as well
+                         [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
+                     )
+                     fewshot_tab.select(
+                         select_chat_type,
+                         [fewshot_tab_label, app_session],
+                         [app_session]
+                     )
+                     fewshot_tab.select(
+                         clear,
+                         [txt_input, file_upload, chat_bot, app_session],
+                         [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
+                     )
+                     # chat_bot.flushed(flushed, outputs=[txt_input])  # the standard Chatbot may not support flushed
+ 
+                     params_form.change(
+                         update_streaming_mode_state,
+                         inputs=[params_form],
+                         outputs=[streaming_mode]
+                     )
+ 
+                     regenerate.click(
+                         regenerate_button_clicked,
+                         [txt_input, image_input, user_message, assistant_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
+                         [txt_input, image_input, user_message, assistant_message, chat_bot, app_session]
+                     )
+                     clear_button.click(
+                         clear,
+                         [txt_input, file_upload, chat_bot, app_session],
+                         [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
+                     )
+ 
+                     stop_button.click(
+                         stop_button_clicked,
+                         [app_session],
+                         [app_session, stop_button]
+                     )
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     # Parse command-line arguments
+     parser = argparse.ArgumentParser(description='Web Demo for MiniCPM-V 4.5')
+     parser.add_argument('--port', type=int, default=7860, help='Port to run the web demo on')
+     parser.add_argument('--no-parallel-encoding', action='store_true', help='Disable parallel image encoding')
+     parser.add_argument('--parallel-processes', type=int, default=None, help='Number of parallel processes for image encoding')
+     args = parser.parse_args()
+ 
+     # Configure parallel encoding
+     if args.no_parallel_encoding:
+         ENABLE_PARALLEL_ENCODING = False
+         print("[perf] Parallel image encoding disabled")
+     else:
+         ENABLE_PARALLEL_ENCODING = True
+         print("[perf] Parallel image encoding enabled")
+ 
+     if args.parallel_processes:
+         PARALLEL_PROCESSES = args.parallel_processes
+         print(f"[perf] Parallel process count set to {PARALLEL_PROCESSES}")
+     else:
+         print(f"[perf] Auto-detecting parallel process count; CPU cores: {mp.cpu_count()}")
+ 
+     # Initialize the model
+     initialize_model()
+ 
+     # Create and launch the app
+     demo = create_app()
+     demo.launch(
+         share=False,
+         debug=True,
+         show_api=False,
+         server_port=args.port,
+         server_name="0.0.0.0"
+     )
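For reference, the entry point above defines three flags via argparse; assuming the file is run directly, typical invocations look like:

    python app.py --port 7860
    python app.py --no-parallel-encoding
    python app.py --parallel-processes 4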
logging_util.py ADDED
@@ -0,0 +1,34 @@
+ import logging
+ import sys
+ import os
+ 
+ # set up the root logger
+ def setup_root_logger(log_level=logging.INFO, dist_rank=0, local_dir=''):
+     """
+     log_level: logging level
+     dist_rank: process rank for distributed training
+     local_dir: local log path; an empty string disables file logging
+     """
+     logger = logging.getLogger()  # configure the root logger for everything
+     for handler in list(logger.handlers):  # copy the list: removing while iterating skips handlers
+         logger.removeHandler(handler)
+     # create formatter
+     fmt = '[%(asctime)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
+     # color_fmt = colored('[%(asctime)s]', 'green') + \
+     #     colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s'
+ 
+     # create a console handler for the master process
+     if dist_rank == 0:
+         console_handler = logging.StreamHandler(sys.stdout)
+         console_handler.setFormatter(
+             logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+         logger.addHandler(console_handler)
+ 
+     # create a file handler
+     if local_dir:
+         os.makedirs(local_dir, exist_ok=True)
+         file_handler = logging.FileHandler(os.path.join(local_dir, f'log_rank{dist_rank}.log'), mode='a')
+         file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+         logger.addHandler(file_handler)
+ 
+     logger.setLevel(log_level)
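A minimal usage sketch for the helper above (the log directory name is an assumption):

    import logging
    from logging_util import setup_root_logger

    setup_root_logger(log_level=logging.INFO, dist_rank=0, local_dir='./logs')
    logging.getLogger(__name__).info("root logger configured")
    # rank 0 prints to stdout and all ranks append to ./logs/log_rank<rank>.log, e.g.:
    # [2025-01-01 12:00:00] (demo.py 5): INFO root logger configured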
models/__init__.py ADDED
@@ -0,0 +1 @@
+ from .minicpmv4_5 import ModelMiniCPMV4_5
models/minicpmv4_5.py ADDED
@@ -0,0 +1,158 @@
+ import spaces
+ from io import BytesIO
+ import torch
+ from PIL import Image
+ import base64
+ import json
+ import re
+ import logging
+ from transformers import AutoModel, AutoTokenizer, AutoProcessor, set_seed
+ # set_seed(42)
+ 
+ logger = logging.getLogger(__name__)
+ 
+ class ModelMiniCPMV4_5:
+     def __init__(self, path) -> None:
+         self.model = AutoModel.from_pretrained(
+             path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16, device_map="auto")
+         self.model.eval()
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             path, trust_remote_code=True)
+         self.processor = AutoProcessor.from_pretrained(
+             path, trust_remote_code=True)
+ 
+     def __call__(self, input_data):
+         image = None
+         if "image" in input_data and len(input_data["image"]) > 10:  # crude check for a non-trivial base64 payload
+             image = Image.open(BytesIO(base64.b64decode(
+                 input_data["image"]))).convert('RGB')
+ 
+         msgs = input_data["question"]
+         params = input_data.get("params", "{}")
+         params = json.loads(params)
+         msgs = json.loads(msgs)
+ 
+         temporal_ids = input_data.get("temporal_ids", None)
+         if temporal_ids:
+             temporal_ids = json.loads(temporal_ids)
+ 
+         if params.get("max_new_tokens", 0) > 16384:
+             logger.info("capping max_new_tokens at 16384 to save memory")
+             params["max_new_tokens"] = 16384
+         if params.get("max_inp_length", 0) > 2048 * 10:
+             logger.info(f"capping max_inp_length at {2048 * 10}, a high limit kept for video processing")
+             params["max_inp_length"] = 2048 * 10
+ 
+         for msg in msgs:
+             if 'content' in msg:
+                 contents = msg['content']
+             else:
+                 contents = msg.pop('contents')
+ 
+             new_cnts = []
+             for c in contents:
+                 if isinstance(c, dict):
+                     if c['type'] == 'text':
+                         c = c['pairs']
+                     elif c['type'] == 'image':
+                         c = Image.open(
+                             BytesIO(base64.b64decode(c["pairs"]))).convert('RGB')
+                     else:
+                         raise ValueError(
+                             "contents type only supports text and image.")
+                 new_cnts.append(c)
+             msg['content'] = new_cnts
+         logger.info(f'msgs: {str(msgs)}')
+ 
+         enable_thinking = params.pop('enable_thinking', True)
+         is_streaming = params.pop('stream', False)
+ 
+         if is_streaming:
+             return self._stream_chat(image, msgs, enable_thinking, params, temporal_ids)
+         else:
+             chat_kwargs = {
+                 "image": image,
+                 "msgs": msgs,
+                 "tokenizer": self.tokenizer,
+                 "processor": self.processor,
+                 "enable_thinking": enable_thinking,
+                 **params
+             }
+ 
+             if temporal_ids is not None:
+                 chat_kwargs["temporal_ids"] = temporal_ids
+ 
+             answer = self.model.chat(**chat_kwargs)
+ 
+             # Non-greedy match, so only individual <box>...</box> spans are dropped
+             res = re.sub(r'<box>.*?</box>', '', answer)
+             res = res.replace('<ref>', '')
+             res = res.replace('</ref>', '')
+             res = res.replace('<box>', '')
+             answer = res.replace('</box>', '')
+             if not enable_thinking:
+                 print(f"enable_thinking: {enable_thinking}")
+                 answer = answer.replace('</think>', '')
+ 
+             oids = self.tokenizer.encode(answer)
+             output_tokens = len(oids)
+             return answer, output_tokens
+ 
+     def _stream_chat(self, image, msgs, enable_thinking, params, temporal_ids=None):
+         try:
+             params['stream'] = True
+             chat_kwargs = {
+                 "image": image,
+                 "msgs": msgs,
+                 "tokenizer": self.tokenizer,
+                 "processor": self.processor,
+                 "enable_thinking": enable_thinking,
+                 **params
+             }
+             if temporal_ids is not None:
+                 chat_kwargs["temporal_ids"] = temporal_ids
+ 
+             answer_generator = self.model.chat(**chat_kwargs)
+ 
+             if not hasattr(answer_generator, '__iter__'):
+                 # Some code paths return a plain string; clean it and stream it char by char
+                 answer = answer_generator
+                 res = re.sub(r'<box>.*?</box>', '', answer)
+                 res = res.replace('<ref>', '')
+                 res = res.replace('</ref>', '')
+                 res = res.replace('<box>', '')
+                 answer = res.replace('</box>', '')
+                 if not enable_thinking:
+                     answer = answer.replace('</think>', '')
+ 
+                 char_count = 0
+                 for char in answer:
+                     yield char
+                     char_count += 1
+             else:
+                 full_answer = ""
+                 chunk_count = 0
+                 char_count = 0
+ 
+                 for chunk in answer_generator:
+                     if isinstance(chunk, str):
+                         clean_chunk = re.sub(r'<box>.*?</box>', '', chunk)
+                         clean_chunk = clean_chunk.replace('<ref>', '')
+                         clean_chunk = clean_chunk.replace('</ref>', '')
+                         clean_chunk = clean_chunk.replace('<box>', '')
+                         clean_chunk = clean_chunk.replace('</box>', '')
+ 
+                         if not enable_thinking:
+                             clean_chunk = clean_chunk.replace('</think>', '')
+ 
+                         full_answer += chunk
+                         char_count += len(clean_chunk)
+                         chunk_count += 1
+                         yield clean_chunk
+                     else:
+                         full_answer += str(chunk)
+                         char_count += len(str(chunk))
+                         chunk_count += 1
+                         yield str(chunk)
+ 
+         except Exception as e:
+             logger.error(f"Stream chat error: {e}")
+             yield f"Error: {str(e)}"
requirements.txt ADDED
@@ -0,0 +1,37 @@
+ huggingface_hub
+ # Core dependencies
+ spaces
+ gradio==4.44.1
+ torch==2.7.1
+ torchvision==0.22.1
+ numpy==2.2.6
+ pillow
+ scipy
+ pandas==2.3.1
+ 
+ # ML/AI dependencies
+ transformers==4.55.0
+ accelerate==1.9.0
+ einops==0.8.1
+ timm==1.0.19
+ safetensors==0.5.3
+ tokenizers==0.21.4
+ huggingface-hub==0.34.3
+ 
+ # Video processing
+ decord==0.6.0
+ ffmpy==0.6.1
+ pydub==0.25.1
+ 
+ # Utilities
+ requests==2.32.4
+ tqdm==4.67.1
+ PyYAML==6.0.2
+ psutil==7.0.0
+ pydantic==2.10.6
+ 
+ # Visualization
+ matplotlib==3.10.5
+ 
+ # Optional: For CUDA support (if needed)
+ # triton==3.3.1