tc-mb committed
Commit 7997f38 · verified · 1 Parent(s): 165abc3

Upload 5 files

Files changed (5)
  1. app.py +2035 -0
  2. logging_util.py +34 -0
  3. models/__init__.py +1 -0
  4. models/minicpmv4_5.py +158 -0
  5. requirements.txt +37 -0
app.py ADDED
@@ -0,0 +1,2035 @@
+ import os
+ import json
+ import uuid
+ import time
+ import copy
+ import base64
+ import logging
+ import argparse
+ import math
+ import multiprocessing as mp
+ from io import BytesIO
+ from typing import Generator, Any, Dict, Optional
+
+ import spaces
+ import torch
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from decord import VideoReader, cpu
+ from scipy.spatial import cKDTree
+ # import modelscope_studio as mgr
+
+ # Import model-related modules
+ try:
+     from models import ModelMiniCPMV4_5
+ except ImportError:
+     print("Warning: models module not found. Please ensure models.py is available.")
+     class ModelMiniCPMV4_5:
+         def __init__(self, model_path):
+             self.model_path = model_path
+             self.model = None
+
+         def __call__(self, query):
+             return "Model not loaded", 0
+
+ # Global configuration
+ ERROR_MSG = "Error, please retry"
+ model_name = 'MiniCPM-V 4.5'
+ disable_text_only = False  # allow text-only messages, for easier testing
+ DOUBLE_FRAME_DURATION = 30
+ MAX_NUM_FRAMES = 180
+ MAX_NUM_PACKING = 3
+ TIME_SCALE = 0.1
+ IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
+ VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
+
+ ENABLE_PARALLEL_ENCODING = True
+ PARALLEL_PROCESSES = None
+
+ # Global model instance
+ global_model = None
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ # Global model configuration
+ model_config = {
+     'model_path': None,
+     'model_type': None,
+     'instance_id': 0
+ }
+
+ # Global model cache (lives in the GPU process)
+ _gpu_model_cache = None
+
+ def _initialize_gpu_model():
+     """Fetch the model in the GPU process and move it to the GPU."""
+     global _gpu_model_cache
+     if _gpu_model_cache is None:
+         logger.info(f"Initializing model in GPU process: {model_config['model_type']}")
+
+         match model_config['model_type'].lower():
+             case 'minicpmv4_5':
+                 _gpu_model_cache = ModelMiniCPMV4_5(model_config['model_path'])
+             case _:
+                 raise ValueError(f"Unsupported model type: {model_config['model_type']}")
+
+         logger.info("Model initialized on CPU")
+
+     # Move the model to the GPU on every inference call
+     if hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'to'):
+         logger.info("Moving model to GPU...")
+         _gpu_model_cache.model.to('cuda')
+     elif hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'model') and hasattr(_gpu_model_cache.model.model, 'to'):
+         logger.info("Moving model to GPU (nested model)...")
+         _gpu_model_cache.model.model.to('cuda')
+
+     return _gpu_model_cache
+
+ @spaces.GPU
+ def gpu_handler(query):
+     """GPU inference handler."""
+     model = _initialize_gpu_model()
+
+     res, output_tokens = model({
+         "image": query["image"],
+         "question": query["question"],
+         "params": query.get("params", "{}"),
+         "temporal_ids": query.get("temporal_ids", None)
+     })
+     return {
+         "result": res,
+         "usage": {"output_tokens": output_tokens}
+     }
+
+ @spaces.GPU
+ def gpu_stream_handler(query):
+     """GPU streaming inference handler."""
+     model = _initialize_gpu_model()
+
+     params = json.loads(query.get("params", "{}"))
+     params["stream"] = True
+     query["params"] = json.dumps(params)
+
+     try:
+         generator = model({
+             "image": query["image"],
+             "question": query["question"],
+             "params": query["params"],
+             "temporal_ids": query.get("temporal_ids", None)
+         })
+
+         # Collect all generator output to avoid serialization issues
+         full_response = ""
+         for chunk in generator:
+             full_response += chunk
+
+         return full_response
+     except Exception as e:
+         logger.error(f"GPU stream handler error: {e}")
+         return f"Stream error: {str(e)}"
+
+ class Model:
+     """Model wrapper class; does not hold the actual model object."""
+
+     def __init__(self, model_path: str, model_type: str, instance_id: int = 0):
+         self.instance_id = instance_id
+         self.model_path = model_path
+         self.model_type = model_type
+
+         # Set the global configuration
+         model_config['model_path'] = model_path
+         model_config['model_type'] = model_type
+         model_config['instance_id'] = instance_id
+
+         logger.info(f"Instance {instance_id}: configured model type {model_type}")
+         logger.info(f"Instance {instance_id}: model path {model_path}")
+
+     def handler(self, query):
+         """Non-streaming inference handler."""
+         return gpu_handler(query)
+
+     def stream_handler(self, query):
+         """Streaming inference handler."""
+         return gpu_stream_handler(query)
+
+
+ def initialize_model():
+     """Initialize the global model."""
+     global global_model, _gpu_model_cache
+
+     # Default configuration
+     model_path = os.getenv('MODEL_PATH', 'openbmb/MiniCPM-V-4_5')
+     model_type = os.getenv('MODEL_TYPE', 'minicpmv4_5')
+
+     logger.info("=" * 50)
+     logger.info("Starting the MiniCPM-V service")
+     logger.info(f"Model path: {model_path}")
+     logger.info(f"Model type: {model_type}")
+     logger.info("=" * 50)
+
+     # Create the model wrapper
+     global_model = Model(model_path, model_type, 0)
+
+     # Preload the model on CPU in the main process (optional, for faster first inference)
+     try:
+         logger.info("Preloading model on CPU in the main process...")
+         match model_type.lower():
+             case 'minicpmv4_5':
+                 _gpu_model_cache = ModelMiniCPMV4_5(model_path)
+             case _:
+                 raise ValueError(f"Unsupported model type: {model_type}")
+
+         logger.info("Model preloaded on CPU in the main process")
+     except Exception as e:
+         logger.warning(f"Preloading in the main process failed; the model will be loaded in the GPU process: {e}")
+         _gpu_model_cache = None
+
+     return global_model
+
+
+ # Utility functions
+ def get_file_extension(filename):
+     return os.path.splitext(filename)[1].lower()
+
+
+ def is_image(filename):
+     return get_file_extension(filename) in IMAGE_EXTENSIONS
+
+
+ def is_video(filename):
+     return get_file_extension(filename) in VIDEO_EXTENSIONS
+
+
+ def map_to_nearest_scale(values, scale):
+     tree = cKDTree(np.asarray(scale)[:, None])
+     _, indices = tree.query(np.asarray(values)[:, None])
+     return np.asarray(scale)[indices]
+
+
+ def group_array(arr, size):
+     return [arr[i:i+size] for i in range(0, len(arr), size)]
+
+
+ def encode_image(image):
+     """Encode a single image."""
+     if not isinstance(image, Image.Image):
+         if hasattr(image, 'path'):
+             image = Image.open(image.path)
+         elif hasattr(image, 'file') and hasattr(image.file, 'path'):
+             image = Image.open(image.file.path)
+         elif hasattr(image, 'name'):
+             image = Image.open(image.name)
+         else:
+             image_path = getattr(image, 'url', getattr(image, 'orig_name', str(image)))
+             image = Image.open(image_path)
+
+     # Resize the image
+     max_size = 448 * 16
+     if max(image.size) > max_size:
+         w, h = image.size
+         if w > h:
+             new_w = max_size
+             new_h = int(h * max_size / w)
+         else:
+             new_h = max_size
+             new_w = int(w * max_size / h)
+         image = image.resize((new_w, new_h), resample=Image.BICUBIC)
+
+     # Convert to base64
+     buffered = BytesIO()
+     image.save(buffered, format="png")
+     im_b64 = base64.b64encode(buffered.getvalue()).decode()
+     return [{"type": "image", "pairs": im_b64}]
+
+
+ def encode_image_parallel(image_data):
+     """Wrapper for parallel image encoding."""
+     try:
+         return encode_image(image_data)
+     except Exception as e:
+         print(f"[Parallel encoding error] Image encoding failed: {e}")
+         return None
+
+
+ def encode_images_parallel(frames, num_processes=None):
+     """Multi-process parallel image encoding."""
+     if not ENABLE_PARALLEL_ENCODING:
+         print("[Parallel encoding] Parallel encoding disabled, using serial processing")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+         return encoded_frames
+
+     if num_processes is None:
+         cpu_cores = mp.cpu_count()
+         if PARALLEL_PROCESSES:
+             num_processes = PARALLEL_PROCESSES
+         else:
+             if len(frames) >= 50:
+                 num_processes = min(cpu_cores, len(frames), 32)
+             elif len(frames) >= 20:
+                 num_processes = min(cpu_cores, len(frames), 16)
+             else:
+                 num_processes = min(cpu_cores, len(frames), 8)
+
+     print(f"[Parallel encoding] Starting parallel encoding of {len(frames)} frame images, using {num_processes} processes")
+
+     if len(frames) <= 2:
+         print(f"[Parallel encoding] Few images ({len(frames)} frames), using serial processing")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+         return encoded_frames
+
+     start_time = time.time()
+     try:
+         with mp.Pool(processes=num_processes) as pool:
+             results = pool.map(encode_image_parallel, frames)
+
+         encoded_frames = []
+         for result in results:
+             if result:
+                 encoded_frames.extend(result)
+
+         total_time = time.time() - start_time
+         print(f"[Parallel encoding] Parallel encoding completed, total time: {total_time:.3f}s, encoded {len(encoded_frames)} images")
+
+         return encoded_frames
+
+     except Exception as e:
+         print(f"[Parallel encoding] Parallel processing failed, falling back to serial processing: {e}")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+         return encoded_frames
+
+
+ def encode_video(video, choose_fps=None):
+     """Encode a video file."""
+     def uniform_sample(l, n):
+         gap = len(l) / n
+         idxs = [int(i * gap + gap / 2) for i in range(n)]
+         return [l[i] for i in idxs]
+
+     if hasattr(video, 'path'):
+         video_path = video.path
+     elif hasattr(video, 'file') and hasattr(video.file, 'path'):
+         video_path = video.file.path
+     elif hasattr(video, 'name'):
+         video_path = video.name
+     else:
+         video_path = getattr(video, 'url', getattr(video, 'orig_name', str(video)))
+
+     vr = VideoReader(video_path, ctx=cpu(0))
+     fps = vr.get_avg_fps()
+     video_duration = len(vr) / fps
+
+     frame_idx = [i for i in range(0, len(vr))]
+
+     effective_fps = choose_fps if choose_fps else 1
+
+     if video_duration < DOUBLE_FRAME_DURATION and effective_fps <= 5:
+         effective_fps = effective_fps * 2
+         packing_nums = 2
+         choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
+     elif effective_fps * int(video_duration) <= MAX_NUM_FRAMES:
+         packing_nums = 1
+         choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
+     else:
+         packing_size = math.ceil(video_duration * effective_fps / MAX_NUM_FRAMES)
+         if packing_size <= MAX_NUM_PACKING:
+             choose_frames = round(video_duration * effective_fps)
+             packing_nums = packing_size
+         else:
+             choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
+             packing_nums = MAX_NUM_PACKING
+
+     choose_idx = choose_frames
+
+     frame_idx = np.array(uniform_sample(frame_idx, choose_idx))
+     frames = vr.get_batch(frame_idx).asnumpy()
+
+     frame_idx_ts = frame_idx / fps
+     scale = np.arange(0, video_duration, TIME_SCALE)
+
+     frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
+     frame_ts_id = frame_ts_id.astype(np.int32)
+
+     assert len(frames) == len(frame_ts_id)
+
+     frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
+     frame_ts_id_group = group_array(frame_ts_id.tolist(), packing_nums)
+
+     print(f"[Performance] Starting image encoding, total {len(frames)} frames")
+
+     if ENABLE_PARALLEL_ENCODING:
+         print(f"[Image encoding] Using multi-process parallel encoding, CPU cores: {mp.cpu_count()}")
+         encoded_frames = encode_images_parallel(frames, PARALLEL_PROCESSES)
+     else:
+         print("[Warning] Parallel encoding disabled, using serial processing")
+         encoded_frames = []
+         for frame in frames:
+             encoded = encode_image(frame)
+             if encoded:
+                 encoded_frames.extend(encoded)
+
+     return encoded_frames, frame_ts_id_group
+
+
+ # Response processing functions
+ def parse_thinking_response(response_text):
+     """Parse response text containing <think> tags; supports streaming parsing."""
+     import re
+
+     # Match complete thinking tags
+     complete_think_pattern = r'<think>(.*?)</think>'
+     thinking_matches = re.findall(complete_think_pattern, response_text, re.DOTALL)
+
+     if thinking_matches:
+         # Complete thinking tags found
+         thinking_content = "\n\n".join(thinking_matches).strip()
+         print("thinking_content---:", thinking_content)
+         formal_answer = re.sub(complete_think_pattern, '', response_text, flags=re.DOTALL).strip()
+         return thinking_content, formal_answer
+     else:
+         # Check for an unfinished thinking tag
+         partial_think_match = re.search(r'<think>(.*?)$', response_text, re.DOTALL)
+         if partial_think_match:
+             # Opening tag without a closing tag: the thinking content is still being emitted.
+             # Return a special marker indicating thinking is in progress.
+             return "STREAMING", ""
+         else:
+             # No thinking tags; return the original text as the formal answer
+             return "", response_text.strip()
+
+
+ def parse_thinking_response_for_final(response_text):
+     """Final parse of the thinking response, used for formatting on completion."""
+     import re
+
+     # First try to match complete thinking tags
+     think_pattern = r'<think>(.*?)</think>'
+     thinking_matches = re.findall(think_pattern, response_text, re.DOTALL)
+
+     if thinking_matches:
+         thinking_content = "\n\n".join(thinking_matches).strip()
+         formal_answer = re.sub(think_pattern, '', response_text, flags=re.DOTALL).strip()
+         print(f"[parse_final] Found complete thinking tags, thinking length: {len(thinking_content)}, answer length: {len(formal_answer)}")
+     else:
+         # No complete tags; check for an unclosed <think> tag
+         if '<think>' in response_text:
+             think_start = response_text.find('<think>')
+             if think_start != -1:
+                 # Extract the thinking content (from after <think> to the end of the string)
+                 thinking_content = response_text[think_start + 7:].strip()  # skip past "<think>"
+                 # formal_answer is the content before <think>
+                 formal_answer = response_text[:think_start].strip()
+
+                 # If formal_answer is empty, the whole response is thinking content
+                 if not formal_answer:
+                     formal_answer = ""  # no formal answer
+
+                 print("[parse_final] Found an unclosed thinking tag")
+                 print(f"[parse_final] thinking content: '{thinking_content[:100]}...'")
+                 print(f"[parse_final] formal_answer: '{formal_answer[:100]}...'")
+             else:
+                 thinking_content = ""
+                 formal_answer = response_text.strip()
+                 print(f"[parse_final] No thinking tags, answer length: {len(formal_answer)}")
+         else:
+             thinking_content = ""
+             formal_answer = response_text.strip()
+             print(f"[parse_final] No thinking tags, answer length: {len(formal_answer)}")
+
+     return thinking_content, formal_answer
+
+
+ def normalize_text_for_html(text):
+     """Lightweight text normalization."""
+     import re
+
+     if not text:
+         return ""
+
+     text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
+     lines = [line.strip() for line in text.split("\n")]
+     text = "\n".join(lines)
+     text = text.strip()
+     return text
+
+
+ def format_response_with_thinking(thinking_content, formal_answer):
+     """Format a response that includes a thinking process."""
+     print(f"[format_thinking] thinking_content length: {len(thinking_content) if thinking_content else 0}")
+     print(f"[format_thinking] formal_answer length: {len(formal_answer) if formal_answer else 0}")
+     print(f"[format_thinking] First 100 chars of thinking_content: '{thinking_content[:100] if thinking_content else 'None'}...'")
+     print(f"[format_thinking] First 100 chars of formal_answer: '{formal_answer[:100] if formal_answer else 'None'}...'")
+
+     # Check whether the content is empty
+     if not thinking_content and not formal_answer:
+         print("[format_thinking] Warning: both thinking_content and formal_answer are empty!")
+     elif not formal_answer:
+         print("[format_thinking] Warning: formal_answer is empty!")
+     elif not thinking_content:
+         print("[format_thinking] Note: thinking_content is empty; using the simplified format")
+
+     # Add a unique ID to force the frontend to re-render
+     import uuid
+     unique_id = uuid.uuid4().hex[:8]
+
+     # If there is thinking content, show the full thinking layout
+     if thinking_content and thinking_content.strip():
+         formatted_response = f"""
+ <div class="response-container" id="response-{unique_id}">
+     <div class="thinking-section">
+         <div class="thinking-header">🤔 think</div>
+         <div class="thinking-content">{thinking_content}</div>
+     </div>
+     <div class="formal-section">
+         <div class="formal-header">💡 answer</div>
+         <div class="formal-content">{formal_answer if formal_answer else '(no formal answer)'}</div>
+     </div>
+ </div>
+ """
+     else:
+         # No thinking content; show the answer directly
+         content_to_show = formal_answer if formal_answer and formal_answer.strip() else "(empty answer)"
+         formatted_response = f"""
+ <div class="response-container" id="response-{unique_id}">
+     <div class="formal-section">
+         <div class="formal-content">{content_to_show}</div>
+     </div>
+ </div>
+ """
+
+     return "\n" + formatted_response.strip() + "\n"
+
+
+ def check_mm_type(mm_file):
+     """Check the multimedia file type."""
+     if hasattr(mm_file, 'path'):
+         path = mm_file.path
+     elif hasattr(mm_file, 'file') and hasattr(mm_file.file, 'path'):
+         path = mm_file.file.path
+     elif hasattr(mm_file, 'name'):
+         path = mm_file.name
+     else:
+         path = getattr(mm_file, 'url', getattr(mm_file, 'orig_name', str(mm_file)))
+
+     if is_image(path):
+         return "image"
+     if is_video(path):
+         return "video"
+     return None
+
+
+ def encode_mm_file(mm_file, choose_fps=None):
+     """Encode a multimedia file."""
+     if check_mm_type(mm_file) == 'image':
+         return encode_image(mm_file), None
+     if check_mm_type(mm_file) == 'video':
+         encoded_frames, frame_ts_id_group = encode_video(mm_file, choose_fps)
+         return encoded_frames, frame_ts_id_group
+     return None, None
+
+
+ def encode_message(_question, choose_fps=None):
+     """Encode a message."""
+     import re
+
+     files = _question.files if _question.files else []
+     question = _question.text if _question.text else ""
+     message = []
+     temporal_ids = []
+
+     # Check whether the legacy placeholder format is used
+     pattern = r"\[mm_media\]\d+\[/mm_media\]"
+     if re.search(pattern, question):
+         # Legacy format: placeholders
+         matches = re.split(pattern, question)
+
+         if len(matches) != len(files) + 1:
+             gr.Warning("Number of images does not match the placeholders in the text; please refresh the page to restart!")
+             # Handle the mismatch instead of asserting
+             if len(matches) > len(files) + 1:
+                 matches = matches[:len(files) + 1]
+             else:
+                 while len(matches) < len(files) + 1:
+                     matches.append("")
+
+         text = matches[0].strip()
+         if text:
+             message.append({"type": "text", "pairs": text})
+
+         for i in range(len(files)):
+             encoded_content, frame_ts_id_group = encode_mm_file(files[i], choose_fps)
+             if encoded_content:
+                 message += encoded_content
+             if frame_ts_id_group:
+                 temporal_ids.extend(frame_ts_id_group)
+
+             if i + 1 < len(matches):
+                 text = matches[i + 1].strip()
+                 if text:
+                     message.append({"type": "text", "pairs": text})
+     else:
+         # New format: plain text + file list
+         if question.strip():
+             message.append({"type": "text", "pairs": question.strip()})
+
+         for file in files:
+             encoded_content, frame_ts_id_group = encode_mm_file(file, choose_fps)
+             if encoded_content:
+                 message += encoded_content
+             if frame_ts_id_group:
+                 temporal_ids.extend(frame_ts_id_group)
+
+     return message, temporal_ids if temporal_ids else None
+
+
+ def check_has_videos(_question):
+     """Count the images and videos in the message."""
+     images_cnt = 0
+     videos_cnt = 0
+     files = _question.files if _question.files else []
+     for file in files:
+         if check_mm_type(file) == "image":
+             images_cnt += 1
+         else:
+             videos_cnt += 1
+     return images_cnt, videos_cnt
+
+
+ def save_media_to_persistent_cache(_question, session_id):
+     """Save images and videos to the persistent cache and return the saved path info."""
+     import os
+     import shutil
+     import uuid
+     from pathlib import Path
+
+     files = _question.files if _question.files else []
+     saved_media = []
+
+     # Create a session-specific media cache directory
+     cache_dir = Path("./media_cache") / session_id
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     for file in files:
+         file_type = check_mm_type(file)
+         if file_type in ["image", "video"]:
+             try:
+                 # Get the original file path
+                 original_path = None
+                 if hasattr(file, 'name'):
+                     original_path = file.name
+                 elif hasattr(file, 'path'):
+                     original_path = file.path
+                 elif hasattr(file, 'file') and hasattr(file.file, 'path'):
+                     original_path = file.file.path
+                 else:
+                     continue
+
+                 if original_path and os.path.exists(original_path):
+                     # Generate a unique filename
+                     file_ext = os.path.splitext(original_path)[1]
+                     prefix = "img" if file_type == "image" else "vid"
+                     unique_filename = f"{prefix}_{uuid.uuid4().hex[:8]}{file_ext}"
+                     cached_path = cache_dir / unique_filename
+
+                     # Copy the file into the cache directory
+                     shutil.copy2(original_path, cached_path)
+
+                     saved_media.append({
+                         'type': file_type,
+                         'original_path': original_path,
+                         'cached_path': str(cached_path),
+                         'filename': unique_filename
+                     })
+                     print(f"[save_media_to_persistent_cache] {file_type} saved: {cached_path}")
+             except Exception as e:
+                 print(f"[save_media_to_persistent_cache] Failed to save {file_type}: {e}")
+                 continue
+
+     return saved_media
+
+
+ def format_user_message_with_files(_question, session_id=None):
+     """Format a user message with attached files, supporting image and video display."""
+     user_text = _question.text if _question.text else ""
+     files = _question.files if _question.files else []
+
+     if not files:
+         return user_text, []
+
+     # Save media files to the persistent cache
+     saved_media = []
+     if session_id:
+         saved_media = save_media_to_persistent_cache(_question, session_id)
+
+     if len(files) == 1:
+         file = files[0]
+         file_type = check_mm_type(file)
+
+         # If it is an image or video and was saved to the cache
+         if file_type in ["image", "video"] and saved_media:
+             media_info = saved_media[0]
+             if file_type == "image":
+                 if user_text:
+                     return f"🖼️ {user_text}", saved_media
+                 else:
+                     return "🖼️ Image", saved_media
+             elif file_type == "video":
+                 if user_text:
+                     return f"🎬 {user_text}", saved_media
+                 else:
+                     return "🎬 Video", saved_media
+         else:
+             # Other file types: use a text description
+             return f"[1 file uploaded] {user_text}", saved_media
+     else:
+         # Multiple files: count each type
+         image_count = len([m for m in saved_media if m['type'] == 'image'])
+         video_count = len([m for m in saved_media if m['type'] == 'video'])
+         other_count = len(files) - image_count - video_count
+
+         # Build the description text
+         parts = []
+         if image_count > 0:
+             parts.append(f"{image_count} image{'s' if image_count > 1 else ''}")
+         if video_count > 0:
+             parts.append(f"{video_count} video{'s' if video_count > 1 else ''}")
+         if other_count > 0:
+             parts.append(f"{other_count} other file{'s' if other_count > 1 else ''}")
+
+         if parts:
+             files_desc = ", ".join(parts)
+             return f"[{files_desc} uploaded] {user_text}", saved_media
+         else:
+             return f"[{len(files)} files uploaded] {user_text}", saved_media
+
+
+ def update_media_gallery(app_session):
+     """Update the media gallery display (images and videos)."""
+     import os
+     media_cache = app_session.get('media_cache', [])
+
+     if not media_cache:
+         return gr.update(value=[], visible=False)
+
+     # Collect the paths of all cached media files (both images and videos are supported)
+     media_paths = [media_info['cached_path'] for media_info in media_cache if os.path.exists(media_info['cached_path'])]
+
+     if media_paths:
+         return gr.update(value=media_paths, visible=True)
+     else:
+         return gr.update(value=[], visible=False)
+
+
+ def format_fewshot_user_message(image_path, user_text):
+     """Format a FewShot user message, supporting image display."""
+     if image_path and user_text:
+         return (user_text, image_path)
+     elif image_path:
+         return ("", image_path)
+     else:
+         return user_text
+
+
+ # Main chat functions
+ def chat_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
+     """Call the model directly for chat (non-streaming)."""
+     default_params = {"num_beams": 3, "repetition_penalty": 1.2, "max_new_tokens": 16284}
+     if params is None:
+         params = default_params
+
+     use_streaming = params.get('stream', False)
+
+     if use_streaming:
+         return chat_stream_direct(img_b64, msgs, ctx, params, vision_hidden_states, temporal_ids, session_id)
+     else:
+         # Build the request payload
+         query = {
+             "image": img_b64,
+             "question": json.dumps(msgs, ensure_ascii=True),
+             "params": json.dumps(params, ensure_ascii=True),
+         }
+
+         if temporal_ids:
+             query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
+
+         if session_id:
+             query["session_id"] = session_id
+
+         try:
+             # Call the model directly
+             result = global_model.handler(query)
+             raw_result = result['result']
+
+             # Clean up the result
+             import re
+             cleaned_result = re.sub(r'(<box>.*</box>)', '', raw_result)
+             cleaned_result = cleaned_result.replace('<ref>', '')
+             cleaned_result = cleaned_result.replace('</ref>', '')
+             cleaned_result = cleaned_result.replace('<box>', '')
+             cleaned_result = cleaned_result.replace('</box>', '')
+
+             # Parse the thinking process
+             thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
+             thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+             formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+             formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+
+             context_result = formal_answer_raw if formal_answer_raw else cleaned_result
+             return 0, formatted_result, context_result, None
+
+         except Exception as e:
+             print(f"Chat error: {e}")
+             import traceback
+             traceback.print_exc()
+             return -1, ERROR_MSG, None, None
+
+
+ def chat_stream_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
+     """Call the model directly for streaming chat."""
+     try:
+         # Build the request payload
+         query = {
+             "image": img_b64,
+             "question": json.dumps(msgs, ensure_ascii=True),
+             "params": json.dumps(params, ensure_ascii=True),
+         }
+
+         if temporal_ids:
+             query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
+
+         if session_id:
+             query["session_id"] = session_id
+
+         # Call the streaming handler directly
+         generator = global_model.stream_handler(query)
+
+         full_response = ""
+         for chunk in generator:
+             full_response += chunk
+
+         if not full_response:
+             return -1, ERROR_MSG, None, None
+
+         # Clean up the result
+         import re
+         cleaned_result = re.sub(r'(<box>.*</box>)', '', full_response)
+         cleaned_result = cleaned_result.replace('<ref>', '')
+         cleaned_result = cleaned_result.replace('</ref>', '')
+         cleaned_result = cleaned_result.replace('<box>', '')
+         cleaned_result = cleaned_result.replace('</box>', '')
+
+         # Parse the thinking process
+         thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
+         thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+         formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+         formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+
+         context_result = formal_answer_raw if formal_answer_raw else cleaned_result
+         return 0, formatted_result, context_result, None
+
+     except Exception as e:
+         print(f"Stream chat error: {e}")
+         import traceback
+         traceback.print_exc()
+         return -1, ERROR_MSG, None, None
+
+
+ def chat_stream_character_generator(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, stop_control=None, session_id=None):
+     """Character-level streaming generator."""
+     print("[chat_stream_character_generator] Starting character-level streaming")
+     print(f"[chat_stream_character_generator] stop_control: {stop_control}")
+
+     try:
+         # Build the request payload
+         query = {
+             "image": img_b64,
+             "question": json.dumps(msgs, ensure_ascii=True),
+             "params": json.dumps(params, ensure_ascii=True),
+         }
+
+         if temporal_ids:
+             query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
+
+         if session_id:
+             query["session_id"] = session_id
+
+         # Call the streaming handler - it now returns the full response rather than a generator
+         full_response = global_model.stream_handler(query)
+
+         # Clean up the response
+         import re
+         clean_response = re.sub(r'(<box>.*</box>)', '', full_response)
+         clean_response = clean_response.replace('<ref>', '')
+         clean_response = clean_response.replace('</ref>', '')
+         clean_response = clean_response.replace('<box>', '')
+         clean_response = clean_response.replace('</box>', '')
+
+         # Yield character by character to simulate streaming output
+         char_count = 0
+         for char in clean_response:
+             # Check the stop flag
+             if stop_control and stop_control.get('stop_streaming', False):
+                 print(f"[chat_stream_character_generator] *** Stop signal received at character {char_count} ***")
+                 break
+
+             char_count += 1
+             if char_count % 10 == 0:
+                 print(f"[chat_stream_character_generator] {char_count} characters emitted, stop_flag: {stop_control.get('stop_streaming', False) if stop_control else 'None'}")
+
+             yield char
+
+             # Small delay to simulate a streaming effect
+             import time
+             time.sleep(0.01)
+
+         print(f"[chat_stream_character_generator] Streaming finished, {char_count} characters emitted in total")
+
+     except Exception as e:
+         print(f"[chat_stream_character_generator] Exception: {e}")
+         error_msg = f"Stream error: {str(e)}"
+         for char in error_msg:
+             yield char
+
+
+ # UI component factory functions
+ def create_component(params, comp='Slider'):
+     if comp == 'Slider':
+         return gr.Slider(
+             minimum=params['minimum'],
+             maximum=params['maximum'],
+             value=params['value'],
+             step=params['step'],
+             interactive=params['interactive'],
+             label=params['label']
+         )
+     elif comp == 'Radio':
+         return gr.Radio(
+             choices=params['choices'],
+             value=params['value'],
+             interactive=params['interactive'],
+             label=params['label']
+         )
+     elif comp == 'Button':
+         return gr.Button(
+             value=params['value'],
+             interactive=True
+         )
+     elif comp == 'Checkbox':
+         return gr.Checkbox(
+             value=params['value'],
+             interactive=params['interactive'],
+             label=params['label'],
+             info=params.get('info', None)
+         )
+
+
+ def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
+     # Use standard Gradio components instead of MultimodalInput, with preview support
+     return gr.File(
+         file_count="multiple",
+         file_types=["image", "video"],
+         label="Upload Images/Videos",
+         interactive=not (upload_image_disabled and upload_video_disabled),
+         show_label=True,
+         height=200  # set a height so previews are visible
+     )
+
+
+ # UI control functions
+ def update_streaming_mode_state(params_form):
+     """Update the streaming-mode state based on the decode type."""
+     if params_form == 'Beam Search':
+         return gr.update(value=False, interactive=False, info="Beam Search mode does not support streaming output")
+     else:
+         return gr.update(value=True, interactive=True, info="Enable real-time streaming response")
+
+
+ def stop_streaming(_app_cfg):
+     """Stop streaming output."""
+     _app_cfg['stop_streaming'] = True
+     print("[stop_streaming] Set stop flag to True")
+     return _app_cfg
+
+
+ def reset_stop_flag(_app_cfg):
+     """Reset the stop flag."""
+     _app_cfg['stop_streaming'] = False
+     print("[reset_stop_flag] Reset stop flag to False")
+     return _app_cfg
+
+
+ def check_and_handle_stop(_app_cfg, context="unknown"):
+     """Check the stop flag."""
+     should_stop = _app_cfg.get('stop_streaming', False)
+     is_streaming = _app_cfg.get('is_streaming', False)
+
+     if should_stop:
+         print(f"[check_and_handle_stop] *** Stop signal detected at {context} ***")
+         print(f"[check_and_handle_stop] stop_streaming: {should_stop}, is_streaming: {is_streaming}")
+         return True
+     return False
+
+
+ def stop_button_clicked(_app_cfg):
+     """Handle a stop-button click."""
+     print("[stop_button_clicked] *** Stop button clicked ***")
+     print(f"[stop_button_clicked] Current state - is_streaming: {_app_cfg.get('is_streaming', False)}")
+     print(f"[stop_button_clicked] Current state - stop_streaming: {_app_cfg.get('stop_streaming', False)}")
+
+     _app_cfg['stop_streaming'] = True
+     _app_cfg['is_streaming'] = False
+     print("[stop_button_clicked] Set stop_streaming = True, is_streaming = False")
+
+     return _app_cfg, gr.update(visible=False)
+
+
+ # Main response functions
+ def respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+     """Streaming response generator."""
+     print(f"[respond_stream] Called with streaming_mode: {streaming_mode}, fps_setting: {fps_setting}")
+
+     _app_cfg['is_streaming'] = True
+     _app_cfg['stop_streaming'] = False
+
+     if params_form == 'Beam Search':
+         streaming_mode = False
+         print("[respond_stream] Beam Search mode; streaming forcibly disabled")
+         _app_cfg['is_streaming'] = False
+
+     _context = _app_cfg['ctx'].copy()
+     encoded_message, temporal_ids = encode_message(_question, fps_setting)
+     _context.append({'role': 'user', 'contents': encoded_message})
+
+     images_cnt = _app_cfg['images_cnt']
+     videos_cnt = _app_cfg['videos_cnt']
+     files_cnts = check_has_videos(_question)
+
+     if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
+         gr.Warning("Only a single video file is supported right now!")
+         yield create_multimodal_input(True, True), _chat_bot, _app_cfg, gr.update(visible=False)
+         return
+
+     if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
+         gr.Warning("Please chat with at least one image or video.")
+         yield create_multimodal_input(False, False), _chat_bot, _app_cfg, gr.update(visible=False)
+         return
+
+     if params_form == 'Beam Search':
+         params = {
+             'sampling': False,
+             'num_beams': 3,
+             'repetition_penalty': 1.2,
+             "max_new_tokens": 16284,
+             "enable_thinking": thinking_mode,
+             "stream": False
+         }
+     else:
+         params = {
+             'sampling': True,
+             'top_p': 0.8,
+             'top_k': 100,
+             'temperature': 0.7,
+             'repetition_penalty': 1.03,
+             "max_new_tokens": 16284,
+             "enable_thinking": thinking_mode,
+             "stream": streaming_mode
+         }
+
+     if files_cnts[1] + videos_cnt > 0:
+         params["max_inp_length"] = 2048 * 10
+         params["use_image_id"] = False
+         params["max_slice_nums"] = 1
+
+     images_cnt += files_cnts[0]
+     videos_cnt += files_cnts[1]
+
+     # Build the user-message display (streaming mode)
+     user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
+
+     # Save media info into the session state
+     if saved_images:
+         if 'media_cache' not in _app_cfg:
+             _app_cfg['media_cache'] = []
+         _app_cfg['media_cache'].extend(saved_images)
+
+     _chat_bot.append((user_message, ""))
+     _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})
+
+     gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])
+
+     upload_image_disabled = videos_cnt > 0
+     upload_video_disabled = videos_cnt > 0 or images_cnt > 0
+
+     yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
+
+     print("[respond_stream] Starting the character-level streaming loop")
+     char_count = 0
+     accumulated_content = ""
+
+     for _char in gen:
+         char_count += 1
+
+         if check_and_handle_stop(_app_cfg, f"character {char_count}"):
+             break
+
+         accumulated_content += _char
+         _context[-1]["contents"][0]["pairs"] += _char
+
+         # Show content in real time (thinking mode is also shown live)
+         if thinking_mode:
+             # Try to parse the accumulated content so far
+             thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)
+
+             # If complete thinking content was parsed, use the formatted display
+             if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
+                 thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+                 formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+                 formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+                 _chat_bot[-1] = (user_message, formatted_display)
+             else:
+                 # Thinking in progress or no complete tags yet; show the raw content directly (live streaming)
+                 _chat_bot[-1] = (user_message, accumulated_content)
+         else:
+             # Non-thinking mode: show the accumulated content directly
+             _chat_bot[-1] = (user_message, accumulated_content)
+
+         if char_count % 5 == 0:  # update more frequently for a smoother streaming experience
+             print(f"[respond_stream] {char_count} characters processed, stop_flag: {_app_cfg.get('stop_streaming', False)}")
+             yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
+             time.sleep(0.02)  # slightly longer delay to avoid overly frequent updates
+         else:
+             yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
+
+     if _app_cfg.get('stop_streaming', False):
+         print("[respond_stream] Streaming stopped")
+
+     # Final thinking formatting
+     final_content = accumulated_content
+     if thinking_mode:
+         thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(final_content)
+         thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
+         formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
+         formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
+
+         _chat_bot[-1] = (user_message, formatted_result)
+         _context[-1]["contents"][0]["pairs"] = formal_answer_raw if formal_answer_raw else final_content
+     else:
+         _chat_bot[-1] = (user_message, final_content)
+         _context[-1]["contents"][0]["pairs"] = final_content
+
+     _app_cfg['ctx'] = _context
+     _app_cfg['images_cnt'] = images_cnt
+     _app_cfg['videos_cnt'] = videos_cnt
+     _app_cfg['is_streaming'] = False
+
+     upload_image_disabled = videos_cnt > 0
+     upload_video_disabled = videos_cnt > 0 or images_cnt > 0
+     yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
+
+
1145
+ def respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
1146
+ """主响应函数"""
1147
+ if 'session_id' not in _app_cfg:
1148
+ _app_cfg['session_id'] = uuid.uuid4().hex[:16]
1149
+ print(f"[会话] 为现有会话生成session_id: {_app_cfg['session_id']}")
1150
+
1151
+ # 记录thinking模式状态变化
1152
+ prev_thinking_mode = _app_cfg.get('last_thinking_mode', False)
1153
+ _app_cfg['last_thinking_mode'] = thinking_mode
1154
+
1155
+ if prev_thinking_mode != thinking_mode:
1156
+ print(f"[respond] Thinking模式切换: {prev_thinking_mode} -> {thinking_mode}")
1157
+ # 强制清理可能的缓存状态
1158
+ if hasattr(_app_cfg, 'thinking_cache'):
1159
+ del _app_cfg['thinking_cache']
1160
+ # 添加额外的状态重置
1161
+ if thinking_mode and not prev_thinking_mode:
1162
+ print("[respond] 启用thinking模式,重置相关状态")
1163
+ _app_cfg['thinking_enabled'] = True
1164
+ elif not thinking_mode and prev_thinking_mode:
1165
+ print("[respond] 禁用thinking模式")
1166
+ _app_cfg['thinking_enabled'] = False
1167
+
1168
+ if params_form == 'Beam Search':
1169
+ streaming_mode = False
1170
+ print(f"[respond] Beam Search模式,强制禁用流式模式")
1171
+
1172
+ if streaming_mode:
1173
+ print("[respond] 选择流式模式")
1174
+ yield from respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting)
1175
+ return
1176
+
1177
+ # 非流式模式
1178
+ _context = _app_cfg['ctx'].copy()
1179
+ encoded_message, temporal_ids = encode_message(_question, fps_setting)
1180
+ _context.append({'role': 'user', 'contents': encoded_message})
1181
+
1182
+ images_cnt = _app_cfg['images_cnt']
1183
+ videos_cnt = _app_cfg['videos_cnt']
1184
+ files_cnts = check_has_videos(_question)
1185
+ if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
1186
+ gr.Warning("Only supports single video file input right now!")
1187
+ upload_image_disabled = videos_cnt > 0
1188
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
1189
+ yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1190
+ return
1191
+ if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
1192
+ gr.Warning("Please chat with at least one image or video.")
1193
+ upload_image_disabled = videos_cnt > 0
1194
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
1195
+ yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1196
+ return
1197
+
1198
+ if params_form == 'Beam Search':
1199
+ params = {
1200
+ 'sampling': False,
1201
+ 'num_beams': 3,
1202
+ 'repetition_penalty': 1.2,
1203
+ "max_new_tokens": 16284,
1204
+ "enable_thinking": thinking_mode,
1205
+ "stream": False
1206
+ }
1207
+ else:
1208
+ params = {
1209
+ 'sampling': True,
1210
+ 'top_p': 0.8,
1211
+ 'top_k': 100,
1212
+ 'temperature': 0.7,
1213
+ 'repetition_penalty': 1.03,
1214
+ "max_new_tokens": 16284,
1215
+ "enable_thinking": thinking_mode,
1216
+ "stream": False
1217
+ }
1218
+
1219
+ if files_cnts[1] + videos_cnt > 0:
1220
+ params["max_inp_length"] = 2048 * 10
1221
+ params["use_image_id"] = False
1222
+ params["max_slice_nums"] = 1
1223
+
1224
+ # 调用聊天函数
1225
+ code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])
1226
+
1227
+ images_cnt += files_cnts[0]
1228
+ videos_cnt += files_cnts[1]
1229
+
1230
+ if code == 0:
1231
+ context_content = _context_answer if _context_answer else _answer
1232
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})
1233
+
1234
+ # 根据thinking_mode决定是否应用thinking格式化
1235
+ if thinking_mode:
1236
+ thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(_answer)
1237
+ thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
1238
+ formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
1239
+ print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, thinking_content: '{thinking_content_raw[:50]}...'")
1240
+ formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
1241
+ else:
1242
+ print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, 使用原始回答")
1243
+ formatted_result = _answer
1244
+
1245
+ # 构建用户消息显示
1246
+ user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
1247
+
1248
+ # 将媒体信息保存到会话状态中
1249
+ if saved_images:
1250
+ if 'media_cache' not in _app_cfg:
1251
+ _app_cfg['media_cache'] = []
1252
+ _app_cfg['media_cache'].extend(saved_images)
1253
+
1254
+ _chat_bot.append((user_message, formatted_result))
1255
+
1256
+ _app_cfg['ctx'] = _context
1257
+ _app_cfg['sts'] = sts
1258
+ else:
1259
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": "Error occurred during processing"}]})
1260
+ # 构建用户消息显示(错误情况)
1261
+ user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
1262
+
1263
+ # 将媒体信息保存到会话状态中
1264
+ if saved_images:
1265
+ if 'media_cache' not in _app_cfg:
1266
+ _app_cfg['media_cache'] = []
1267
+ _app_cfg['media_cache'].extend(saved_images)
1268
+
1269
+ _chat_bot.append((user_message, "Error occurred during processing"))
1270
+
1271
+ _app_cfg['images_cnt'] = images_cnt
1272
+ _app_cfg['videos_cnt'] = videos_cnt
1273
+ _app_cfg['is_streaming'] = False
1274
+
1275
+ upload_image_disabled = videos_cnt > 0
1276
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
1277
+
1278
+ # 统一使用yield返回结果,确保与流式模式兼容
1279
+ yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1280
+
1281
+
1282
+ # FewShot相关函数
1283
+ def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
1284
+ if 'session_id' not in _app_cfg:
1285
+ _app_cfg['session_id'] = uuid.uuid4().hex[:16]
1286
+ print(f"[会话] 为FewShot示例生成session_id: {_app_cfg['session_id']}")
1287
+
1288
+ ctx = _app_cfg["ctx"]
1289
+
1290
+ # 构建用户消息
1291
+ user_msg = ""
1292
+ if _image is not None:
1293
+ image = Image.open(_image).convert("RGB")
1294
+ ctx.append({"role": "user", "contents": [
1295
+ *encode_image(image),
1296
+ {"type": "text", "pairs": _user_message}
1297
+ ]})
1298
+ user_msg = f"[Image uploaded] {_user_message}"
1299
+ else:
1300
+ if _user_message:
1301
+ ctx.append({"role": "user", "contents": [{"type": "text", "pairs": _user_message}]})
1302
+ user_msg = _user_message
1303
+
1304
+ # 构建助手消息
1305
+ if _assistant_message:
1306
+ ctx.append({"role": "assistant", "contents": [{"type": "text", "pairs": _assistant_message}]})
1307
+
1308
+ # 只有当用户消息和助手消息都存在时才添加到聊天记录
1309
+ if user_msg and _assistant_message:
1310
+ formatted_user_msg = format_fewshot_user_message(_image, _user_message) if _image else user_msg
1311
+ _chat_bot.append([formatted_user_msg, _assistant_message])
1312
+
1313
+ return None, "", "", _chat_bot, _app_cfg
1314
+
1315
+
1316
+ def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
1317
+ """FewShot响应函数"""
1318
+ print(f"[fewshot_respond] Called with streaming_mode: {streaming_mode}")
1319
+
1320
+ if 'session_id' not in _app_cfg:
1321
+ _app_cfg['session_id'] = uuid.uuid4().hex[:16]
1322
+ print(f"[会话] 为FewShot会话生成session_id: {_app_cfg['session_id']}")
1323
+
1324
+ if params_form == 'Beam Search':
1325
+ streaming_mode = False
1326
+ print(f"[fewshot_respond] Beam Search模式,强制禁用流式模式")
1327
+
1328
+ user_message_contents = []
1329
+ _context = _app_cfg["ctx"].copy()
1330
+ images_cnt = _app_cfg["images_cnt"]
1331
+ temporal_ids = None
1332
+
1333
+ if _image:
1334
+ image = Image.open(_image).convert("RGB")
1335
+ user_message_contents += encode_image(image)
1336
+ images_cnt += 1
1337
+ if _user_message:
1338
+ user_message_contents += [{"type": "text", "pairs": _user_message}]
1339
+ if user_message_contents:
1340
+ _context.append({"role": "user", "contents": user_message_contents})
1341
+
1342
+ if params_form == 'Beam Search':
1343
+ params = {
1344
+ 'sampling': False,
1345
+ 'num_beams': 3,
1346
+ 'repetition_penalty': 1.2,
1347
+ "max_new_tokens": 16284,
1348
+ "enable_thinking": thinking_mode,
1349
+ "stream": False
1350
+ }
1351
+ else:
1352
+ params = {
1353
+ 'sampling': True,
1354
+ 'top_p': 0.8,
1355
+ 'top_k': 100,
1356
+ 'temperature': 0.7,
1357
+ 'repetition_penalty': 1.03,
1358
+ "max_new_tokens": 16284,
1359
+ "enable_thinking": thinking_mode,
1360
+ "stream": streaming_mode
1361
+ }
1362
+
1363
+ if disable_text_only and images_cnt == 0:
1364
+ gr.Warning("Please chat with at least one image or video.")
1365
+ yield _image, _user_message, '', _chat_bot, _app_cfg
1366
+ return
1367
+
1368
+ if streaming_mode:
1369
+ print(f"[fewshot_respond] Using streaming mode")
1370
+ _app_cfg['is_streaming'] = True
1371
+ _app_cfg['stop_streaming'] = False
1372
+
1373
+ if _image:
1374
+ user_msg = format_fewshot_user_message(_image, _user_message)
1375
+ _chat_bot.append([user_msg, ""])
1376
+ else:
1377
+ _chat_bot.append([_user_message, ""])
1378
+
1379
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})
1380
+
1381
+ _app_cfg['stop_streaming'] = False
1382
+
1383
+ gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])
1384
+
1385
+ yield _image, _user_message, '', _chat_bot, _app_cfg
1386
+
1387
+ accumulated_content = ""
1388
+ for _char in gen:
1389
+ if _app_cfg.get('stop_streaming', False):
1390
+ print("[fewshot_respond] 收到停止信号,中断流式响应")
1391
+ break
1392
+
1393
+ accumulated_content += _char
1394
+ _context[-1]["contents"][0]["pairs"] += _char
1395
+
1396
+ # 实时解析和格式化thinking内容
1397
+ if thinking_mode:
1398
+ # 尝试解析当前累积的内容
1399
+ thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)
1400
+
1401
+ # 如果解析出了完整的thinking内容,使用格式化显示
1402
+ if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
1403
+ thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
1404
+ formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
1405
+ formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
1406
+ _chat_bot[-1] = (_chat_bot[-1][0], formatted_display)
1407
+ else:
1408
+ # 正在thinking过程中或者还没有完整标签,直接显示原始内容(实时流式)
1409
+ _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
1410
+ else:
1411
+ # 非thinking模式,直接显示累积内容
1412
+ _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
1413
+
1414
+ yield _image, _user_message, '', _chat_bot, _app_cfg
1415
+
1416
+ final_content = _context[-1]["contents"][0]["pairs"]
1417
+
1418
+ _app_cfg['ctx'] = _context
1419
+ _app_cfg['images_cnt'] = images_cnt
1420
+ _app_cfg['is_streaming'] = False
1421
+
1422
+ yield _image, '', '', _chat_bot, _app_cfg
1423
+
1424
+ else:
1425
+ # 非流式模式
1426
+ code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])
1427
+
1428
+ context_content = _context_answer if _context_answer else _answer
1429
+ _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})
1430
+
1431
+ if _image:
1432
+ user_msg = format_fewshot_user_message(_image, _user_message)
1433
+ _chat_bot.append([user_msg, _answer])
1434
+ else:
1435
+ _chat_bot.append([_user_message, _answer])
1436
+
1437
+ if code == 0:
1438
+ _app_cfg['ctx'] = _context
1439
+ _app_cfg['sts'] = sts
1440
+ _app_cfg['images_cnt'] = images_cnt
1441
+
1442
+ _app_cfg['is_streaming'] = False
1443
+ yield None, '', '', _chat_bot, _app_cfg
1444
+
1445
+
+ # Other UI functions
+ def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+     print(f"[regenerate] streaming_mode: {streaming_mode}")
+     print(f"[regenerate] thinking_mode: {thinking_mode}")
+     print(f"[regenerate] chat_type: {_app_cfg.get('chat_type', 'unknown')}")
+ 
+     if params_form == 'Beam Search':
+         streaming_mode = False
+         print("[regenerate] Beam Search mode: streaming is force-disabled")
+ 
+     if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
+         gr.Warning('No question for regeneration.')
+         yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+         return
+ 
+     if _app_cfg["chat_type"] == "Chat":
+         images_cnt = _app_cfg['images_cnt']
+         videos_cnt = _app_cfg['videos_cnt']
+         _question = _chat_bot[-1][0]
+         _chat_bot = _chat_bot[:-1]
+         _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
+         files_cnts = check_has_videos(_question)
+         images_cnt -= files_cnts[0]
+         videos_cnt -= files_cnts[1]
+         _app_cfg['images_cnt'] = images_cnt
+         _app_cfg['videos_cnt'] = videos_cnt
+ 
+         print(f"[regenerate] About to call respond with streaming_mode: {streaming_mode}")
+         for result in respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+             new_input, _chat_bot, _app_cfg, _stop_button = result
+             _question = new_input
+             yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+     else:
+         # With the tuples format, _chat_bot[-1][0] is a plain string
+         last_user_message = _chat_bot[-1][0]
+         last_image = None
+ 
+         # Check whether the message carries an image marker
+         if "[Image uploaded]" in last_user_message:
+             # Extract the actual user message from the text
+             last_user_message = last_user_message.replace("[Image uploaded] ", "")
+             # Note: with the simplified tuples format the original image file
+             # cannot be recovered here; handle this as needed
+         _chat_bot = _chat_bot[:-1]
+         _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
+ 
+         print(f"[regenerate] About to call fewshot_respond with streaming_mode: {streaming_mode}")
+         for result in fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
+             _image, _user_message, _assistant_message, _chat_bot, _app_cfg = result
+             yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+ 
+ 
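The regeneration flow above only pops the last exchange and replays it: one chat turn is one `[user, assistant]` pair in the chatbot list but two entries in the context list, hence `[:-1]` versus `[:-2]`. A stand-alone sketch with illustrative data:

    def pop_last_exchange(chat_bot, ctx):
        # One turn = one [user, assistant] pair, but two context entries.
        question = chat_bot[-1][0]
        return question, chat_bot[:-1], ctx[:-2]

    chat = [["hi", "hello"], ["describe this", "a cat"]]
    ctx = [{"role": "user"}, {"role": "assistant"}] * 2
    q, chat, ctx = pop_last_exchange(chat, ctx)
    print(q, len(chat), len(ctx))  # describe this 1 2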
+ def flushed():
+     return gr.update(interactive=True)
+ 
+ 
+ def clear_media_cache(session_id):
+     """Clean up the media cache for the given session."""
+     import shutil
+     from pathlib import Path
+ 
+     try:
+         cache_dir = Path("./media_cache") / session_id
+         if cache_dir.exists():
+             shutil.rmtree(cache_dir)
+             print(f"[clear_media_cache] Cleared media cache for session {session_id}")
+     except Exception as e:
+         print(f"[clear_media_cache] Failed to clear media cache: {e}")
+ 
+ 
+ def clear(txt_input, file_upload, chat_bot, app_session):
+     # Clean up the previous session's media cache
+     if 'session_id' in app_session:
+         clear_media_cache(app_session['session_id'])
+ 
+     chat_bot = copy.deepcopy(init_conversation)
+     app_session['sts'] = None
+     app_session['ctx'] = []
+     app_session['images_cnt'] = 0
+     app_session['videos_cnt'] = 0
+     app_session['stop_streaming'] = False
+     app_session['is_streaming'] = False
+     app_session['media_cache'] = []  # reset cached media info
+     app_session['last_thinking_mode'] = False  # reset thinking-mode state
+     app_session['session_id'] = uuid.uuid4().hex[:16]
+     print(f"[session] Generated new session ID: {app_session['session_id']}")
+     return "", None, gr.update(value=[], visible=False), gr.update(value=[], visible=False), chat_bot, app_session, None, '', ''
+ 
+ 
+ def select_chat_type(_tab, _app_cfg):
+     _app_cfg["chat_type"] = _tab
+     return _app_cfg
+ 
+ 
+ # UI configuration
+ form_radio = {
+     'choices': ['Beam Search', 'Sampling'],
+     'value': 'Sampling',
+     'interactive': True,
+     'label': 'Decode Type'
+ }
+ 
+ thinking_checkbox = {
+     'value': False,
+     'interactive': True,
+     'label': 'Enable Thinking Mode',
+ }
+ 
+ streaming_checkbox = {
+     'value': True,
+     'interactive': True,
+     'label': 'Enable Streaming Mode',
+ }
+ 
+ fps_slider = {
+     'minimum': 1,
+     'maximum': 20,
+     'value': 3,
+     'step': 1,
+     'interactive': True,
+     'label': 'Custom FPS for Video Processing'
+ }
+ 
+ init_conversation = [
+     ["", "You can talk to me now"]
+ ]
+ 
+ css = """
+ video { height: auto !important; }
+ .example label { font-size: 16px;}
+ 
+ /* Current Media Gallery scrollbar styles - class selectors are safer here */
+ .current-media-gallery {
+     overflow-y: auto !important;
+     max-height: 600px !important;
+     position: relative !important;
+ }
+ 
+ /* Only affect the inside of this specific Gallery container */
+ .current-media-gallery > div,
+ .current-media-gallery .gallery-container {
+     overflow-y: auto !important;
+     max-height: 580px !important;
+ }
+ 
+ .current-media-gallery .gallery-item {
+     margin-bottom: 10px !important;
+ }
+ 
+ /* Custom scrollbar styles for the Current Media Gallery only */
+ .current-media-gallery::-webkit-scrollbar,
+ .current-media-gallery > div::-webkit-scrollbar,
+ .current-media-gallery .gallery-container::-webkit-scrollbar {
+     width: 8px !important;
+ }
+ 
+ .current-media-gallery::-webkit-scrollbar-track,
+ .current-media-gallery > div::-webkit-scrollbar-track,
+ .current-media-gallery .gallery-container::-webkit-scrollbar-track {
+     background: #f1f1f1 !important;
+     border-radius: 4px !important;
+ }
+ 
+ .current-media-gallery::-webkit-scrollbar-thumb,
+ .current-media-gallery > div::-webkit-scrollbar-thumb,
+ .current-media-gallery .gallery-container::-webkit-scrollbar-thumb {
+     background: #c1c1c1 !important;
+     border-radius: 4px !important;
+ }
+ 
+ .current-media-gallery::-webkit-scrollbar-thumb:hover,
+ .current-media-gallery > div::-webkit-scrollbar-thumb:hover,
+ .current-media-gallery .gallery-container::-webkit-scrollbar-thumb:hover {
+     background: #a8a8a8 !important;
+ }
+ 
+ /* Hide unnecessary Current Media elements */
+ .current-media-gallery .upload-container,
+ .current-media-gallery .drop-zone,
+ .current-media-gallery .file-upload,
+ .current-media-gallery .upload-text,
+ .current-media-gallery .drop-text {
+     display: none !important;
+ }
+ 
+ .current-media-gallery .clear-button,
+ .current-media-gallery .delete-button,
+ .current-media-gallery .remove-button {
+     display: none !important;
+ }
+ 
+ /* Hide the label and placeholder text when the Gallery is empty */
+ .current-media-gallery:not([style*="display: none"]) .gallery-container:empty::after {
+     content: "";
+     display: none;
+ }
+ 
+ .current-media-gallery .empty-gallery-text,
+ .current-media-gallery .placeholder-text {
+     display: none !important;
+ }
+ 
+ /* Keep the custom scrollbar from affecting other components */
+ .current-media-gallery {
+     isolation: isolate !important;
+ }
+ 
+ /* Reset scrollbar styles on other Gallery components so they are not polluted */
+ .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar {
+     width: initial !important;
+ }
+ 
+ .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-track {
+     background: initial !important;
+     border-radius: initial !important;
+ }
+ 
+ .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-thumb {
+     background: initial !important;
+     border-radius: initial !important;
+ }
+ 
+ /* Keep the chatbot unaffected */
+ .thinking-chatbot::-webkit-scrollbar {
+     width: initial !important;
+ }
+ 
+ .thinking-chatbot::-webkit-scrollbar-track {
+     background: initial !important;
+ }
+ 
+ .thinking-chatbot::-webkit-scrollbar-thumb {
+     background: initial !important;
+ }
+ 
+ /* Styles for the thinking process and the formal answer */
+ .response-container {
+     margin: 10px 0;
+ }
+ 
+ .thinking-section {
+     background: linear-gradient(135deg, #f8f9ff 0%, #f0f4ff 100%);
+     border: 1px solid #d1d9ff;
+     border-radius: 12px;
+     padding: 16px;
+     margin-bottom: 0px;
+     box-shadow: 0 2px 8px rgba(67, 90, 235, 0.1);
+ }
+ 
+ .thinking-header {
+     font-weight: 600;
+     color: #4c5aa3;
+     font-size: 14px;
+     margin-bottom: 12px;
+     display: flex;
+     align-items: center;
+     gap: 8px;
+ }
+ 
+ .thinking-content {
+     color: #5a6ba8;
+     font-size: 13px;
+     line-height: 1;
+     font-style: italic;
+     background: rgba(255, 255, 255, 0.6);
+     padding: 12px;
+     border-radius: 8px;
+     border-left: 3px solid #4c5aa3;
+     white-space: pre-wrap;
+ }
+ 
+ .formal-section {
+     background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%);
+     border: 1px solid #e9ecef;
+     border-radius: 12px;
+     padding: 16px;
+     box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
+ }
+ 
+ .formal-header {
+     font-weight: 600;
+     color: #28a745;
+     font-size: 14px;
+     margin-bottom: 12px;
+     display: flex;
+     align-items: center;
+     gap: 8px;
+ }
+ 
+ .formal-content {
+     color: #333;
+     font-size: 14px;
+     line-height: 1;
+     white-space: pre-wrap;
+ }
+ 
+ /* Chatbot container styles */
+ .thinking-chatbot .message {
+     border-radius: 12px;
+     overflow: visible;
+     margin-top: 0 !important;
+     margin-bottom: 0 !important;
+ }
+ 
+ .thinking-chatbot .message-wrap {
+     margin-top: 0 !important;
+     margin-bottom: 0 !important;
+ }
+ 
+ .thinking-chatbot .message.bot {
+     background: transparent !important;
+     border: none !important;
+     padding: 8px !important;
+ }
+ 
+ .thinking-chatbot .message.bot .content {
+     background: transparent !important;
+ }
+ """
+ 
+ introduction = """
+ ## Features:
+ 1. Chat with a single image
+ 2. Chat with multiple images
+ 3. Chat with video
+ 4. Streaming Mode: real-time response streaming
+ 5. Thinking Mode: show the model's reasoning process
+ 
+ Click the `How to use` tab to see examples.
+ """
+ 
+ 
+ # Main application
+ def create_app():
+     with gr.Blocks(css=css) as demo:
+         with gr.Tab(model_name):
+             with gr.Row():
+                 with gr.Column(scale=1, min_width=300):
+                     gr.Markdown(value=introduction)
+                     params_form = create_component(form_radio, comp='Radio')
+                     thinking_mode = create_component(thinking_checkbox, comp='Checkbox')
+                     streaming_mode = create_component(streaming_checkbox, comp='Checkbox')
+ 
+                     fps_setting = create_component(fps_slider, comp='Slider')
+                     regenerate = create_component({'value': 'Regenerate'}, comp='Button')
+                     clear_button = create_component({'value': 'Clear History'}, comp='Button')
+ 
+                     stop_button = gr.Button("Stop", visible=False)
+ 
+                 with gr.Column(scale=3, min_width=500):
+                     initial_session_id = uuid.uuid4().hex[:16]
+                     print(f"[session] Initialized session, generated session_id: {initial_session_id}")
+                     app_session = gr.State({
+                         'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0,
+                         'chat_type': 'Chat', 'stop_streaming': False, 'is_streaming': False,
+                         'session_id': initial_session_id, 'media_cache': [], 'last_thinking_mode': False
+                     })
+                     with gr.Row():
+                         with gr.Column(scale=4):
+                             chat_bot = gr.Chatbot(
+                                 label=f"Chat with {model_name}",
+                                 value=copy.deepcopy(init_conversation),
+                                 height=600,
+                                 elem_classes="thinking-chatbot"
+                             )
+                         with gr.Column(scale=1, min_width=200):
+                             current_images = gr.Gallery(
+                                 label="Current Media",
+                                 show_label=True,
+                                 elem_id="current_media",
+                                 elem_classes="current-media-gallery",
+                                 columns=1,
+                                 rows=1,  # a single row so the content scrolls vertically
+                                 height=600,
+                                 visible=False,
+                                 container=True,  # enable container mode
+                                 allow_preview=True,  # allow preview
+                                 show_download_button=False,  # hide the download button
+                                 interactive=False,  # non-interactive, so users cannot upload/delete
+                                 show_share_button=False  # hide the share button
+                             )
+ 
+                     with gr.Tab("Chat") as chat_tab:
+                         chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
+ 
+                         with gr.Row():
+                             with gr.Column(scale=4):
+                                 txt_input = gr.Textbox(
+                                     placeholder="Type your message here...",
+                                     label="Message",
+                                     lines=2
+                                 )
+                             with gr.Column(scale=1):
+                                 submit_btn = gr.Button("Submit", variant="primary")
+ 
+                         with gr.Row():
+                             with gr.Column():
+                                 file_upload = create_multimodal_input()
+                                 # Image preview component
+                                 file_preview = gr.Gallery(
+                                     label="Uploaded Files Preview",
+                                     show_label=True,
+                                     elem_id="file_preview",
+                                     columns=3,
+                                     rows=2,
+                                     height="auto",
+                                     visible=False
+                                 )
+ 
+                         # Update the preview when files are uploaded
+                         def update_file_preview(files):
+                             if files:
+                                 # Keep only image files for the preview
+                                 image_files = []
+                                 for file in files:
+                                     if hasattr(file, 'name'):
+                                         file_path = file.name
+                                     else:
+                                         file_path = str(file)
+ 
+                                     # Check whether this is an image file
+                                     if any(file_path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']):
+                                         image_files.append(file_path)
+ 
+                                 if image_files:
+                                     return gr.update(value=image_files, visible=True)
+ 
+                             return gr.update(value=[], visible=False)
+ 
+                         file_upload.change(
+                             update_file_preview,
+                             inputs=[file_upload],
+                             outputs=[file_preview]
+                         )
+ 
+                         # Wrapper that adapts the new input format to respond()
+                         def handle_submit(message, files, chat_bot, current_images_gallery, app_session, params_form, thinking_mode, streaming_mode, fps_setting):
+                             print(f"[handle_submit] Received input: message='{message}', files={files}, chat_bot length={len(chat_bot)}")
+ 
+                             # Return immediately if both the message and the files are empty
+                             if not message and not files:
+                                 print("[handle_submit] Message and files are both empty; returning")
+                                 return message, files, chat_bot, current_images_gallery, app_session, gr.update(visible=False)
+ 
+                             # Mimic the original MultimodalInput format
+                             class MockInput:
+                                 def __init__(self, text, files):
+                                     self.text = text
+                                     self.files = files if files else []
+ 
+                             mock_question = MockInput(message, files)
+                             print(f"[handle_submit] Created MockInput: text='{mock_question.text}', files={len(mock_question.files)}")
+ 
+                             # respond() returns a generator; yield its results step by step
+                             result_generator = respond(mock_question, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting)
+ 
+                             # If it is a generator, yield incrementally
+                             if hasattr(result_generator, '__iter__') and not isinstance(result_generator, (str, bytes, tuple)):
+                                 print("[handle_submit] Using generator mode")
+                                 for result in result_generator:
+                                     new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result
+                                     print(f"[handle_submit] Yielding result: chat_bot length={len(updated_chat_bot)}")
+ 
+                                     # Update the media display
+                                     media_gallery_update = update_media_gallery(updated_app_session)
+ 
+                                     # Return in the expected output format
+                                     yield "", None, updated_chat_bot, media_gallery_update, updated_app_session, stop_btn_update
+                             else:
+                                 print("[handle_submit] Using non-generator mode")
+                                 # Not a generator; yield the single result
+                                 new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result_generator
+                                 print(f"[handle_submit] Returning result directly: chat_bot length={len(updated_chat_bot)}")
+ 
+                                 # Update the image display
+                                 image_gallery_update = update_image_gallery(updated_app_session)
+ 
+                                 yield "", None, updated_chat_bot, image_gallery_update, updated_app_session, stop_btn_update
+ 
+                         submit_btn.click(
+                             handle_submit,
+                             [txt_input, file_upload, chat_bot, current_images, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
+                             [txt_input, file_upload, chat_bot, current_images, app_session, stop_button]
+                         )
+ 
+                     with gr.Tab("Few Shot", visible=False) as fewshot_tab:
+                         fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
+                         with gr.Row():
+                             with gr.Column(scale=1):
+                                 image_input = gr.Image(type="filepath", sources=["upload"])
+                             with gr.Column(scale=3):
+                                 user_message = gr.Textbox(label="User")
+                                 assistant_message = gr.Textbox(label="Assistant")
+                         with gr.Row():
+                             add_demonstration_button = gr.Button("Add Example")
+                             generate_button = gr.Button(value="Generate", variant="primary")
+ 
+                         add_demonstration_button.click(
+                             fewshot_add_demonstration,
+                             [image_input, user_message, assistant_message, chat_bot, app_session],
+                             [image_input, user_message, assistant_message, chat_bot, app_session]
+                         )
+                         generate_button.click(
+                             fewshot_respond,
+                             [image_input, user_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
+                             [image_input, user_message, assistant_message, chat_bot, app_session]
+                         )
+ 
+                     chat_tab.select(
+                         select_chat_type,
+                         [chat_tab_label, app_session],
+                         [app_session]
+                     )
+                     chat_tab.select(
+                         clear,
+                         [txt_input, file_upload, chat_bot, app_session],
+                         # clear() returns nine values, so current_images must be in the outputs as well
+                         [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
+                     )
+                     fewshot_tab.select(
+                         select_chat_type,
+                         [fewshot_tab_label, app_session],
+                         [app_session]
+                     )
+                     fewshot_tab.select(
+                         clear,
+                         [txt_input, file_upload, chat_bot, app_session],
+                         [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
+                     )
+                     # chat_bot.flushed(flushed, outputs=[txt_input])  # the standard Chatbot may not support flushed
+ 
+                     params_form.change(
+                         update_streaming_mode_state,
+                         inputs=[params_form],
+                         outputs=[streaming_mode]
+                     )
+ 
+                     regenerate.click(
+                         regenerate_button_clicked,
+                         [txt_input, image_input, user_message, assistant_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
+                         [txt_input, image_input, user_message, assistant_message, chat_bot, app_session]
+                     )
+                     clear_button.click(
+                         clear,
+                         [txt_input, file_upload, chat_bot, app_session],
+                         [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
+                     )
+ 
+                     stop_button.click(
+                         stop_button_clicked,
+                         [app_session],
+                         [app_session, stop_button]
+                     )
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     # Parse command-line arguments
+     parser = argparse.ArgumentParser(description='Web Demo for MiniCPM-V 4.5')
+     parser.add_argument('--port', type=int, default=7860, help='Port to run the web demo on')
+     parser.add_argument('--no-parallel-encoding', action='store_true', help='Disable parallel image encoding')
+     parser.add_argument('--parallel-processes', type=int, default=None, help='Number of parallel processes for image encoding')
+     args = parser.parse_args()
+ 
+     # Configure parallel encoding
+     if args.no_parallel_encoding:
+         ENABLE_PARALLEL_ENCODING = False
+         print("[perf] Parallel image encoding disabled")
+     else:
+         ENABLE_PARALLEL_ENCODING = True
+         print("[perf] Parallel image encoding enabled")
+ 
+     if args.parallel_processes:
+         PARALLEL_PROCESSES = args.parallel_processes
+         print(f"[perf] Parallel process count set to {PARALLEL_PROCESSES}")
+     else:
+         print(f"[perf] Auto-detecting parallel process count; CPU cores: {mp.cpu_count()}")
+ 
+     # Initialize the model
+     initialize_model()
+ 
+     # Create and launch the app
+     demo = create_app()
+     demo.launch(
+         share=False,
+         debug=True,
+         show_api=False,
+         server_port=args.port,
+         server_name="0.0.0.0"
+     )
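For reference, the entry point above defines three flags via argparse; assuming the file is run directly, typical invocations look like:

    python app.py --port 7860
    python app.py --no-parallel-encoding
    python app.py --parallel-processes 4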
logging_util.py ADDED
@@ -0,0 +1,34 @@
+ import logging
+ import sys
+ import os
+ 
+ # set up the root logger
+ def setup_root_logger(log_level=logging.INFO, dist_rank=0, local_dir=''):
+     """
+     log_level: logging level
+     dist_rank: process rank for distributed training
+     local_dir: local log path; an empty string disables file logging
+     """
+     logger = logging.getLogger()  # configure the root logger for everything
+     for handler in list(logger.handlers):  # copy the list: removing while iterating skips handlers
+         logger.removeHandler(handler)
+     # create formatter
+     fmt = '[%(asctime)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
+     # color_fmt = colored('[%(asctime)s]', 'green') + \
+     #     colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s'
+ 
+     # create a console handler for the master process
+     if dist_rank == 0:
+         console_handler = logging.StreamHandler(sys.stdout)
+         console_handler.setFormatter(
+             logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+         logger.addHandler(console_handler)
+ 
+     # create a file handler
+     if local_dir:
+         os.makedirs(local_dir, exist_ok=True)
+         file_handler = logging.FileHandler(os.path.join(local_dir, f'log_rank{dist_rank}.log'), mode='a')
+         file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+         logger.addHandler(file_handler)
+ 
+     logger.setLevel(log_level)
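A minimal usage sketch for the helper above (the log directory name is an assumption):

    import logging
    from logging_util import setup_root_logger

    setup_root_logger(log_level=logging.INFO, dist_rank=0, local_dir='./logs')
    logging.getLogger(__name__).info("root logger configured")
    # rank 0 prints to stdout and all ranks append to ./logs/log_rank<rank>.log, e.g.:
    # [2025-01-01 12:00:00] (demo.py 5): INFO root logger configured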
models/__init__.py ADDED
@@ -0,0 +1 @@
+ from .minicpmv4_5 import ModelMiniCPMV4_5
models/minicpmv4_5.py ADDED
@@ -0,0 +1,158 @@
+ import spaces
+ from io import BytesIO
+ import torch
+ from PIL import Image
+ import base64
+ import json
+ import re
+ import logging
+ from transformers import AutoModel, AutoTokenizer, AutoProcessor, set_seed
+ # set_seed(42)
+ 
+ logger = logging.getLogger(__name__)
+ 
+ class ModelMiniCPMV4_5:
+     def __init__(self, path) -> None:
+         self.model = AutoModel.from_pretrained(
+             path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16, device_map="auto")
+         self.model.eval()
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             path, trust_remote_code=True)
+         self.processor = AutoProcessor.from_pretrained(
+             path, trust_remote_code=True)
+ 
+     def __call__(self, input_data):
+         image = None
+         if "image" in input_data and len(input_data["image"]) > 10:  # crude check for a non-trivial base64 payload
+             image = Image.open(BytesIO(base64.b64decode(
+                 input_data["image"]))).convert('RGB')
+ 
+         msgs = input_data["question"]
+         params = input_data.get("params", "{}")
+         params = json.loads(params)
+         msgs = json.loads(msgs)
+ 
+         temporal_ids = input_data.get("temporal_ids", None)
+         if temporal_ids:
+             temporal_ids = json.loads(temporal_ids)
+ 
+         if params.get("max_new_tokens", 0) > 16384:
+             logger.info("capping max_new_tokens at 16384 to save memory")
+             params["max_new_tokens"] = 16384
+         if params.get("max_inp_length", 0) > 2048 * 10:
+             logger.info(f"capping max_inp_length at {2048 * 10}, a high limit kept for video processing")
+             params["max_inp_length"] = 2048 * 10
+ 
+         for msg in msgs:
+             if 'content' in msg:
+                 contents = msg['content']
+             else:
+                 contents = msg.pop('contents')
+ 
+             new_cnts = []
+             for c in contents:
+                 if isinstance(c, dict):
+                     if c['type'] == 'text':
+                         c = c['pairs']
+                     elif c['type'] == 'image':
+                         c = Image.open(
+                             BytesIO(base64.b64decode(c["pairs"]))).convert('RGB')
+                     else:
+                         raise ValueError(
+                             "contents type only supports text and image.")
+                 new_cnts.append(c)
+             msg['content'] = new_cnts
+         logger.info(f'msgs: {str(msgs)}')
+ 
+         enable_thinking = params.pop('enable_thinking', True)
+         is_streaming = params.pop('stream', False)
+ 
+         if is_streaming:
+             return self._stream_chat(image, msgs, enable_thinking, params, temporal_ids)
+         else:
+             chat_kwargs = {
+                 "image": image,
+                 "msgs": msgs,
+                 "tokenizer": self.tokenizer,
+                 "processor": self.processor,
+                 "enable_thinking": enable_thinking,
+                 **params
+             }
+ 
+             if temporal_ids is not None:
+                 chat_kwargs["temporal_ids"] = temporal_ids
+ 
+             answer = self.model.chat(**chat_kwargs)
+ 
+             # Non-greedy match, so only individual <box>...</box> spans are dropped
+             res = re.sub(r'<box>.*?</box>', '', answer)
+             res = res.replace('<ref>', '')
+             res = res.replace('</ref>', '')
+             res = res.replace('<box>', '')
+             answer = res.replace('</box>', '')
+             if not enable_thinking:
+                 print(f"enable_thinking: {enable_thinking}")
+                 answer = answer.replace('</think>', '')
+ 
+             oids = self.tokenizer.encode(answer)
+             output_tokens = len(oids)
+             return answer, output_tokens
+ 
+     def _stream_chat(self, image, msgs, enable_thinking, params, temporal_ids=None):
+         try:
+             params['stream'] = True
+             chat_kwargs = {
+                 "image": image,
+                 "msgs": msgs,
+                 "tokenizer": self.tokenizer,
+                 "processor": self.processor,
+                 "enable_thinking": enable_thinking,
+                 **params
+             }
+             if temporal_ids is not None:
+                 chat_kwargs["temporal_ids"] = temporal_ids
+ 
+             answer_generator = self.model.chat(**chat_kwargs)
+ 
+             if not hasattr(answer_generator, '__iter__'):
+                 # Some code paths return a plain string; clean it and stream it char by char
+                 answer = answer_generator
+                 res = re.sub(r'<box>.*?</box>', '', answer)
+                 res = res.replace('<ref>', '')
+                 res = res.replace('</ref>', '')
+                 res = res.replace('<box>', '')
+                 answer = res.replace('</box>', '')
+                 if not enable_thinking:
+                     answer = answer.replace('</think>', '')
+ 
+                 char_count = 0
+                 for char in answer:
+                     yield char
+                     char_count += 1
+             else:
+                 full_answer = ""
+                 chunk_count = 0
+                 char_count = 0
+ 
+                 for chunk in answer_generator:
+                     if isinstance(chunk, str):
+                         clean_chunk = re.sub(r'<box>.*?</box>', '', chunk)
+                         clean_chunk = clean_chunk.replace('<ref>', '')
+                         clean_chunk = clean_chunk.replace('</ref>', '')
+                         clean_chunk = clean_chunk.replace('<box>', '')
+                         clean_chunk = clean_chunk.replace('</box>', '')
+ 
+                         if not enable_thinking:
+                             clean_chunk = clean_chunk.replace('</think>', '')
+ 
+                         full_answer += chunk
+                         char_count += len(clean_chunk)
+                         chunk_count += 1
+                         yield clean_chunk
+                     else:
+                         full_answer += str(chunk)
+                         char_count += len(str(chunk))
+                         chunk_count += 1
+                         yield str(chunk)
+ 
+         except Exception as e:
+             logger.error(f"Stream chat error: {e}")
+             yield f"Error: {str(e)}"
requirements.txt ADDED
@@ -0,0 +1,37 @@
+ huggingface_hub
+ # Core dependencies
+ spaces
+ gradio==4.44.1
+ torch==2.7.1
+ torchvision==0.22.1
+ numpy==2.2.6
+ pillow
+ scipy
+ pandas==2.3.1
+ 
+ # ML/AI dependencies
+ transformers==4.55.0
+ accelerate==1.9.0
+ einops==0.8.1
+ timm==1.0.19
+ safetensors==0.5.3
+ tokenizers==0.21.4
+ huggingface-hub==0.34.3
+ 
+ # Video processing
+ decord==0.6.0
+ ffmpy==0.6.1
+ pydub==0.25.1
+ 
+ # Utilities
+ requests==2.32.4
+ tqdm==4.67.1
+ PyYAML==6.0.2
+ psutil==7.0.0
+ pydantic==2.10.6
+ 
+ # Visualization
+ matplotlib==3.10.5
+ 
+ # Optional: For CUDA support (if needed)
+ # triton==3.3.1