Upload 5 files
- app.py +2035 -0
- logging_util.py +34 -0
- models/__init__.py +1 -0
- models/minicpmv4_5.py +158 -0
- requirements.txt +37 -0
app.py
ADDED
@@ -0,0 +1,2035 @@
import os
import json
import uuid
import time
import copy
import base64
import logging
import argparse
import math
import multiprocessing as mp
from io import BytesIO
from typing import Generator, Any, Dict, Optional

import spaces
import torch
import gradio as gr
import numpy as np
from PIL import Image
from decord import VideoReader, cpu
from scipy.spatial import cKDTree
# import modelscope_studio as mgr

# Import the model modules, falling back to a stub if unavailable
try:
    from models import ModelMiniCPMV4_5
except ImportError:
    print("Warning: models module not found. Please ensure models.py is available.")
    class ModelMiniCPMV4_5:
        def __init__(self, model_path):
            self.model_path = model_path
            self.model = None

        def __call__(self, query):
            return "Model not loaded", 0

# Global configuration
ERROR_MSG = "Error, please retry"
model_name = 'MiniCPM-V 4.5'
disable_text_only = False  # Allow text-only messages, convenient for testing
DOUBLE_FRAME_DURATION = 30
MAX_NUM_FRAMES = 180
MAX_NUM_PACKING = 3
TIME_SCALE = 0.1
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

ENABLE_PARALLEL_ENCODING = True
PARALLEL_PROCESSES = None

# Global model instance
global_model = None

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Global model configuration
model_config = {
    'model_path': None,
    'model_type': None,
    'instance_id': 0
}

# Global model cache (inside the GPU process)
_gpu_model_cache = None

def _initialize_gpu_model():
    """Fetch the model inside the GPU process and move it to the GPU."""
    global _gpu_model_cache
    if _gpu_model_cache is None:
        logger.info(f"Initializing model in GPU process: {model_config['model_type']}")

        match model_config['model_type'].lower():
            case 'minicpmv4_5':
                _gpu_model_cache = ModelMiniCPMV4_5(model_config['model_path'])
            case _:
                raise ValueError(f"Unsupported model type: {model_config['model_type']}")

        logger.info("Model initialized on CPU")

    # Move the model to the GPU on every inference call
    if hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'to'):
        logger.info("Moving model to GPU...")
        _gpu_model_cache.model.to('cuda')
    elif hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'model') and hasattr(_gpu_model_cache.model.model, 'to'):
        logger.info("Moving model to GPU (nested model)...")
        _gpu_model_cache.model.model.to('cuda')

    return _gpu_model_cache

@spaces.GPU
def gpu_handler(query):
    """GPU inference handler."""
    model = _initialize_gpu_model()

    res, output_tokens = model({
        "image": query["image"],
        "question": query["question"],
        "params": query.get("params", "{}"),
        "temporal_ids": query.get("temporal_ids", None)
    })
    return {
        "result": res,
        "usage": {"output_tokens": output_tokens}
    }
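# A minimal sketch of the payload gpu_handler expects (values are illustrative,
# not from a real session): images/frames arrive base64-encoded, while the
# message list and decoding params are pre-serialized JSON strings, as built by
# chat_direct further below.
#
#   result = gpu_handler({
#       "image": "",                                  # or a base64 PNG string
#       "question": json.dumps([{"role": "user", "contents": [
#           {"type": "text", "pairs": "Describe the image"}]}]),
#       "params": json.dumps({"sampling": True, "max_new_tokens": 64}),
#   })
#   print(result["result"], result["usage"]["output_tokens"])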

@spaces.GPU
def gpu_stream_handler(query):
    """GPU streaming inference handler."""
    model = _initialize_gpu_model()

    params = json.loads(query.get("params", "{}"))
    params["stream"] = True
    query["params"] = json.dumps(params)

    try:
        generator = model({
            "image": query["image"],
            "question": query["question"],
            "params": query["params"],
            "temporal_ids": query.get("temporal_ids", None)
        })

        # Collect the generator's entire output to avoid serialization issues
        full_response = ""
        for chunk in generator:
            full_response += chunk

        return full_response
    except Exception as e:
        logger.error(f"GPU stream handler error: {e}")
        return f"Stream error: {str(e)}"

class Model:
    """Model wrapper class; does not hold the actual model object."""

    def __init__(self, model_path: str, model_type: str, instance_id: int = 0):
        self.instance_id = instance_id
        self.model_path = model_path
        self.model_type = model_type

        # Set the global configuration
        model_config['model_path'] = model_path
        model_config['model_type'] = model_type
        model_config['instance_id'] = instance_id

        logger.info(f"Instance {instance_id}: configured model type {model_type}")
        logger.info(f"Instance {instance_id}: model path {model_path}")

    def handler(self, query):
        """Non-streaming inference handler."""
        return gpu_handler(query)

    def stream_handler(self, query):
        """Streaming inference handler."""
        return gpu_stream_handler(query)


def initialize_model():
    """Initialize the global model."""
    global global_model, _gpu_model_cache

    # Default configuration
    model_path = os.getenv('MODEL_PATH', 'openbmb/MiniCPM-V-4_5')
    model_type = os.getenv('MODEL_TYPE', 'minicpmv4_5')

    logger.info("=" * 50)
    logger.info("Starting the MiniCPM-V service")
    logger.info(f"Model path: {model_path}")
    logger.info(f"Model type: {model_type}")
    logger.info("=" * 50)

    # Create the model wrapper
    global_model = Model(model_path, model_type, 0)

    # Preload the model onto the CPU in the main process (optional, for faster first inference)
    try:
        logger.info("Preloading model onto CPU in the main process...")
        match model_type.lower():
            case 'minicpmv4_5':
                _gpu_model_cache = ModelMiniCPMV4_5(model_path)
            case _:
                raise ValueError(f"Unsupported model type: {model_type}")

        logger.info("Model preloaded on CPU in the main process")
    except Exception as e:
        logger.warning(f"Preloading in the main process failed, will load in the GPU process instead: {e}")
        _gpu_model_cache = None

    return global_model
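# Hedged usage sketch: the Space presumably calls initialize_model() once at
# startup; MODEL_PATH / MODEL_TYPE can be overridden via environment variables.
#
#   os.environ.setdefault("MODEL_PATH", "openbmb/MiniCPM-V-4_5")
#   os.environ.setdefault("MODEL_TYPE", "minicpmv4_5")
#   model = initialize_model()  # wrapper only; weights move to GPU lazily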


# Utility functions
def get_file_extension(filename):
    return os.path.splitext(filename)[1].lower()


def is_image(filename):
    return get_file_extension(filename) in IMAGE_EXTENSIONS


def is_video(filename):
    return get_file_extension(filename) in VIDEO_EXTENSIONS


def map_to_nearest_scale(values, scale):
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]


def group_array(arr, size):
    return [arr[i:i+size] for i in range(0, len(arr), size)]
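# Worked example (illustrative values): frame timestamps get snapped to the
# nearest multiple of TIME_SCALE (0.1 s) via a KD-tree nearest-neighbour
# lookup, then chunked into packing groups.
#
#   map_to_nearest_scale([0.04, 0.97, 2.31], np.arange(0, 3, 0.1))
#   -> array([0.0, 1.0, 2.3])   (up to float rounding)
#   group_array([0, 10, 23, 31], 3)
#   -> [[0, 10, 23], [31]]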


def encode_image(image):
    """Encode a single image."""
    if not isinstance(image, Image.Image):
        if hasattr(image, 'path'):
            image = Image.open(image.path)
        elif hasattr(image, 'file') and hasattr(image.file, 'path'):
            image = Image.open(image.file.path)
        elif hasattr(image, 'name'):
            image = Image.open(image.name)
        else:
            image_path = getattr(image, 'url', getattr(image, 'orig_name', str(image)))
            image = Image.open(image_path)

    # Resize the image
    max_size = 448*16
    if max(image.size) > max_size:
        w, h = image.size
        if w > h:
            new_w = max_size
            new_h = int(h * max_size / w)
        else:
            new_h = max_size
            new_w = int(w * max_size / h)
        image = image.resize((new_w, new_h), resample=Image.BICUBIC)

    # Convert to base64
    buffered = BytesIO()
    image.save(buffered, format="png")
    im_b64 = base64.b64encode(buffered.getvalue()).decode()
    return [{"type": "image", "pairs": im_b64}]


def encode_image_parallel(image_data):
    """Wrapper used for parallel image encoding."""
    try:
        return encode_image(image_data)
    except Exception as e:
        print(f"[Parallel encoding error] Image encoding failed: {e}")
        return None


def encode_images_parallel(frames, num_processes=None):
    """Multi-process parallel image encoding."""
    if not ENABLE_PARALLEL_ENCODING:
        print("[Parallel encoding] Parallel encoding disabled, using serial processing")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
        return encoded_frames

    if num_processes is None:
        cpu_cores = mp.cpu_count()
        if PARALLEL_PROCESSES:
            num_processes = PARALLEL_PROCESSES
        else:
            if len(frames) >= 50:
                num_processes = min(cpu_cores, len(frames), 32)
            elif len(frames) >= 20:
                num_processes = min(cpu_cores, len(frames), 16)
            else:
                num_processes = min(cpu_cores, len(frames), 8)

    print(f"[Parallel encoding] Starting parallel encoding of {len(frames)} frame images, using {num_processes} processes")

    if len(frames) <= 2:
        print(f"[Parallel encoding] Few images ({len(frames)} frames), using serial processing")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
        return encoded_frames

    start_time = time.time()
    try:
        with mp.Pool(processes=num_processes) as pool:
            results = pool.map(encode_image_parallel, frames)

        encoded_frames = []
        for result in results:
            if result:
                encoded_frames.extend(result)

        total_time = time.time() - start_time
        print(f"[Parallel encoding] Parallel encoding completed, total time: {total_time:.3f}s, encoded {len(encoded_frames)} images")

        return encoded_frames

    except Exception as e:
        print(f"[Parallel encoding] Parallel processing failed, falling back to serial processing: {e}")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
        return encoded_frames


def encode_video(video, choose_fps=None):
    """Encode a video file."""
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    if hasattr(video, 'path'):
        video_path = video.path
    elif hasattr(video, 'file') and hasattr(video.file, 'path'):
        video_path = video.file.path
    elif hasattr(video, 'name'):
        video_path = video.name
    else:
        video_path = getattr(video, 'url', getattr(video, 'orig_name', str(video)))

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    video_duration = len(vr) / fps

    frame_idx = [i for i in range(0, len(vr))]

    effective_fps = choose_fps if choose_fps else 1

    if video_duration < DOUBLE_FRAME_DURATION and effective_fps <= 5:
        effective_fps = effective_fps * 2
        packing_nums = 2
        choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
    elif effective_fps * int(video_duration) <= MAX_NUM_FRAMES:
        packing_nums = 1
        choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
    else:
        packing_size = math.ceil(video_duration * effective_fps / MAX_NUM_FRAMES)
        if packing_size <= MAX_NUM_PACKING:
            choose_frames = round(video_duration * effective_fps)
            packing_nums = packing_size
        else:
            choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
            packing_nums = MAX_NUM_PACKING

    choose_idx = choose_frames

    frame_idx = np.array(uniform_sample(frame_idx, choose_idx))
    frames = vr.get_batch(frame_idx).asnumpy()

    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, TIME_SCALE)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
    frame_ts_id = frame_ts_id.astype(np.int32)

    assert len(frames) == len(frame_ts_id)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id.tolist(), packing_nums)

    print(f"[Performance] Starting image encoding, total {len(frames)} frames")

    if ENABLE_PARALLEL_ENCODING:
        print(f"[Image encoding] Using multi-process parallel encoding, CPU cores: {mp.cpu_count()}")
        encoded_frames = encode_images_parallel(frames, PARALLEL_PROCESSES)
    else:
        print("[Warning] Parallel encoding disabled, using serial processing")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)

    return encoded_frames, frame_ts_id_group
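# Worked example of the frame-budget arithmetic above (illustrative numbers):
# a 120 s video with choose_fps=5 wants 600 frames, over MAX_NUM_FRAMES=180,
# so packing_size = ceil(120 * 5 / 180) = 4 > MAX_NUM_PACKING=3, and the
# function falls back to choose_frames = 180 * 3 = 540 frames packed 3 per
# slot. A 60 s video at the same fps gives ceil(60 * 5 / 180) = 2 <= 3, so all
# 60 * 5 = 300 frames are kept with packing_nums = 2.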


# Response-processing functions
def parse_thinking_response(response_text):
    """Parse a response containing <think> tags, with support for streaming."""
    import re

    # Match complete thinking tags
    complete_think_pattern = r'<think>(.*?)</think>'
    thinking_matches = re.findall(complete_think_pattern, response_text, re.DOTALL)

    if thinking_matches:
        # Complete thinking tags are present
        thinking_content = "\n\n".join(thinking_matches).strip()
        print("thinking_content---:", thinking_content)
        formal_answer = re.sub(complete_think_pattern, '', response_text, flags=re.DOTALL).strip()
        return thinking_content, formal_answer
    else:
        # Check for an unfinished thinking tag
        partial_think_match = re.search(r'<think>(.*?)$', response_text, re.DOTALL)
        if partial_think_match:
            # An opening tag without a closing tag means the thinking content is
            # still streaming; return a special marker for that state
            return "STREAMING", ""
        else:
            # No thinking tags; return the original text as the formal answer
            return "", response_text.strip()


def parse_thinking_response_for_final(response_text):
    """Final parse of a thinking response, used for formatting on completion."""
    import re

    # First try to match complete thinking tags
    think_pattern = r'<think>(.*?)</think>'
    thinking_matches = re.findall(think_pattern, response_text, re.DOTALL)

    if thinking_matches:
        thinking_content = "\n\n".join(thinking_matches).strip()
        formal_answer = re.sub(think_pattern, '', response_text, flags=re.DOTALL).strip()
        print(f"[parse_final] Found complete thinking tags, thinking length: {len(thinking_content)}, answer length: {len(formal_answer)}")
    else:
        # No complete tags; check for an unclosed <think> tag
        if '<think>' in response_text:
            think_start = response_text.find('<think>')
            if think_start != -1:
                # Extract the thinking content (everything after <think> to the end of the string)
                thinking_content = response_text[think_start + 7:].strip()  # skip past <think>
                # formal_answer is the content before <think>
                formal_answer = response_text[:think_start].strip()

                # If formal_answer is empty, the whole response was thinking content
                if not formal_answer:
                    formal_answer = ""  # no formal answer

                print("[parse_final] Found an unclosed thinking tag")
                print(f"[parse_final] thinking content: '{thinking_content[:100]}...'")
                print(f"[parse_final] formal_answer: '{formal_answer[:100]}...'")
            else:
                thinking_content = ""
                formal_answer = response_text.strip()
                print(f"[parse_final] No thinking tags, answer length: {len(formal_answer)}")
        else:
            thinking_content = ""
            formal_answer = response_text.strip()
            print(f"[parse_final] No thinking tags, answer length: {len(formal_answer)}")

    return thinking_content, formal_answer


def normalize_text_for_html(text):
    """Lightweight text normalization."""
    import re

    if not text:
        return ""

    text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
    lines = [line.strip() for line in text.split("\n")]
    text = "\n".join(lines)
    text = text.strip()
    return text


def format_response_with_thinking(thinking_content, formal_answer):
    """Format a response that contains a thinking process."""
    print(f"[format_thinking] thinking_content length: {len(thinking_content) if thinking_content else 0}")
    print(f"[format_thinking] formal_answer length: {len(formal_answer) if formal_answer else 0}")
    print(f"[format_thinking] thinking_content first 100 chars: '{thinking_content[:100] if thinking_content else 'None'}...'")
    print(f"[format_thinking] formal_answer first 100 chars: '{formal_answer[:100] if formal_answer else 'None'}...'")

    # Check for empty content
    if not thinking_content and not formal_answer:
        print("[format_thinking] Warning: both thinking_content and formal_answer are empty!")
    elif not formal_answer:
        print("[format_thinking] Warning: formal_answer is empty!")
    elif not thinking_content:
        print("[format_thinking] Note: thinking_content is empty, using the simplified format")

    # Add a unique ID to force the frontend to re-render
    unique_id = uuid.uuid4().hex[:8]

    # If there is thinking content, show the full thinking layout
    if thinking_content and thinking_content.strip():
        formatted_response = f"""
<div class="response-container" id="response-{unique_id}">
<div class="thinking-section">
<div class="thinking-header">🤔 think</div>
<div class="thinking-content">{thinking_content}</div>
</div>
<div class="formal-section">
<div class="formal-header">💡 answer</div>
<div class="formal-content">{formal_answer if formal_answer else '(no formal answer)'}</div>
</div>
</div>
"""
    else:
        # No thinking content; show the answer directly
        content_to_show = formal_answer if formal_answer and formal_answer.strip() else "(empty answer)"
        formatted_response = f"""
<div class="response-container" id="response-{unique_id}">
<div class="formal-section">
<div class="formal-content">{content_to_show}</div>
</div>
</div>
"""

    return "\n" + formatted_response.strip() + "\n"


def check_mm_type(mm_file):
    """Check the type of a multimedia file."""
    if hasattr(mm_file, 'path'):
        path = mm_file.path
    elif hasattr(mm_file, 'file') and hasattr(mm_file.file, 'path'):
        path = mm_file.file.path
    elif hasattr(mm_file, 'name'):
        path = mm_file.name
    else:
        path = getattr(mm_file, 'url', getattr(mm_file, 'orig_name', str(mm_file)))

    if is_image(path):
        return "image"
    if is_video(path):
        return "video"
    return None


def encode_mm_file(mm_file, choose_fps=None):
    """Encode a multimedia file."""
    if check_mm_type(mm_file) == 'image':
        return encode_image(mm_file), None
    if check_mm_type(mm_file) == 'video':
        encoded_frames, frame_ts_id_group = encode_video(mm_file, choose_fps)
        return encoded_frames, frame_ts_id_group
    return None, None


def encode_message(_question, choose_fps=None):
    """Encode a message."""
    import re

    files = _question.files if _question.files else []
    question = _question.text if _question.text else ""
    message = []
    temporal_ids = []

    # Check whether the legacy placeholder format is used
    pattern = r"\[mm_media\]\d+\[/mm_media\]"
    if re.search(pattern, question):
        # Legacy format: placeholders in the text
        matches = re.split(pattern, question)

        if len(matches) != len(files) + 1:
            gr.Warning("Number of Images not match the placeholder in text, please refresh the page to restart!")
            # Handle the mismatch instead of using an assert
            if len(matches) > len(files) + 1:
                matches = matches[:len(files) + 1]
            else:
                while len(matches) < len(files) + 1:
                    matches.append("")

        text = matches[0].strip()
        if text:
            message.append({"type": "text", "pairs": text})

        for i in range(len(files)):
            encoded_content, frame_ts_id_group = encode_mm_file(files[i], choose_fps)
            if encoded_content:
                message += encoded_content
                if frame_ts_id_group:
                    temporal_ids.extend(frame_ts_id_group)

            if i + 1 < len(matches):
                text = matches[i + 1].strip()
                if text:
                    message.append({"type": "text", "pairs": text})
    else:
        # New format: plain text plus a file list
        if question.strip():
            message.append({"type": "text", "pairs": question.strip()})

        for file in files:
            encoded_content, frame_ts_id_group = encode_mm_file(file, choose_fps)
            if encoded_content:
                message += encoded_content
                if frame_ts_id_group:
                    temporal_ids.extend(frame_ts_id_group)

    return message, temporal_ids if temporal_ids else None
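# Resulting structure for the text "describe this" plus one image and one video
# (illustrative; base64 strings abbreviated, one dict per sampled video frame):
#
#   message = [
#       {"type": "text",  "pairs": "describe this"},
#       {"type": "image", "pairs": "iVBORw0K..."},   # the uploaded image
#       {"type": "image", "pairs": "iVBORw0K..."},   # video frame 1, ...
#   ]
#   temporal_ids = [[0, 5], [10, 15], ...]           # packed 0.1 s timestamp ids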


def check_has_videos(_question):
    """Count images and videos among the uploaded files."""
    images_cnt = 0
    videos_cnt = 0
    files = _question.files if _question.files else []
    for file in files:
        if check_mm_type(file) == "image":
            images_cnt += 1
        else:
            videos_cnt += 1
    return images_cnt, videos_cnt


def save_media_to_persistent_cache(_question, session_id):
    """Save images and videos to the persistent cache and return the saved path info."""
    import os
    import shutil
    import uuid
    from pathlib import Path

    files = _question.files if _question.files else []
    saved_media = []

    # Create a session-specific media cache directory
    cache_dir = Path("./media_cache") / session_id
    cache_dir.mkdir(parents=True, exist_ok=True)

    for file in files:
        file_type = check_mm_type(file)
        if file_type in ["image", "video"]:
            try:
                # Get the original file path
                original_path = None
                if hasattr(file, 'name'):
                    original_path = file.name
                elif hasattr(file, 'path'):
                    original_path = file.path
                elif hasattr(file, 'file') and hasattr(file.file, 'path'):
                    original_path = file.file.path
                else:
                    continue

                if original_path and os.path.exists(original_path):
                    # Generate a unique filename
                    file_ext = os.path.splitext(original_path)[1]
                    prefix = "img" if file_type == "image" else "vid"
                    unique_filename = f"{prefix}_{uuid.uuid4().hex[:8]}{file_ext}"
                    cached_path = cache_dir / unique_filename

                    # Copy the file into the cache directory
                    shutil.copy2(original_path, cached_path)

                    saved_media.append({
                        'type': file_type,
                        'original_path': original_path,
                        'cached_path': str(cached_path),
                        'filename': unique_filename
                    })
                    print(f"[save_media_to_persistent_cache] {file_type} saved: {cached_path}")
            except Exception as e:
                print(f"[save_media_to_persistent_cache] Failed to save {file_type}: {e}")
                continue

    return saved_media


def format_user_message_with_files(_question, session_id=None):
    """Format a user message containing files, supporting image and video display."""
    user_text = _question.text if _question.text else ""
    files = _question.files if _question.files else []

    if not files:
        return user_text, []

    # Save media files to the persistent cache
    saved_media = []
    if session_id:
        saved_media = save_media_to_persistent_cache(_question, session_id)

    if len(files) == 1:
        file = files[0]
        file_type = check_mm_type(file)

        # The file is an image or video and has been saved to the cache
        if file_type in ["image", "video"] and saved_media:
            media_info = saved_media[0]
            if file_type == "image":
                if user_text:
                    return f"🖼️ {user_text}", saved_media
                else:
                    return "🖼️ Image", saved_media
            elif file_type == "video":
                if user_text:
                    return f"🎬 {user_text}", saved_media
                else:
                    return "🎬 Video", saved_media
        else:
            # Other file types; fall back to a text description
            return f"[1 file uploaded] {user_text}", saved_media
    else:
        # Multiple files; count each type
        image_count = len([m for m in saved_media if m['type'] == 'image'])
        video_count = len([m for m in saved_media if m['type'] == 'video'])
        other_count = len(files) - image_count - video_count

        # Build the description text
        parts = []
        if image_count > 0:
            parts.append(f"{image_count} image{'s' if image_count > 1 else ''}")
        if video_count > 0:
            parts.append(f"{video_count} video{'s' if video_count > 1 else ''}")
        if other_count > 0:
            parts.append(f"{other_count} other file{'s' if other_count > 1 else ''}")

        if parts:
            files_desc = ", ".join(parts)
            return f"[{files_desc} uploaded] {user_text}", saved_media
        else:
            return f"[{len(files)} files uploaded] {user_text}", saved_media


def update_media_gallery(app_session):
    """Update the media gallery display (images and videos)."""
    import os
    media_cache = app_session.get('media_cache', [])

    if not media_cache:
        return gr.update(value=[], visible=False)

    # Collect paths of all cached media files (both images and videos are supported)
    media_paths = [media_info['cached_path'] for media_info in media_cache if os.path.exists(media_info['cached_path'])]

    if media_paths:
        return gr.update(value=media_paths, visible=True)
    else:
        return gr.update(value=[], visible=False)


def format_fewshot_user_message(image_path, user_text):
    """Format a few-shot user message, with image display support."""
    if image_path and user_text:
        return (user_text, image_path)
    elif image_path:
        return ("", image_path)
    else:
        return user_text


# Main chat functions
def chat_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
    """Call the model directly for chat (non-streaming)."""
    default_params = {"num_beams": 3, "repetition_penalty": 1.2, "max_new_tokens": 16284}
    if params is None:
        params = default_params

    use_streaming = params.get('stream', False)

    if use_streaming:
        return chat_stream_direct(img_b64, msgs, ctx, params, vision_hidden_states, temporal_ids, session_id)
    else:
        # Build the request payload
        query = {
            "image": img_b64,
            "question": json.dumps(msgs, ensure_ascii=True),
            "params": json.dumps(params, ensure_ascii=True),
        }

        if temporal_ids:
            query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)

        if session_id:
            query["session_id"] = session_id

        try:
            # Call the model directly
            result = global_model.handler(query)
            raw_result = result['result']

            # Clean up the result
            import re
            cleaned_result = re.sub(r'(<box>.*</box>)', '', raw_result)
            cleaned_result = cleaned_result.replace('<ref>', '')
            cleaned_result = cleaned_result.replace('</ref>', '')
            cleaned_result = cleaned_result.replace('<box>', '')
            cleaned_result = cleaned_result.replace('</box>', '')

            # Parse the thinking process
            thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
            thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
            formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
            formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)

            context_result = formal_answer_raw if formal_answer_raw else cleaned_result
            return 0, formatted_result, context_result, None

        except Exception as e:
            print(f"Chat error: {e}")
            import traceback
            traceback.print_exc()
            return -1, ERROR_MSG, None, None


def chat_stream_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
    """Call the model directly for streaming chat."""
    try:
        # Build the request payload
        query = {
            "image": img_b64,
            "question": json.dumps(msgs, ensure_ascii=True),
            "params": json.dumps(params, ensure_ascii=True),
        }

        if temporal_ids:
            query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)

        if session_id:
            query["session_id"] = session_id

        # Call the streaming handler directly
        generator = global_model.stream_handler(query)

        full_response = ""
        for chunk in generator:
            full_response += chunk

        if not full_response:
            return -1, ERROR_MSG, None, None

        # Clean up the result
        import re
        cleaned_result = re.sub(r'(<box>.*</box>)', '', full_response)
        cleaned_result = cleaned_result.replace('<ref>', '')
        cleaned_result = cleaned_result.replace('</ref>', '')
        cleaned_result = cleaned_result.replace('<box>', '')
        cleaned_result = cleaned_result.replace('</box>', '')

        # Parse the thinking process
        thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
        thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
        formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
        formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)

        context_result = formal_answer_raw if formal_answer_raw else cleaned_result
        return 0, formatted_result, context_result, None

    except Exception as e:
        print(f"Stream chat error: {e}")
        import traceback
        traceback.print_exc()
        return -1, ERROR_MSG, None, None


def chat_stream_character_generator(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, stop_control=None, session_id=None):
    """Character-level streaming generator."""
    print("[chat_stream_character_generator] Starting character-level streaming")
    print(f"[chat_stream_character_generator] stop_control: {stop_control}")

    try:
        # Build the request payload
        query = {
            "image": img_b64,
            "question": json.dumps(msgs, ensure_ascii=True),
            "params": json.dumps(params, ensure_ascii=True),
        }

        if temporal_ids:
            query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)

        if session_id:
            query["session_id"] = session_id

        # Call the streaming handler; it now returns the full response rather than a generator
        full_response = global_model.stream_handler(query)

        # Clean up the response
        import re
        clean_response = re.sub(r'(<box>.*</box>)', '', full_response)
        clean_response = clean_response.replace('<ref>', '')
        clean_response = clean_response.replace('</ref>', '')
        clean_response = clean_response.replace('<box>', '')
        clean_response = clean_response.replace('</box>', '')

        # Yield character by character to simulate streaming output
        char_count = 0
        for char in clean_response:
            # Check the stop flag
            if stop_control and stop_control.get('stop_streaming', False):
                print(f"[chat_stream_character_generator] *** Stop signal received at character {char_count} ***")
                break

            char_count += 1
            if char_count % 10 == 0:
                print(f"[chat_stream_character_generator] {char_count} characters emitted, stop_flag: {stop_control.get('stop_streaming', False) if stop_control else 'None'}")

            yield char

            # Small delay to simulate a streaming effect
            time.sleep(0.01)

        print(f"[chat_stream_character_generator] Streaming finished, {char_count} characters emitted in total")

    except Exception as e:
        print(f"[chat_stream_character_generator] Exception: {e}")
        error_msg = f"Stream error: {str(e)}"
        for char in error_msg:
            yield char
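# Consumption sketch (hypothetical values): the UI drains this generator one
# character at a time and can cancel mid-stream via the shared dict.
#
#   control = {'stop_streaming': False}
#   for ch in chat_stream_character_generator("", context, None, params,
#                                             None, None, control, "abc123"):
#       render(ch)  # render() is a stand-in for the actual Gradio update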
|
| 907 |
+
|
| 908 |
+
|
| 909 |
+
# UI组件创建函数
|
| 910 |
+
def create_component(params, comp='Slider'):
|
| 911 |
+
if comp == 'Slider':
|
| 912 |
+
return gr.Slider(
|
| 913 |
+
minimum=params['minimum'],
|
| 914 |
+
maximum=params['maximum'],
|
| 915 |
+
value=params['value'],
|
| 916 |
+
step=params['step'],
|
| 917 |
+
interactive=params['interactive'],
|
| 918 |
+
label=params['label']
|
| 919 |
+
)
|
| 920 |
+
elif comp == 'Radio':
|
| 921 |
+
return gr.Radio(
|
| 922 |
+
choices=params['choices'],
|
| 923 |
+
value=params['value'],
|
| 924 |
+
interactive=params['interactive'],
|
| 925 |
+
label=params['label']
|
| 926 |
+
)
|
| 927 |
+
elif comp == 'Button':
|
| 928 |
+
return gr.Button(
|
| 929 |
+
value=params['value'],
|
| 930 |
+
interactive=True
|
| 931 |
+
)
|
| 932 |
+
elif comp == 'Checkbox':
|
| 933 |
+
return gr.Checkbox(
|
| 934 |
+
value=params['value'],
|
| 935 |
+
interactive=params['interactive'],
|
| 936 |
+
label=params['label'],
|
| 937 |
+
info=params.get('info', None)
|
| 938 |
+
)
|
| 939 |
+
|
| 940 |
+
|
| 941 |
+
def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
|
| 942 |
+
# 使用标准的 Gradio 组件替代 MultimodalInput,添加预览功能
|
| 943 |
+
return gr.File(
|
| 944 |
+
file_count="multiple",
|
| 945 |
+
file_types=["image", "video"],
|
| 946 |
+
label="Upload Images/Videos",
|
| 947 |
+
interactive=not (upload_image_disabled and upload_video_disabled),
|
| 948 |
+
show_label=True,
|
| 949 |
+
height=200 # 设置高度以显示预览
|
| 950 |
+
)
|
| 951 |
+
|
| 952 |
+
|
| 953 |
+
# UI控制函数
|
| 954 |
+
def update_streaming_mode_state(params_form):
|
| 955 |
+
"""根据解码类型更新流式模式状态"""
|
| 956 |
+
if params_form == 'Beam Search':
|
| 957 |
+
return gr.update(value=False, interactive=False, info="Beam Search mode does not support streaming output")
|
| 958 |
+
else:
|
| 959 |
+
return gr.update(value=True, interactive=True, info="Enable real-time streaming response")
|
| 960 |
+
|
| 961 |
+
|
| 962 |
+
def stop_streaming(_app_cfg):
|
| 963 |
+
"""停止流式输出"""
|
| 964 |
+
_app_cfg['stop_streaming'] = True
|
| 965 |
+
print(f"[stop_streaming] Set stop flag to True")
|
| 966 |
+
return _app_cfg
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
def reset_stop_flag(_app_cfg):
|
| 970 |
+
"""重置停止标志"""
|
| 971 |
+
_app_cfg['stop_streaming'] = False
|
| 972 |
+
print(f"[reset_stop_flag] Reset stop flag to False")
|
| 973 |
+
return _app_cfg
|
| 974 |
+
|
| 975 |
+
|
| 976 |
+
def check_and_handle_stop(_app_cfg, context="unknown"):
|
| 977 |
+
"""检查停止标志"""
|
| 978 |
+
should_stop = _app_cfg.get('stop_streaming', False)
|
| 979 |
+
is_streaming = _app_cfg.get('is_streaming', False)
|
| 980 |
+
|
| 981 |
+
if should_stop:
|
| 982 |
+
print(f"[check_and_handle_stop] *** Stop signal detected at {context} ***")
|
| 983 |
+
print(f"[check_and_handle_stop] stop_streaming: {should_stop}, is_streaming: {is_streaming}")
|
| 984 |
+
return True
|
| 985 |
+
return False
|
| 986 |
+
|
| 987 |
+
|
| 988 |
+
def stop_button_clicked(_app_cfg):
|
| 989 |
+
"""处理停止按钮点击"""
|
| 990 |
+
print("[stop_button_clicked] *** Stop button clicked ***")
|
| 991 |
+
print(f"[stop_button_clicked] Current state - is_streaming: {_app_cfg.get('is_streaming', False)}")
|
| 992 |
+
print(f"[stop_button_clicked] Current state - stop_streaming: {_app_cfg.get('stop_streaming', False)}")
|
| 993 |
+
|
| 994 |
+
_app_cfg['stop_streaming'] = True
|
| 995 |
+
_app_cfg['is_streaming'] = False
|
| 996 |
+
print(f"[stop_button_clicked] Set stop_streaming = True, is_streaming = False")
|
| 997 |
+
|
| 998 |
+
return _app_cfg, gr.update(visible=False)
|
| 999 |
+
|
| 1000 |
+
|
| 1001 |
+
# 主要的响应函数
|
| 1002 |
+
def respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
|
| 1003 |
+
"""流式响应生成器"""
|
| 1004 |
+
print(f"[respond_stream] Called with streaming_mode: {streaming_mode}, fps_setting: {fps_setting}")
|
| 1005 |
+
|
| 1006 |
+
_app_cfg['is_streaming'] = True
|
| 1007 |
+
_app_cfg['stop_streaming'] = False
|
| 1008 |
+
|
| 1009 |
+
if params_form == 'Beam Search':
|
| 1010 |
+
streaming_mode = False
|
| 1011 |
+
print(f"[respond_stream] Beam Search模式,强制禁用流式模式")
|
| 1012 |
+
_app_cfg['is_streaming'] = False
|
| 1013 |
+
|
| 1014 |
+
_context = _app_cfg['ctx'].copy()
|
| 1015 |
+
encoded_message, temporal_ids = encode_message(_question, fps_setting)
|
| 1016 |
+
_context.append({'role': 'user', 'contents': encoded_message})
|
| 1017 |
+
|
| 1018 |
+
images_cnt = _app_cfg['images_cnt']
|
| 1019 |
+
videos_cnt = _app_cfg['videos_cnt']
|
| 1020 |
+
files_cnts = check_has_videos(_question)
|
| 1021 |
+
|
| 1022 |
+
if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
|
| 1023 |
+
gr.Warning("Only supports single video file input right now!")
|
| 1024 |
+
yield create_multimodal_input(True, True), _chat_bot, _app_cfg, gr.update(visible=False)
|
| 1025 |
+
return
|
| 1026 |
+
|
| 1027 |
+
if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
|
| 1028 |
+
gr.Warning("Please chat with at least one image or video.")
|
| 1029 |
+
yield create_multimodal_input(False, False), _chat_bot, _app_cfg, gr.update(visible=False)
|
| 1030 |
+
return
|
| 1031 |
+
|
| 1032 |
+
if params_form == 'Beam Search':
|
| 1033 |
+
params = {
|
| 1034 |
+
'sampling': False,
|
| 1035 |
+
'num_beams': 3,
|
| 1036 |
+
'repetition_penalty': 1.2,
|
| 1037 |
+
"max_new_tokens": 16284,
|
| 1038 |
+
"enable_thinking": thinking_mode,
|
| 1039 |
+
"stream": False
|
| 1040 |
+
}
|
| 1041 |
+
else:
|
| 1042 |
+
params = {
|
| 1043 |
+
'sampling': True,
|
| 1044 |
+
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16384,
            "enable_thinking": thinking_mode,
            "stream": streaming_mode
        }

    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 2048 * 10
        params["use_image_id"] = False
        params["max_slice_nums"] = 1

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]

    # Build the user message for display (streaming mode)
    user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))

    # Save media info into the session state
    if saved_images:
        if 'media_cache' not in _app_cfg:
            _app_cfg['media_cache'] = []
        _app_cfg['media_cache'].extend(saved_images)

    _chat_bot.append((user_message, ""))
    _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})

    gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0

    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)

    print("[respond_stream] Starting character-level streaming loop")
    char_count = 0
    accumulated_content = ""

    for _char in gen:
        char_count += 1

        if check_and_handle_stop(_app_cfg, f"char {char_count}"):
            break

        accumulated_content += _char
        _context[-1]["contents"][0]["pairs"] += _char

        # Show content in real time (thinking mode also streams live)
        if thinking_mode:
            # Try to parse the accumulated content so far
            thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)

            # If a complete thinking block was parsed, use the formatted display
            if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
                thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
                formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
                formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
                _chat_bot[-1] = (user_message, formatted_display)
            else:
                # Still thinking, or no complete tags yet: show the raw content as it streams
                _chat_bot[-1] = (user_message, accumulated_content)
        else:
            # Non-thinking mode: show the accumulated content directly
            _chat_bot[-1] = (user_message, accumulated_content)

        if char_count % 5 == 0:  # update frequently for a smoother streaming experience
            print(f"[respond_stream] Processed {char_count} chars, stop_flag: {_app_cfg.get('stop_streaming', False)}")
            yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
            time.sleep(0.02)  # small delay to avoid overly frequent updates
        else:
            yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)

    if _app_cfg.get('stop_streaming', False):
        print("[respond_stream] Streaming stopped")

    # Final pass of thinking formatting
    final_content = accumulated_content
    if thinking_mode:
        thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(final_content)
        thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
        formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
        formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)

        _chat_bot[-1] = (user_message, formatted_result)
        _context[-1]["contents"][0]["pairs"] = formal_answer_raw if formal_answer_raw else final_content
    else:
        _chat_bot[-1] = (user_message, final_content)
        _context[-1]["contents"][0]["pairs"] = final_content

    _app_cfg['ctx'] = _context
    _app_cfg['images_cnt'] = images_cnt
    _app_cfg['videos_cnt'] = videos_cnt
    _app_cfg['is_streaming'] = False

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
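
# respond_stream follows Gradio's streaming-generator pattern: every `yield`
# pushes a fresh (input, chatbot, state, stop-button) tuple to the frontend,
# and batching updates every 5 characters trades a little latency for far
# fewer re-renders. A minimal standalone sketch of the same pattern (names
# here are illustrative, not part of this app):
#
#     def stream_reply(history):
#         history.append(("hi", ""))
#         for token in ["Hel", "lo", "!"]:
#             history[-1] = (history[-1][0], history[-1][1] + token)
#             yield history  # Gradio re-renders the Chatbot on each yield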


def respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """Main response entry point."""
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[session] Generated session_id for existing session: {_app_cfg['session_id']}")

    # Track changes of the thinking-mode flag
    prev_thinking_mode = _app_cfg.get('last_thinking_mode', False)
    _app_cfg['last_thinking_mode'] = thinking_mode

    if prev_thinking_mode != thinking_mode:
        print(f"[respond] Thinking mode switched: {prev_thinking_mode} -> {thinking_mode}")
        # Force-clear any cached thinking state
        if 'thinking_cache' in _app_cfg:
            del _app_cfg['thinking_cache']
        # Additional state reset
        if thinking_mode and not prev_thinking_mode:
            print("[respond] Thinking mode enabled, resetting related state")
            _app_cfg['thinking_enabled'] = True
        elif not thinking_mode and prev_thinking_mode:
            print("[respond] Thinking mode disabled")
            _app_cfg['thinking_enabled'] = False

    if params_form == 'Beam Search':
        streaming_mode = False
        print("[respond] Beam Search selected, forcing streaming mode off")

    if streaming_mode:
        print("[respond] Using streaming mode")
        yield from respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting)
        return

    # Non-streaming mode
    _context = _app_cfg['ctx'].copy()
    encoded_message, temporal_ids = encode_message(_question, fps_setting)
    _context.append({'role': 'user', 'contents': encoded_message})

    images_cnt = _app_cfg['images_cnt']
    videos_cnt = _app_cfg['videos_cnt']
    files_cnts = check_has_videos(_question)
    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
        gr.Warning("Only a single video file is supported right now!")
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
        return
    if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
        gr.Warning("Please chat with at least one image or video.")
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
        return

    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16384,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16384,
            "enable_thinking": thinking_mode,
            "stream": False
        }
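
    # 'sampling': False plus num_beams=3 runs beam search, which decodes whole
    # beams rather than a left-to-right token stream -- that is why streaming
    # is forced off for Beam Search above. The sampling branch combines
    # nucleus (top_p) and top_k sampling with a light repetition penalty, and
    # max_new_tokens=16384 matches the hard cap enforced in models/minicpmv4_5.py.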

    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 2048 * 10
        params["use_image_id"] = False
        params["max_slice_nums"] = 1

    # Call the chat function
    code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]

    if code == 0:
        context_content = _context_answer if _context_answer else _answer
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})

        # Apply thinking formatting only when thinking_mode is on
        if thinking_mode:
            thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(_answer)
            thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
            formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
            print(f"[respond] Non-streaming - thinking_mode: {thinking_mode}, thinking_content: '{thinking_content_raw[:50]}...'")
            formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
        else:
            print(f"[respond] Non-streaming - thinking_mode: {thinking_mode}, using the raw answer")
            formatted_result = _answer

        # Build the user message for display
        user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))

        # Save media info into the session state
        if saved_images:
            if 'media_cache' not in _app_cfg:
                _app_cfg['media_cache'] = []
            _app_cfg['media_cache'].extend(saved_images)

        _chat_bot.append((user_message, formatted_result))

        _app_cfg['ctx'] = _context
        _app_cfg['sts'] = sts
    else:
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": "Error occurred during processing"}]})
        # Build the user message for display (error path)
        user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))

        # Save media info into the session state
        if saved_images:
            if 'media_cache' not in _app_cfg:
                _app_cfg['media_cache'] = []
            _app_cfg['media_cache'].extend(saved_images)

        _chat_bot.append((user_message, "Error occurred during processing"))

    _app_cfg['images_cnt'] = images_cnt
    _app_cfg['videos_cnt'] = videos_cnt
    _app_cfg['is_streaming'] = False

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0

    # Yield the result so this path stays compatible with the streaming one
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
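
# Conversation state lives in _app_cfg['ctx'] as a list of
# {"role": ..., "contents": [...]} dicts, where each content item is
# {"type": "text"|"image", "pairs": ...}; models/minicpmv4_5.py later flattens
# this into the msgs format expected by model.chat(). A minimal two-turn
# context (illustrative values):
#
#     ctx = [
#         {"role": "user", "contents": [{"type": "text", "pairs": "Hi"}]},
#         {"role": "assistant", "contents": [{"type": "text", "pairs": "Hello!"}]},
#     ]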


# FewShot helpers
def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[session] Generated session_id for FewShot demonstration: {_app_cfg['session_id']}")

    ctx = _app_cfg["ctx"]

    # Build the user message
    user_msg = ""
    if _image is not None:
        image = Image.open(_image).convert("RGB")
        ctx.append({"role": "user", "contents": [
            *encode_image(image),
            {"type": "text", "pairs": _user_message}
        ]})
        user_msg = f"[Image uploaded] {_user_message}"
    else:
        if _user_message:
            ctx.append({"role": "user", "contents": [{"type": "text", "pairs": _user_message}]})
            user_msg = _user_message

    # Build the assistant message
    if _assistant_message:
        ctx.append({"role": "assistant", "contents": [{"type": "text", "pairs": _assistant_message}]})

    # Only add to the chat log when both user and assistant messages exist
    if user_msg and _assistant_message:
        formatted_user_msg = format_fewshot_user_message(_image, _user_message) if _image else user_msg
        _chat_bot.append([formatted_user_msg, _assistant_message])

    return None, "", "", _chat_bot, _app_cfg
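
# Few-shot demonstrations are stored as ordinary user/assistant turns in
# _app_cfg["ctx"], so the model simply conditions on them as prior dialogue:
# adding two demonstrations before calling fewshot_respond gives the model
# two worked examples ahead of the real query.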


def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """FewShot response function."""
    print(f"[fewshot_respond] Called with streaming_mode: {streaming_mode}")

    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[session] Generated session_id for FewShot session: {_app_cfg['session_id']}")

    if params_form == 'Beam Search':
        streaming_mode = False
        print("[fewshot_respond] Beam Search selected, forcing streaming mode off")

    user_message_contents = []
    _context = _app_cfg["ctx"].copy()
    images_cnt = _app_cfg["images_cnt"]
    temporal_ids = None

    if _image:
        image = Image.open(_image).convert("RGB")
        user_message_contents += encode_image(image)
        images_cnt += 1
    if _user_message:
        user_message_contents += [{"type": "text", "pairs": _user_message}]
    if user_message_contents:
        _context.append({"role": "user", "contents": user_message_contents})

    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16384,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16384,
            "enable_thinking": thinking_mode,
            "stream": streaming_mode
        }

    if disable_text_only and images_cnt == 0:
        gr.Warning("Please chat with at least one image or video.")
        yield _image, _user_message, '', _chat_bot, _app_cfg
        return

    if streaming_mode:
        print("[fewshot_respond] Using streaming mode")
        _app_cfg['is_streaming'] = True
        _app_cfg['stop_streaming'] = False

        if _image:
            user_msg = format_fewshot_user_message(_image, _user_message)
            _chat_bot.append([user_msg, ""])
        else:
            _chat_bot.append([_user_message, ""])

        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})

        _app_cfg['stop_streaming'] = False

        gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])

        yield _image, _user_message, '', _chat_bot, _app_cfg

        accumulated_content = ""
        for _char in gen:
            if _app_cfg.get('stop_streaming', False):
                print("[fewshot_respond] Stop signal received, aborting streaming response")
                break

            accumulated_content += _char
            _context[-1]["contents"][0]["pairs"] += _char

            # Parse and format thinking content in real time
            if thinking_mode:
                # Try to parse the accumulated content so far
                thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)

                # If a complete thinking block was parsed, use the formatted display
                if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
                    thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
                    formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
                    formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
                    _chat_bot[-1] = (_chat_bot[-1][0], formatted_display)
                else:
                    # Still thinking, or no complete tags yet: show the raw content as it streams
                    _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
            else:
                # Non-thinking mode: show the accumulated content directly
                _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)

            yield _image, _user_message, '', _chat_bot, _app_cfg

        final_content = _context[-1]["contents"][0]["pairs"]

        _app_cfg['ctx'] = _context
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['is_streaming'] = False

        yield _image, '', '', _chat_bot, _app_cfg

    else:
        # Non-streaming mode
        code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])

        context_content = _context_answer if _context_answer else _answer
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})

        if _image:
            user_msg = format_fewshot_user_message(_image, _user_message)
            _chat_bot.append([user_msg, _answer])
        else:
            _chat_bot.append([_user_message, _answer])

        if code == 0:
            _app_cfg['ctx'] = _context
            _app_cfg['sts'] = sts
            _app_cfg['images_cnt'] = images_cnt

        _app_cfg['is_streaming'] = False
        yield None, '', '', _chat_bot, _app_cfg
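
# Note the output arity: fewshot_respond yields 5-tuples
# (image, user_message, assistant_message, chat_bot, app_session), matching
# the five components wired to generate_button.click in create_app below,
# whereas respond() yields 4-tuples for the Chat tab.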


# Other UI functions
def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    print(f"[regenerate] streaming_mode: {streaming_mode}")
    print(f"[regenerate] thinking_mode: {thinking_mode}")
    print(f"[regenerate] chat_type: {_app_cfg.get('chat_type', 'unknown')}")

    if params_form == 'Beam Search':
        streaming_mode = False
        print("[regenerate] Beam Search selected, forcing streaming mode off")

    if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
        gr.Warning('No question for regeneration.')
        yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
        return

    if _app_cfg["chat_type"] == "Chat":
        images_cnt = _app_cfg['images_cnt']
        videos_cnt = _app_cfg['videos_cnt']
        _question = _chat_bot[-1][0]
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
        files_cnts = check_has_videos(_question)
        images_cnt -= files_cnts[0]
        videos_cnt -= files_cnts[1]
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['videos_cnt'] = videos_cnt

        print(f"[regenerate] About to call respond with streaming_mode: {streaming_mode}")
        for result in respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
            new_input, _chat_bot, _app_cfg, _stop_button = result
            _question = new_input
            yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
    else:
        # With the tuples chatbot format, _chat_bot[-1][0] is a plain string
        last_user_message = _chat_bot[-1][0]
        last_image = None

        # Check whether the message carries an image marker
        if "[Image uploaded]" in last_user_message:
            # Extract the actual user message from the marker
            last_user_message = last_user_message.replace("[Image uploaded] ", "")
            # Note: with the simplified tuples format the image file itself
            # cannot be recovered here; handle this as needed.
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]

        print(f"[regenerate] About to call fewshot_respond with streaming_mode: {streaming_mode}")
        for result in fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
            _image, _user_message, _assistant_message, _chat_bot, _app_cfg = result
            yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg


def flushed():
    return gr.update(interactive=True)


def clear_media_cache(session_id):
    """Remove the media cache of the given session."""
    import shutil
    from pathlib import Path

    try:
        cache_dir = Path("./media_cache") / session_id
        if cache_dir.exists():
            shutil.rmtree(cache_dir)
            print(f"[clear_media_cache] Cleared media cache for session {session_id}")
    except Exception as e:
        print(f"[clear_media_cache] Failed to clear cache: {e}")


def clear(txt_input, file_upload, chat_bot, app_session):
    # Clear the previous session's media cache
    if 'session_id' in app_session:
        clear_media_cache(app_session['session_id'])

    chat_bot = copy.deepcopy(init_conversation)
    app_session['sts'] = None
    app_session['ctx'] = []
    app_session['images_cnt'] = 0
    app_session['videos_cnt'] = 0
    app_session['stop_streaming'] = False
    app_session['is_streaming'] = False
    app_session['media_cache'] = []  # reset cached media info
    app_session['last_thinking_mode'] = False  # reset thinking-mode state
    app_session['session_id'] = uuid.uuid4().hex[:16]
    print(f"[session] Generated new session ID: {app_session['session_id']}")
    return "", None, gr.update(value=[], visible=False), gr.update(value=[], visible=False), chat_bot, app_session, None, '', ''


def select_chat_type(_tab, _app_cfg):
    _app_cfg["chat_type"] = _tab
    return _app_cfg


# UI configuration
form_radio = {
    'choices': ['Beam Search', 'Sampling'],
    'value': 'Sampling',
    'interactive': True,
    'label': 'Decode Type'
}

thinking_checkbox = {
    'value': False,
    'interactive': True,
    'label': 'Enable Thinking Mode',
}

streaming_checkbox = {
    'value': True,
    'interactive': True,
    'label': 'Enable Streaming Mode',
}

fps_slider = {
    'minimum': 1,
    'maximum': 20,
    'value': 3,
    'step': 1,
    'interactive': True,
    'label': 'Custom FPS for Video Processing'
}

init_conversation = [
    ["", "You can talk to me now"]
]
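
# These dicts are passed to create_component(cfg, comp=...), which presumably
# forwards them as keyword arguments to the corresponding Gradio constructor
# (e.g. gr.Radio(**form_radio)); defining the widgets as data keeps the
# layout code in create_app() compact.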

css = """
video { height: auto !important; }
.example label { font-size: 16px; }

/* Current Media gallery scrollbar styling -- scoped via a class selector for safety */
.current-media-gallery {
    overflow-y: auto !important;
    max-height: 600px !important;
    position: relative !important;
}

/* Only affect the inside of this specific gallery container */
.current-media-gallery > div,
.current-media-gallery .gallery-container {
    overflow-y: auto !important;
    max-height: 580px !important;
}

.current-media-gallery .gallery-item {
    margin-bottom: 10px !important;
}

/* Custom scrollbar styling for the Current Media gallery only */
.current-media-gallery::-webkit-scrollbar,
.current-media-gallery > div::-webkit-scrollbar,
.current-media-gallery .gallery-container::-webkit-scrollbar {
    width: 8px !important;
}

.current-media-gallery::-webkit-scrollbar-track,
.current-media-gallery > div::-webkit-scrollbar-track,
.current-media-gallery .gallery-container::-webkit-scrollbar-track {
    background: #f1f1f1 !important;
    border-radius: 4px !important;
}

.current-media-gallery::-webkit-scrollbar-thumb,
.current-media-gallery > div::-webkit-scrollbar-thumb,
.current-media-gallery .gallery-container::-webkit-scrollbar-thumb {
    background: #c1c1c1 !important;
    border-radius: 4px !important;
}

.current-media-gallery::-webkit-scrollbar-thumb:hover,
.current-media-gallery > div::-webkit-scrollbar-thumb:hover,
.current-media-gallery .gallery-container::-webkit-scrollbar-thumb:hover {
    background: #a8a8a8 !important;
}

/* Hide elements the Current Media gallery does not need */
.current-media-gallery .upload-container,
.current-media-gallery .drop-zone,
.current-media-gallery .file-upload,
.current-media-gallery .upload-text,
.current-media-gallery .drop-text {
    display: none !important;
}

.current-media-gallery .clear-button,
.current-media-gallery .delete-button,
.current-media-gallery .remove-button {
    display: none !important;
}

/* Hide the label and placeholder text when the gallery is empty */
.current-media-gallery:not([style*="display: none"]) .gallery-container:empty::after {
    content: "";
    display: none;
}

.current-media-gallery .empty-gallery-text,
.current-media-gallery .placeholder-text {
    display: none !important;
}

/* Make sure the scrollbar styling does not leak to other components */
.current-media-gallery {
    isolation: isolate !important;
}

/* Reset scrollbar styles on other galleries so they are not polluted */
.gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar {
    width: initial !important;
}

.gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-track {
    background: initial !important;
    border-radius: initial !important;
}

.gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-thumb {
    background: initial !important;
    border-radius: initial !important;
}

/* Keep the chatbot unaffected */
.thinking-chatbot::-webkit-scrollbar {
    width: initial !important;
}

.thinking-chatbot::-webkit-scrollbar-track {
    background: initial !important;
}

.thinking-chatbot::-webkit-scrollbar-thumb {
    background: initial !important;
}

/* Styles for the thinking process and the formal answer */
.response-container {
    margin: 10px 0;
}

.thinking-section {
    background: linear-gradient(135deg, #f8f9ff 0%, #f0f4ff 100%);
    border: 1px solid #d1d9ff;
    border-radius: 12px;
    padding: 16px;
    margin-bottom: 0px;
    box-shadow: 0 2px 8px rgba(67, 90, 235, 0.1);
}

.thinking-header {
    font-weight: 600;
    color: #4c5aa3;
    font-size: 14px;
    margin-bottom: 12px;
    display: flex;
    align-items: center;
    gap: 8px;
}

.thinking-content {
    color: #5a6ba8;
    font-size: 13px;
    line-height: 1;
    font-style: italic;
    background: rgba(255, 255, 255, 0.6);
    padding: 12px;
    border-radius: 8px;
    border-left: 3px solid #4c5aa3;
    white-space: pre-wrap;
}

.formal-section {
    background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%);
    border: 1px solid #e9ecef;
    border-radius: 12px;
    padding: 16px;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
}

.formal-header {
    font-weight: 600;
    color: #28a745;
    font-size: 14px;
    margin-bottom: 12px;
    display: flex;
    align-items: center;
    gap: 8px;
}

.formal-content {
    color: #333;
    font-size: 14px;
    line-height: 1;
    white-space: pre-wrap;
}

/* Chatbot container styling */
.thinking-chatbot .message {
    border-radius: 12px;
    overflow: visible;
    margin-top: 0 !important;
    margin-bottom: 0 !important;
}

.thinking-chatbot .message-wrap {
    margin-top: 0 !important;
    margin-bottom: 0 !important;
}

.thinking-chatbot .message.bot {
    background: transparent !important;
    border: none !important;
    padding: 8px !important;
}

.thinking-chatbot .message.bot .content {
    background: transparent !important;
}
"""

introduction = """
## Features:
1. Chat with a single image
2. Chat with multiple images
3. Chat with video
4. Streaming Mode: real-time response streaming
5. Thinking Mode: show the model's reasoning process

Click the `How to use` tab to see examples.
"""


# Main application
def create_app():
    with gr.Blocks(css=css) as demo:
        with gr.Tab(model_name):
            with gr.Row():
                with gr.Column(scale=1, min_width=300):
                    gr.Markdown(value=introduction)
                    params_form = create_component(form_radio, comp='Radio')
                    thinking_mode = create_component(thinking_checkbox, comp='Checkbox')
                    streaming_mode = create_component(streaming_checkbox, comp='Checkbox')

                    fps_setting = create_component(fps_slider, comp='Slider')
                    regenerate = create_component({'value': 'Regenerate'}, comp='Button')
                    clear_button = create_component({'value': 'Clear History'}, comp='Button')

                    stop_button = gr.Button("Stop", visible=False)

                with gr.Column(scale=3, min_width=500):
                    initial_session_id = uuid.uuid4().hex[:16]
                    print(f"[session] Initializing session, generated session_id: {initial_session_id}")
                    app_session = gr.State({
                        'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0,
                        'chat_type': 'Chat', 'stop_streaming': False, 'is_streaming': False,
                        'session_id': initial_session_id, 'media_cache': [], 'last_thinking_mode': False
                    })
                    with gr.Row():
                        with gr.Column(scale=4):
                            chat_bot = gr.Chatbot(
                                label=f"Chat with {model_name}",
                                value=copy.deepcopy(init_conversation),
                                height=600,
                                elem_classes="thinking-chatbot"
                            )
                        with gr.Column(scale=1, min_width=200):
                            current_images = gr.Gallery(
                                label="Current Media",
                                show_label=True,
                                elem_id="current_media",
                                elem_classes="current-media-gallery",
                                columns=1,
                                rows=1,  # single row so the content scrolls vertically
                                height=600,
                                visible=False,
                                container=True,  # enable container mode
                                allow_preview=True,  # allow previews
                                show_download_button=False,  # hide the download button
                                interactive=False,  # disable interaction so users cannot upload/delete
                                show_share_button=False  # hide the share button
                            )

                    with gr.Tab("Chat") as chat_tab:
                        chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)

                        with gr.Row():
                            with gr.Column(scale=4):
                                txt_input = gr.Textbox(
                                    placeholder="Type your message here...",
                                    label="Message",
                                    lines=2
                                )
                            with gr.Column(scale=1):
                                submit_btn = gr.Button("Submit", variant="primary")

                        with gr.Row():
                            with gr.Column():
                                file_upload = create_multimodal_input()
                                # Preview component for uploaded images
                                file_preview = gr.Gallery(
                                    label="Uploaded Files Preview",
                                    show_label=True,
                                    elem_id="file_preview",
                                    columns=3,
                                    rows=2,
                                    height="auto",
                                    visible=False
                                )

                        # Update the preview when files are uploaded
                        def update_file_preview(files):
                            if files:
                                # Keep only image files for the preview
                                image_files = []
                                for file in files:
                                    if hasattr(file, 'name'):
                                        file_path = file.name
                                    else:
                                        file_path = str(file)

                                    # Check whether this is an image file
                                    if any(file_path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']):
                                        image_files.append(file_path)

                                if image_files:
                                    return gr.update(value=image_files, visible=True)

                            return gr.update(value=[], visible=False)

                        file_upload.change(
                            update_file_preview,
                            inputs=[file_upload],
                            outputs=[file_preview]
                        )

                        # Wrapper that adapts the new input format for respond()
                        def handle_submit(message, files, chat_bot, current_images_gallery, app_session, params_form, thinking_mode, streaming_mode, fps_setting):
                            print(f"[handle_submit] Received input: message='{message}', files={files}, chat_bot length={len(chat_bot)}")

                            # Return immediately when both message and files are empty
                            if not message and not files:
                                print("[handle_submit] Message and files are both empty, returning")
                                return message, files, chat_bot, current_images_gallery, app_session, gr.update(visible=False)

                            # Emulate the original MultimodalInput format
                            class MockInput:
                                def __init__(self, text, files):
                                    self.text = text
                                    self.files = files if files else []

                            mock_question = MockInput(message, files)
                            print(f"[handle_submit] Created MockInput: text='{mock_question.text}', files={len(mock_question.files)}")

                            # respond() returns a generator; yield its results step by step
                            result_generator = respond(mock_question, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting)

                            # If it is a generator, yield incrementally
                            if hasattr(result_generator, '__iter__') and not isinstance(result_generator, (str, bytes, tuple)):
                                print("[handle_submit] Using generator mode")
                                for result in result_generator:
                                    new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result
                                    print(f"[handle_submit] Yielding result: chat_bot length={len(updated_chat_bot)}")

                                    # Refresh the media display
                                    media_gallery_update = update_media_gallery(updated_app_session)

                                    # Return in the expected output format
                                    yield "", None, updated_chat_bot, media_gallery_update, updated_app_session, stop_btn_update
                            else:
                                print("[handle_submit] Using non-generator mode")
                                # Not a generator: return directly
                                new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result_generator
                                print(f"[handle_submit] Returning result directly: chat_bot length={len(updated_chat_bot)}")

                                # Refresh the image display
                                image_gallery_update = update_image_gallery(updated_app_session)

                                yield "", None, updated_chat_bot, image_gallery_update, updated_app_session, stop_btn_update

                        submit_btn.click(
                            handle_submit,
                            [txt_input, file_upload, chat_bot, current_images, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
                            [txt_input, file_upload, chat_bot, current_images, app_session, stop_button]
                        )

                    with gr.Tab("Few Shot", visible=False) as fewshot_tab:
                        fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
                        with gr.Row():
                            with gr.Column(scale=1):
                                image_input = gr.Image(type="filepath", sources=["upload"])
                            with gr.Column(scale=3):
                                user_message = gr.Textbox(label="User")
                                assistant_message = gr.Textbox(label="Assistant")
                                with gr.Row():
                                    add_demonstration_button = gr.Button("Add Example")
                                    generate_button = gr.Button(value="Generate", variant="primary")

                        add_demonstration_button.click(
                            fewshot_add_demonstration,
                            [image_input, user_message, assistant_message, chat_bot, app_session],
                            [image_input, user_message, assistant_message, chat_bot, app_session]
                        )
                        generate_button.click(
                            fewshot_respond,
                            [image_input, user_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
                            [image_input, user_message, assistant_message, chat_bot, app_session]
                        )

                    chat_tab.select(
                        select_chat_type,
                        [chat_tab_label, app_session],
                        [app_session]
                    )
                    chat_tab.select(
                        clear,
                        [txt_input, file_upload, chat_bot, app_session],
                        [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
                    )
                    fewshot_tab.select(
                        select_chat_type,
                        [fewshot_tab_label, app_session],
                        [app_session]
                    )
                    fewshot_tab.select(
                        clear,
                        [txt_input, file_upload, chat_bot, app_session],
                        [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
                    )
                    # chat_bot.flushed(flushed, outputs=[txt_input])  # the standard Chatbot may not support flushed

                    params_form.change(
                        update_streaming_mode_state,
                        inputs=[params_form],
                        outputs=[streaming_mode]
                    )

                    regenerate.click(
                        regenerate_button_clicked,
                        [txt_input, image_input, user_message, assistant_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
                        [txt_input, image_input, user_message, assistant_message, chat_bot, app_session]
                    )
                    clear_button.click(
                        clear,
                        [txt_input, file_upload, chat_bot, app_session],
                        [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
                    )

                    stop_button.click(
                        stop_button_clicked,
                        [app_session],
                        [app_session, stop_button]
                    )

    return demo


if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Web Demo for MiniCPM-V 4.5')
    parser.add_argument('--port', type=int, default=7860, help='Port to run the web demo on')
    parser.add_argument('--no-parallel-encoding', action='store_true', help='Disable parallel image encoding')
    parser.add_argument('--parallel-processes', type=int, default=None, help='Number of parallel processes for image encoding')
    args = parser.parse_args()

    # Configure parallel encoding
    if args.no_parallel_encoding:
        ENABLE_PARALLEL_ENCODING = False
        print("[perf] Parallel image encoding disabled")
    else:
        ENABLE_PARALLEL_ENCODING = True
        print("[perf] Parallel image encoding enabled")

    if args.parallel_processes:
        PARALLEL_PROCESSES = args.parallel_processes
        print(f"[perf] Parallel process count set to: {PARALLEL_PROCESSES}")
    else:
        print(f"[perf] Auto-detecting parallel process count, CPU cores: {mp.cpu_count()}")

    # Initialize the model
    initialize_model()

    # Create and launch the app
    demo = create_app()
    demo.launch(
        share=False,
        debug=True,
        show_api=False,
        server_port=args.port,
        server_name="0.0.0.0"
    )
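
# Typical local invocations, given the flags parsed above:
#
#     python app.py --port 7860
#     python app.py --no-parallel-encoding
#     python app.py --parallel-processes 4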

logging_util.py
ADDED
@@ -0,0 +1,34 @@
import logging
import sys
import os

# setup root logger
def setup_root_logger(log_level=logging.INFO, dist_rank=0, local_dir=''):
    """
    log_level: logging level
    dist_rank: process rank for distributed training
    local_dir: local log path, default '' (no file handler)
    """
    logger = logging.getLogger()  # set up the root logger for everything
    for handler in logger.handlers[:]:  # iterate over a copy while removing
        logger.removeHandler(handler)
    # create formatter
    fmt = '[%(asctime)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
    # color_fmt = colored('[%(asctime)s]', 'green') + \
    #     colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s'

    # create a console handler for the master process only
    if dist_rank == 0:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(
            logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
        logger.addHandler(console_handler)

    # create file handlers
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)
        file_handler = logging.FileHandler(os.path.join(local_dir, f'log_rank{dist_rank}.log'), mode='a')
        file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
        logger.addHandler(file_handler)

    logger.setLevel(log_level)
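
# Example usage (paths illustrative): configure the root logger once at
# startup, then use module-level loggers everywhere else.
#
#     import logging
#     from logging_util import setup_root_logger
#
#     setup_root_logger(logging.INFO, dist_rank=0, local_dir='./logs')
#     logging.getLogger(__name__).info("ready")  # stdout + ./logs/log_rank0.log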

models/__init__.py
ADDED
@@ -0,0 +1 @@
from .minicpmv4_5 import ModelMiniCPMV4_5
models/minicpmv4_5.py
ADDED
@@ -0,0 +1,158 @@
import spaces  # Hugging Face Spaces SDK (kept even though not used directly here)
from io import BytesIO
import torch
from PIL import Image
import base64
import json
import re
import logging
from transformers import AutoModel, AutoTokenizer, AutoProcessor, set_seed
# set_seed(42)

logger = logging.getLogger(__name__)

class ModelMiniCPMV4_5:
    def __init__(self, path) -> None:
        self.model = AutoModel.from_pretrained(
            path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16, device_map="auto")
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(
            path, trust_remote_code=True)
        self.processor = AutoProcessor.from_pretrained(
            path, trust_remote_code=True)

    def __call__(self, input_data):
        image = None
        if "image" in input_data and len(input_data["image"]) > 10:
            image = Image.open(BytesIO(base64.b64decode(
                input_data["image"]))).convert('RGB')

        msgs = input_data["question"]
        params = input_data.get("params", "{}")
        params = json.loads(params)
        msgs = json.loads(msgs)

        temporal_ids = input_data.get("temporal_ids", None)
        if temporal_ids:
            temporal_ids = json.loads(temporal_ids)

        if params.get("max_new_tokens", 0) > 16384:
            logger.info("capping max_new_tokens at 16384 to save memory")
            params["max_new_tokens"] = 16384
        if params.get("max_inp_length", 0) > 2048 * 10:
            logger.info(f"capping max_inp_length at {2048 * 10}, a high limit kept for video processing")
            params["max_inp_length"] = 2048 * 10

        for msg in msgs:
            if 'content' in msg:
                contents = msg['content']
            else:
                contents = msg.pop('contents')

            new_cnts = []
            for c in contents:
                if isinstance(c, dict):
                    if c['type'] == 'text':
                        c = c['pairs']
                    elif c['type'] == 'image':
                        c = Image.open(
                            BytesIO(base64.b64decode(c["pairs"]))).convert('RGB')
                    else:
                        raise ValueError(
                            "contents type only supports text and image.")
                new_cnts.append(c)
            msg['content'] = new_cnts
        logger.info(f'msgs: {str(msgs)}')

        enable_thinking = params.pop('enable_thinking', True)
        is_streaming = params.pop('stream', False)

        if is_streaming:
            return self._stream_chat(image, msgs, enable_thinking, params, temporal_ids)
        else:
            chat_kwargs = {
                "image": image,
                "msgs": msgs,
                "tokenizer": self.tokenizer,
                "processor": self.processor,
                "enable_thinking": enable_thinking,
                **params
            }

            if temporal_ids is not None:
                chat_kwargs["temporal_ids"] = temporal_ids

            answer = self.model.chat(**chat_kwargs)

            # Strip grounding markup (<ref>/<box> tags) from the answer
            res = re.sub(r'(<box>.*</box>)', '', answer)
            res = res.replace('<ref>', '')
            res = res.replace('</ref>', '')
            res = res.replace('<box>', '')
            answer = res.replace('</box>', '')
            if not enable_thinking:
                print(f"enable_thinking: {enable_thinking}")
                answer = answer.replace('</think>', '')

            oids = self.tokenizer.encode(answer)
            output_tokens = len(oids)
            return answer, output_tokens

    def _stream_chat(self, image, msgs, enable_thinking, params, temporal_ids=None):
        try:
            params['stream'] = True
            chat_kwargs = {
                "image": image,
                "msgs": msgs,
                "tokenizer": self.tokenizer,
                "processor": self.processor,
                "enable_thinking": enable_thinking,
                **params
            }
            if temporal_ids is not None:
                chat_kwargs["temporal_ids"] = temporal_ids

            answer_generator = self.model.chat(**chat_kwargs)

            if not hasattr(answer_generator, '__iter__'):
                # Some configurations return a full string instead of a generator
                answer = answer_generator
                res = re.sub(r'(<box>.*</box>)', '', answer)
                res = res.replace('<ref>', '')
                res = res.replace('</ref>', '')
                res = res.replace('<box>', '')
                answer = res.replace('</box>', '')
                if not enable_thinking:
                    answer = answer.replace('</think>', '')

                char_count = 0
                for char in answer:
                    yield char
                    char_count += 1
            else:
                full_answer = ""
                chunk_count = 0
                char_count = 0

                for chunk in answer_generator:
                    if isinstance(chunk, str):
                        clean_chunk = re.sub(r'(<box>.*</box>)', '', chunk)
                        clean_chunk = clean_chunk.replace('<ref>', '')
                        clean_chunk = clean_chunk.replace('</ref>', '')
                        clean_chunk = clean_chunk.replace('<box>', '')
                        clean_chunk = clean_chunk.replace('</box>', '')

                        if not enable_thinking:
                            clean_chunk = clean_chunk.replace('</think>', '')

                        full_answer += chunk
                        char_count += len(clean_chunk)
                        chunk_count += 1
                        yield clean_chunk
                    else:
                        full_answer += str(chunk)
                        char_count += len(str(chunk))
                        chunk_count += 1
                        yield str(chunk)

        except Exception as e:
            logger.error(f"Stream chat error: {e}")
            yield f"Error: {str(e)}"

requirements.txt
ADDED
@@ -0,0 +1,37 @@
huggingface_hub
# Core dependencies
spaces
gradio==4.44.1
torch==2.7.1
torchvision==0.22.1
numpy==2.2.6
pillow
scipy
pandas==2.3.1

# ML/AI dependencies
transformers==4.55.0
accelerate==1.9.0
einops==0.8.1
timm==1.0.19
safetensors==0.5.3
tokenizers==0.21.4
huggingface-hub==0.34.3

# Video processing
decord==0.6.0
ffmpy==0.6.1
pydub==0.25.1

# Utilities
requests==2.32.4
tqdm==4.67.1
PyYAML==6.0.2
psutil==7.0.0
pydantic==2.10.6

# Visualization
matplotlib==3.10.5

# Optional: For CUDA support (if needed)
# triton==3.3.1