Update app.py
app.py CHANGED
@@ -1,4 +1,4 @@
-
+import gradio as gr
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
@@ -189,66 +189,20 @@ def model_inference(input_dict, history):
     audio_path = tts(buffer)
     return audio_path  # Return the audio file path

-#
+# Option 1: Use regular Interface with streaming (recommended)
 with gr.Blocks() as demo:
     gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")

-
-
-
-
-
-
-
-
-    clear = gr.Button("Clear")
-
-    with gr.Column(scale=1):
-        uploaded_files = gr.Gallery(
-            label="Uploaded Files",
-            show_label=True,
-            elem_id="gallery",
-            columns=2,
-            rows=2,
-            object_fit="contain",
-            height="auto"
-        )
-        audio_output = gr.Audio(label="Generated Speech")
-
-    def update_gallery(message):
-        """Update gallery with uploaded files"""
-        if message and "files" in message and message["files"]:
-            # Filter for image files only (videos won't display properly in gallery)
-            image_files = []
-            for file_path in message["files"]:
-                try:
-                    # Check if it's an image by trying to open it
-                    with Image.open(file_path) as img:
-                        image_files.append(file_path)
-                except:
-                    # If it fails, it's probably a video or other file type
-                    # Generate video thumbnail for videos
-                    try:
-                        vidcap = cv2.VideoCapture(file_path)
-                        success, frame = vidcap.read()
-                        if success:
-                            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                            thumbnail = Image.fromarray(frame)
-                            # Save thumbnail temporarily
-                            import tempfile
-                            temp_thumb = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
-                            thumbnail.save(temp_thumb.name)
-                            image_files.append(temp_thumb.name)
-                        vidcap.release()
-                    except:
-                        pass
-            return image_files
-        return []
+    chatbot = gr.Chatbot()
+    msg = gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple"
+    )
+    audio_output = gr.Audio(label="Generated Speech")
+    clear = gr.Button("Clear")

     def respond(message, chat_history):
-        # Update gallery first
-        gallery_files = update_gallery(message)
-
         # Add user message to chat history
         bot_message = ""
         chat_history.append([message["text"], ""])
@@ -257,25 +211,36 @@ with gr.Blocks() as demo:
         for response in model_inference(message, chat_history):
             bot_message = response
             chat_history[-1][1] = bot_message
-            yield "", chat_history, None
+            yield "", chat_history, None

         # Generate audio after streaming is complete
         try:
             if bot_message.strip():  # Only generate TTS if there's actual text
                 audio_path = tts(bot_message)
                 if audio_path:
-                    yield "", chat_history, audio_path
+                    yield "", chat_history, audio_path
                 else:
                     print("TTS returned None or empty result")
-                    yield "", chat_history, None
+                    yield "", chat_history, None
             else:
-                yield "", chat_history, None
+                yield "", chat_history, None
         except Exception as e:
             print(f"TTS Error: {e}")
-            yield "", chat_history, None
+            yield "", chat_history, None

-    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
-    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
+    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+
+    # Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
+    # demo = gr.ChatInterface(
+    #     fn=model_inference,
+    #     description="# oolel-vision-experimental `@video-infer for video understanding`**",
+    #     fill_height=True,
+    #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+    #     stop_btn="Stop Generation",
+    #     multimodal=True,
+    #     cache_examples=False,
+    # )

 if __name__ == "__main__":
     demo.launch(debug=True)
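The pattern this change settles on — a generator event handler that streams partial text into gr.Chatbot and only attaches audio on its final yield — can be exercised in isolation. The sketch below is a minimal, self-contained version of that wiring; fake_model and fake_tts are hypothetical stubs standing in for the app's model_inference and tts functions, which this diff only shows in part.

import time
import gradio as gr

def fake_model(text):
    # Hypothetical stand-in for model_inference: yields a growing partial reply.
    buffer = ""
    for word in f"Echo: {text}".split():
        buffer += word + " "
        time.sleep(0.05)
        yield buffer

def fake_tts(text):
    # Hypothetical stand-in for tts; the real function returns an audio file path.
    return None

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.MultimodalTextbox(label="Query Input",
                               file_types=["image", "video"],
                               file_count="multiple")
    audio_output = gr.Audio(label="Generated Speech")
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        chat_history.append([message["text"], ""])
        for partial in fake_model(message["text"]):
            chat_history[-1][1] = partial
            # Each yield maps onto the three outputs (msg, chatbot, audio_output):
            # clear the textbox, refresh the chat, keep audio empty while streaming.
            yield "", chat_history, None
        # Final yield attaches the synthesized audio (a file path, or None on failure).
        yield "", chat_history, fake_tts(chat_history[-1][1])

    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])

if __name__ == "__main__":
    demo.launch()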
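One detail that makes respond read correctly: gr.MultimodalTextbox passes its value to the handler as a dict, which is why the code indexes message["text"] (and why the removed gallery helper iterated over message["files"]). Roughly, with a hypothetical upload path:

# Approximate shape of the value gr.MultimodalTextbox submits:
message = {
    "text": "Describe this clip",               # the typed query
    "files": ["/tmp/gradio/upload/video.mp4"],  # hypothetical temp-file paths for uploads
}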