Update app.py
app.py CHANGED

@@ -189,20 +189,66 @@ def model_inference(input_dict, history):
     audio_path = tts(buffer)
     return audio_path  # Return the audio file path

-#
+# Main interface with image preview
 with gr.Blocks() as demo:
     gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")

-
-
-
-
-
-
-
-
+    with gr.Row():
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(type="messages")
+            msg = gr.MultimodalTextbox(
+                label="Query Input",
+                file_types=["image", "video"],
+                file_count="multiple"
+            )
+            clear = gr.Button("Clear")
+
+        with gr.Column(scale=1):
+            uploaded_files = gr.Gallery(
+                label="Uploaded Files",
+                show_label=True,
+                elem_id="gallery",
+                columns=2,
+                rows=2,
+                object_fit="contain",
+                height="auto"
+            )
+            audio_output = gr.Audio(label="Generated Speech")
+
+    def update_gallery(message):
+        """Update gallery with uploaded files"""
+        if message and "files" in message and message["files"]:
+            # Filter for image files only (videos won't display properly in gallery)
+            image_files = []
+            for file_path in message["files"]:
+                try:
+                    # Check if it's an image by trying to open it
+                    with Image.open(file_path) as img:
+                        image_files.append(file_path)
+                except:
+                    # If it fails, it's probably a video or other file type
+                    # Generate video thumbnail for videos
+                    try:
+                        vidcap = cv2.VideoCapture(file_path)
+                        success, frame = vidcap.read()
+                        if success:
+                            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                            thumbnail = Image.fromarray(frame)
+                            # Save thumbnail temporarily
+                            import tempfile
+                            temp_thumb = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
+                            thumbnail.save(temp_thumb.name)
+                            image_files.append(temp_thumb.name)
+                        vidcap.release()
+                    except:
+                        pass
+            return image_files
+        return []

     def respond(message, chat_history):
+        # Update gallery first
+        gallery_files = update_gallery(message)
+
         # Add user message to chat history
         bot_message = ""
         chat_history.append([message["text"], ""])
@@ -211,36 +257,25 @@ with gr.Blocks() as demo:
         for response in model_inference(message, chat_history):
             bot_message = response
             chat_history[-1][1] = bot_message
-            yield "", chat_history, None
+            yield "", chat_history, None, gallery_files

         # Generate audio after streaming is complete
         try:
             if bot_message.strip():  # Only generate TTS if there's actual text
                 audio_path = tts(bot_message)
                 if audio_path:
-                    yield "", chat_history, audio_path
+                    yield "", chat_history, audio_path, gallery_files
                 else:
                     print("TTS returned None or empty result")
-                    yield "", chat_history, None
+                    yield "", chat_history, None, gallery_files
             else:
-                yield "", chat_history, None
+                yield "", chat_history, None, gallery_files
         except Exception as e:
             print(f"TTS Error: {e}")
-            yield "", chat_history, None
+            yield "", chat_history, None, gallery_files

-    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
-    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
-
-    # Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
-    # demo = gr.ChatInterface(
-    #     fn=model_inference,
-    #     description="# oolel-vision-experimental `@video-infer for video understanding`**",
-    #     fill_height=True,
-    #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
-    #     stop_btn="Stop Generation",
-    #     multimodal=True,
-    #     cache_examples=False,
-    # )
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output, uploaded_files])
+    clear.click(lambda: ([], None, []), outputs=[chatbot, audio_output, uploaded_files])

 if __name__ == "__main__":
     demo.launch(debug=True)
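For reference, the new update_gallery helper assumes that PIL's Image, cv2, and gradio are already imported near the top of app.py; those import lines fall outside this hunk. The block below is a hedged sketch of the presumed imports together with a small standalone helper (preview_path, a hypothetical name not in the commit) that exercises the same image-vs-video branching outside the Gradio app:

# Presumed imports near the top of app.py (not visible in this hunk)
import tempfile

import cv2
import gradio as gr
from PIL import Image


def preview_path(file_path):
    """Hypothetical standalone check of the branching used by update_gallery():
    images are returned as-is, videos are reduced to a temporary .jpg of their
    first frame, anything unreadable yields None."""
    try:
        with Image.open(file_path) as img:
            img.verify()  # raises if the file is not a readable image
        return file_path
    except Exception:
        vidcap = cv2.VideoCapture(file_path)
        success, frame = vidcap.read()
        vidcap.release()
        if not success:
            return None
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        thumb = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
        Image.fromarray(frame).save(thumb.name)
        return thumb.name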
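One Gradio API detail the diff leaves implicit: with gr.Chatbot(type="messages"), the chat value is a list of role/content dicts rather than the [user, bot] pairs that respond() appends. A minimal messages-style version of the history update, keeping the same four outputs, would look roughly like this (an illustrative sketch, not part of the commit):

        # messages-style history instead of [user, bot] pairs (sketch only)
        chat_history.append({"role": "user", "content": message["text"]})
        chat_history.append({"role": "assistant", "content": ""})
        for response in model_inference(message, chat_history):
            chat_history[-1]["content"] = response
            yield "", chat_history, None, gallery_files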