yaya-sy committed · verified
Commit b34bbf6 · 1 Parent(s): 01b6957

Update app.py

Files changed (1): app.py (+62 -27)
app.py CHANGED
@@ -189,20 +189,66 @@ def model_inference(input_dict, history):
     audio_path = tts(buffer)
     return audio_path  # Return the audio file path
 
-# Option 1: Use regular Interface with streaming (recommended)
+# Main interface with image preview
 with gr.Blocks() as demo:
     gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")
 
-    chatbot = gr.Chatbot()
-    msg = gr.MultimodalTextbox(
-        label="Query Input",
-        file_types=["image", "video"],
-        file_count="multiple"
-    )
-    audio_output = gr.Audio(label="Generated Speech")
-    clear = gr.Button("Clear")
+    with gr.Row():
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(type="messages")
+            msg = gr.MultimodalTextbox(
+                label="Query Input",
+                file_types=["image", "video"],
+                file_count="multiple"
+            )
+            clear = gr.Button("Clear")
+
+        with gr.Column(scale=1):
+            uploaded_files = gr.Gallery(
+                label="Uploaded Files",
+                show_label=True,
+                elem_id="gallery",
+                columns=2,
+                rows=2,
+                object_fit="contain",
+                height="auto"
+            )
+            audio_output = gr.Audio(label="Generated Speech")
+
+    def update_gallery(message):
+        """Update gallery with uploaded files"""
+        if message and "files" in message and message["files"]:
+            # Filter for image files only (videos won't display properly in gallery)
+            image_files = []
+            for file_path in message["files"]:
+                try:
+                    # Check if it's an image by trying to open it
+                    with Image.open(file_path) as img:
+                        image_files.append(file_path)
+                except:
+                    # If it fails, it's probably a video or other file type
+                    # Generate video thumbnail for videos
+                    try:
+                        vidcap = cv2.VideoCapture(file_path)
+                        success, frame = vidcap.read()
+                        if success:
+                            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                            thumbnail = Image.fromarray(frame)
+                            # Save thumbnail temporarily
+                            import tempfile
+                            temp_thumb = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
+                            thumbnail.save(temp_thumb.name)
+                            image_files.append(temp_thumb.name)
+                        vidcap.release()
+                    except:
+                        pass
+            return image_files
+        return []
 
     def respond(message, chat_history):
+        # Update gallery first
+        gallery_files = update_gallery(message)
+
         # Add user message to chat history
         bot_message = ""
         chat_history.append([message["text"], ""])
@@ -211,36 +257,25 @@ with gr.Blocks() as demo:
         for response in model_inference(message, chat_history):
             bot_message = response
             chat_history[-1][1] = bot_message
-            yield "", chat_history, None
+            yield "", chat_history, None, gallery_files
 
         # Generate audio after streaming is complete
        try:
             if bot_message.strip():  # Only generate TTS if there's actual text
                 audio_path = tts(bot_message)
                 if audio_path:
-                    yield "", chat_history, audio_path
+                    yield "", chat_history, audio_path, gallery_files
                 else:
                     print("TTS returned None or empty result")
-                    yield "", chat_history, None
+                    yield "", chat_history, None, gallery_files
             else:
-                yield "", chat_history, None
+                yield "", chat_history, None, gallery_files
         except Exception as e:
             print(f"TTS Error: {e}")
-            yield "", chat_history, None
+            yield "", chat_history, None, gallery_files
 
-    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
-    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
-
-    # Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
-    # demo = gr.ChatInterface(
-    #     fn=model_inference,
-    #     description="# oolel-vision-experimental `@video-infer for video understanding`**",
-    #     fill_height=True,
-    #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
-    #     stop_btn="Stop Generation",
-    #     multimodal=True,
-    #     cache_examples=False,
-    # )
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output, uploaded_files])
+    clear.click(lambda: ([], None, []), outputs=[chatbot, audio_output, uploaded_files])
 
 if __name__ == "__main__":
     demo.launch(debug=True)
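
Note: the commit constructs the chatbot as `gr.Chatbot(type="messages")`, but `respond` still appends `[message["text"], ""]` pairs and writes `chat_history[-1][1]`, which is the older tuple-style history. With `type="messages"`, Gradio expects the history to be a list of `{"role": ..., "content": ...}` dicts. A minimal sketch of the same streaming loop in that format (`respond_messages` is a hypothetical name; `update_gallery` and `model_inference` as in the diff):

def respond_messages(message, chat_history):
    # Sketch only, not part of this commit: messages-format history entries
    # are dicts with "role" and "content" keys.
    gallery_files = update_gallery(message)
    chat_history.append({"role": "user", "content": message["text"]})
    chat_history.append({"role": "assistant", "content": ""})
    for response in model_inference(message, chat_history):
        chat_history[-1]["content"] = response  # update the assistant turn in place
        yield "", chat_history, None, gallery_files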