yaya-sy committed
Commit f6e8ca8 · verified · 1 Parent(s): b34bbf6

Update app.py

Files changed (1)
  1. app.py +28 -63
app.py CHANGED
@@ -1,4 +1,4 @@
-import gradio as gr
+mport gradio as gr
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
@@ -189,66 +189,20 @@ def model_inference(input_dict, history):
     audio_path = tts(buffer)
     return audio_path # Return the audio file path
 
-# Main interface with image preview
+# Option 1: Use regular Interface with streaming (recommended)
 with gr.Blocks() as demo:
     gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")
 
-    with gr.Row():
-        with gr.Column(scale=2):
-            chatbot = gr.Chatbot(type="messages")
-            msg = gr.MultimodalTextbox(
-                label="Query Input",
-                file_types=["image", "video"],
-                file_count="multiple"
-            )
-            clear = gr.Button("Clear")
-
-        with gr.Column(scale=1):
-            uploaded_files = gr.Gallery(
-                label="Uploaded Files",
-                show_label=True,
-                elem_id="gallery",
-                columns=2,
-                rows=2,
-                object_fit="contain",
-                height="auto"
-            )
-            audio_output = gr.Audio(label="Generated Speech")
-
-    def update_gallery(message):
-        """Update gallery with uploaded files"""
-        if message and "files" in message and message["files"]:
-            # Filter for image files only (videos won't display properly in gallery)
-            image_files = []
-            for file_path in message["files"]:
-                try:
-                    # Check if it's an image by trying to open it
-                    with Image.open(file_path) as img:
-                        image_files.append(file_path)
-                except:
-                    # If it fails, it's probably a video or other file type
-                    # Generate video thumbnail for videos
-                    try:
-                        vidcap = cv2.VideoCapture(file_path)
-                        success, frame = vidcap.read()
-                        if success:
-                            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                            thumbnail = Image.fromarray(frame)
-                            # Save thumbnail temporarily
-                            import tempfile
-                            temp_thumb = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
-                            thumbnail.save(temp_thumb.name)
-                            image_files.append(temp_thumb.name)
-                        vidcap.release()
-                    except:
-                        pass
-            return image_files
-        return []
+    chatbot = gr.Chatbot()
+    msg = gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple"
+    )
+    audio_output = gr.Audio(label="Generated Speech")
+    clear = gr.Button("Clear")
 
     def respond(message, chat_history):
-        # Update gallery first
-        gallery_files = update_gallery(message)
-
         # Add user message to chat history
         bot_message = ""
         chat_history.append([message["text"], ""])
@@ -257,25 +211,36 @@ with gr.Blocks() as demo:
         for response in model_inference(message, chat_history):
             bot_message = response
             chat_history[-1][1] = bot_message
-            yield "", chat_history, None, gallery_files
+            yield "", chat_history, None
 
         # Generate audio after streaming is complete
         try:
             if bot_message.strip(): # Only generate TTS if there's actual text
                 audio_path = tts(bot_message)
                 if audio_path:
-                    yield "", chat_history, audio_path, gallery_files
+                    yield "", chat_history, audio_path
                 else:
                     print("TTS returned None or empty result")
-                    yield "", chat_history, None, gallery_files
+                    yield "", chat_history, None
             else:
-                yield "", chat_history, None, gallery_files
+                yield "", chat_history, None
         except Exception as e:
             print(f"TTS Error: {e}")
-            yield "", chat_history, None, gallery_files
+            yield "", chat_history, None
 
-    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output, uploaded_files])
-    clear.click(lambda: ([], None, []), outputs=[chatbot, audio_output, uploaded_files])
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
+    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+
+    # Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
+    # demo = gr.ChatInterface(
+    #     fn=model_inference,
+    #     description="# oolel-vision-experimental `@video-infer for video understanding`**",
+    #     fill_height=True,
+    #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+    #     stop_btn="Stop Generation",
+    #     multimodal=True,
+    #     cache_examples=False,
+    # )
 
 if __name__ == "__main__":
     demo.launch(debug=True)