Update app.py
app.py CHANGED
@@ -1,4 +1,4 @@
-
+import gradio as gr
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
@@ -189,66 +189,20 @@ def model_inference(input_dict, history):
     audio_path = tts(buffer)
     return audio_path  # Return the audio file path

-#
+# Option 1: Use regular Interface with streaming (recommended)
 with gr.Blocks() as demo:
     gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")

-
-
-
-
-
-
-
-
-    clear = gr.Button("Clear")
-
-    with gr.Column(scale=1):
-        uploaded_files = gr.Gallery(
-            label="Uploaded Files",
-            show_label=True,
-            elem_id="gallery",
-            columns=2,
-            rows=2,
-            object_fit="contain",
-            height="auto"
-        )
-        audio_output = gr.Audio(label="Generated Speech")
-
-    def update_gallery(message):
-        """Update gallery with uploaded files"""
-        if message and "files" in message and message["files"]:
-            # Filter for image files only (videos won't display properly in gallery)
-            image_files = []
-            for file_path in message["files"]:
-                try:
-                    # Check if it's an image by trying to open it
-                    with Image.open(file_path) as img:
-                        image_files.append(file_path)
-                except:
-                    # If it fails, it's probably a video or other file type
-                    # Generate video thumbnail for videos
-                    try:
-                        vidcap = cv2.VideoCapture(file_path)
-                        success, frame = vidcap.read()
-                        if success:
-                            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                            thumbnail = Image.fromarray(frame)
-                            # Save thumbnail temporarily
-                            import tempfile
-                            temp_thumb = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
-                            thumbnail.save(temp_thumb.name)
-                            image_files.append(temp_thumb.name)
-                        vidcap.release()
-                    except:
-                        pass
-            return image_files
-        return []
+    chatbot = gr.Chatbot()
+    msg = gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple"
+    )
+    audio_output = gr.Audio(label="Generated Speech")
+    clear = gr.Button("Clear")

     def respond(message, chat_history):
-        # Update gallery first
-        gallery_files = update_gallery(message)
-
         # Add user message to chat history
         bot_message = ""
         chat_history.append([message["text"], ""])
@@ -257,25 +211,36 @@ with gr.Blocks() as demo:
         for response in model_inference(message, chat_history):
             bot_message = response
             chat_history[-1][1] = bot_message
-            yield "", chat_history, None
+            yield "", chat_history, None

         # Generate audio after streaming is complete
         try:
             if bot_message.strip():  # Only generate TTS if there's actual text
                 audio_path = tts(bot_message)
                 if audio_path:
-                    yield "", chat_history, audio_path
+                    yield "", chat_history, audio_path
                 else:
                     print("TTS returned None or empty result")
-                    yield "", chat_history, None
+                    yield "", chat_history, None
             else:
-                yield "", chat_history, None
+                yield "", chat_history, None
         except Exception as e:
             print(f"TTS Error: {e}")
-            yield "", chat_history, None
+            yield "", chat_history, None

-    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
-    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
+    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+
+    # Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
+    # demo = gr.ChatInterface(
+    #     fn=model_inference,
+    #     description="# oolel-vision-experimental `@video-infer for video understanding`**",
+    #     fill_height=True,
+    #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+    #     stop_btn="Stop Generation",
+    #     multimodal=True,
+    #     cache_examples=False,
+    # )

 if __name__ == "__main__":
     demo.launch(debug=True)
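The pattern this change settles on — a generator event handler that streams partial text into gr.Chatbot and only attaches audio on its final yield — can be exercised in isolation. The sketch below is a minimal, self-contained version of that wiring; fake_model and fake_tts are hypothetical stubs standing in for the app's model_inference and tts functions, which this diff only shows in part.

import time
import gradio as gr

def fake_model(text):
    # Hypothetical stand-in for model_inference: yields a growing partial reply.
    buffer = ""
    for word in f"Echo: {text}".split():
        buffer += word + " "
        time.sleep(0.05)
        yield buffer

def fake_tts(text):
    # Hypothetical stand-in for tts; the real function returns an audio file path.
    return None

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.MultimodalTextbox(label="Query Input",
                               file_types=["image", "video"],
                               file_count="multiple")
    audio_output = gr.Audio(label="Generated Speech")
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        chat_history.append([message["text"], ""])
        for partial in fake_model(message["text"]):
            chat_history[-1][1] = partial
            # Each yield maps onto the three outputs (msg, chatbot, audio_output):
            # clear the textbox, refresh the chat, keep audio empty while streaming.
            yield "", chat_history, None
        # Final yield attaches the synthesized audio (a file path, or None on failure).
        yield "", chat_history, fake_tts(chat_history[-1][1])

    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])

if __name__ == "__main__":
    demo.launch()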
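One detail that makes respond read correctly: gr.MultimodalTextbox passes its value to the handler as a dict, which is why the code indexes message["text"] (and why the removed gallery helper iterated over message["files"]). Roughly, with a hypothetical upload path:

# Approximate shape of the value gr.MultimodalTextbox submits:
message = {
    "text": "Describe this clip",               # the typed query
    "files": ["/tmp/gradio/upload/video.mp4"],  # hypothetical temp-file paths for uploads
}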