Update app.py
app.py
CHANGED
@@ -8,6 +8,7 @@ import spaces
 import cv2
 import numpy as np
 from PIL import Image
+import tempfile

 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer
@@ -22,7 +23,7 @@ tts_tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")
 def tts(text):
     output_wav_path = tempfile.mktemp(suffix=".wav")
     # Description du style vocal
-    description = "A clear and educational voice, with a flow adapted
+    description = "A clear and educational voice, with a flow adapted to learning"

     # Génération
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
@@ -32,7 +33,7 @@ def tts(text):
         input_ids=input_ids,
         prompt_input_ids=prompt_ids,
     )
-    sf.write(output_wav_path, audio.cpu().numpy().squeeze(),
+    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)  # Fixed: was 'model.config'

     return output_wav_path

@@ -99,14 +100,12 @@ def model_inference(input_dict, history):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
         if not files:
-            gr.Error("Please upload a video file along with your @video-infer query.")
-            return
+            raise gr.Error("Please upload a video file along with your @video-infer query.")  # Fixed: gr.Error syntax
         # Assume the first file is a video.
         video_path = files[0]
         frames = downsample_video(video_path)
         if not frames:
-            gr.Error("Could not process video.")
-            return
+            raise gr.Error("Could not process video.")  # Fixed: gr.Error syntax
         # Build messages: start with the text prompt.
         messages = [
             {
@@ -138,6 +137,7 @@ def model_inference(input_dict, history):
             buffer += new_text
             time.sleep(0.001)
             yield buffer
+        return  # Fixed: Added return to prevent falling through

     if len(files) > 1:
         images = [load_image(image) for image in files]
@@ -147,11 +147,9 @@ def model_inference(input_dict, history):
         images = []

     if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-        return
+        raise gr.Error("Please input a query and optionally image(s).")  # Fixed: gr.Error syntax
     if text == "" and images:
-        gr.Error("Please input a text query along with the image(s).")
-        return
+        raise gr.Error("Please input a text query along with the image(s).")  # Fixed: gr.Error syntax

     messages = [
         {
@@ -179,17 +177,65 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer

-
+    # This will only be reached after streaming is complete
+    # Generate TTS for the final buffer content
+    audio_path = tts(buffer)
+    return audio_path  # Return the audio file path

-
-
-
-
-
-
-
-
-
-
+# Alternative approach: Use regular Interface instead of ChatInterface
+def combined_inference(input_dict, history):
+    """Modified function that returns both text and audio"""
+    text_response = ""
+
+    # Get the streaming response
+    for response in model_inference(input_dict, history):
+        text_response = response
+
+    # Generate audio from final text
+    audio_path = tts(text_response)
+
+    return text_response, audio_path

-
+# Option 1: Use regular Interface (recommended)
+with gr.Blocks() as demo:
+    gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")
+
+    chatbot = gr.Chatbot()
+    msg = gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple"
+    )
+    audio_output = gr.Audio(label="Generated Speech")
+    clear = gr.Button("Clear")
+
+    def respond(message, chat_history):
+        # Get text response through streaming
+        text_response = ""
+        for response in model_inference(message, chat_history):
+            text_response = response
+
+        # Add to chat history
+        chat_history.append([message["text"], text_response])
+
+        # Generate audio
+        audio_path = tts(text_response)
+
+        return "", chat_history, audio_path
+
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
+    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+
+# Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
+# demo = gr.ChatInterface(
+#     fn=model_inference,
+#     description="# oolel-vision-experimental `@video-infer for video understanding`**",
+#     fill_height=True,
+#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+#     stop_btn="Stop Generation",
+#     multimodal=True,
+#     cache_examples=False,
+# )
+
+if __name__ == "__main__":
+    demo.launch(debug=True)
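The recurring "Fixed: gr.Error syntax" change above reflects how Gradio error handling works: calling gr.Error(...) on its own only constructs an exception object and discards it, so the user never sees the message and the handler keeps running. Raising it is what surfaces the error in the UI and stops the event. A minimal standalone sketch of the corrected pattern (not part of this Space's code; the function and labels below are illustrative only):

import gradio as gr

def check_upload(files):
    if not files:
        # gr.Error must be raised; Gradio then shows the message in the UI
        # and halts the event instead of continuing with missing input.
        raise gr.Error("Please upload at least one file.")
    return f"Received {len(files)} file(s)."

demo = gr.Interface(
    fn=check_upload,
    inputs=gr.File(file_count="multiple"),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()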