yaya-sy committed
Commit 9e290b2 · verified · 1 Parent(s): 3579b31

Update app.py

Files changed (1):
  1. app.py +68 -22
app.py CHANGED
@@ -8,6 +8,7 @@ import spaces
 import cv2
 import numpy as np
 from PIL import Image
+import tempfile
 
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer
@@ -22,7 +23,7 @@ tts_tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")
 def tts(text):
     output_wav_path = tempfile.mktemp(suffix=".wav")
     # Description du style vocal
-    description = "A clear and educational voice, with a flow adapted to learning"
+    description = "A clear and educational voice, with a flow adapted to learning"
 
     # Génération
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
@@ -32,7 +33,7 @@ def tts(text):
         input_ids=input_ids,
         prompt_input_ids=prompt_ids,
     )
-    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), model.config.sampling_rate)
+    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)  # Fixed: was 'model.config'
 
     return output_wav_path
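The tts() hunks above show only fragments of the function; names such as device, prompt_ids, sf and tts_model are defined elsewhere in app.py and do not appear in this diff. A minimal sketch of the setup those fragments appear to assume (the device handling, the generate call and the prompt_ids line are assumptions; the CONCREE/Adia_TTS checkpoint, the sf.write call and the argument names come from the diff):

import tempfile
import torch
import soundfile as sf  # assumed source of the `sf` name used in sf.write(...)
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"  # assumption: GPU when available
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("CONCREE/Adia_TTS").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")

def tts(text):
    output_wav_path = tempfile.mktemp(suffix=".wav")
    description = "A clear and educational voice, with a flow adapted to learning"
    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
    # Assumption: prompt_ids is the tokenized text to be spoken.
    prompt_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Assumed call wrapping the keyword arguments shown in the hunk above.
    audio = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_ids)
    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)
    return output_wav_path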
 
@@ -99,14 +100,12 @@ def model_inference(input_dict, history):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
         if not files:
-            gr.Error("Please upload a video file along with your @video-infer query.")
-            return
+            raise gr.Error("Please upload a video file along with your @video-infer query.")  # Fixed: gr.Error syntax
         # Assume the first file is a video.
         video_path = files[0]
         frames = downsample_video(video_path)
         if not frames:
-            gr.Error("Could not process video.")
-            return
+            raise gr.Error("Could not process video.")  # Fixed: gr.Error syntax
         # Build messages: start with the text prompt.
         messages = [
             {
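On the raise gr.Error change: gr.Error is an exception class in Gradio, so constructing it without raising it does nothing visible and execution continues; raising it aborts the handler and surfaces the message as an error in the UI, which also makes the separate return unnecessary. A minimal illustration (the handler name and message here are made up):

import gradio as gr

def handler(files):
    if not files:
        # gr.Error("No file uploaded")        # old pattern: exception created but never raised, silently ignored
        raise gr.Error("No file uploaded")    # new pattern: stops the handler and shows the message in the UI
    return files[0]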
@@ -138,6 +137,7 @@ def model_inference(input_dict, history):
             buffer += new_text
             time.sleep(0.001)
             yield buffer
+        return  # Fixed: Added return to prevent falling through
 
     if len(files) > 1:
         images = [load_image(image) for image in files]
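The added return matters because model_inference is a generator (it yields buffer): a bare return ends the generator, so once the @video-infer branch has finished streaming, control no longer falls through into the image/text branch further down. A toy generator showing the same pattern:

def answer(is_video):
    if is_video:
        yield "video branch"
        return          # without this, execution would continue into the branch below
    yield "image/text branch"

print(list(answer(True)))   # ['video branch']
print(list(answer(False)))  # ['image/text branch']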
@@ -147,11 +147,9 @@ def model_inference(input_dict, history):
         images = []
 
     if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-        return
+        raise gr.Error("Please input a query and optionally image(s).")  # Fixed: gr.Error syntax
     if text == "" and images:
-        gr.Error("Please input a text query along with the image(s).")
-        return
+        raise gr.Error("Please input a text query along with the image(s).")  # Fixed: gr.Error syntax
 
     messages = [
         {
@@ -179,17 +177,65 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 
-    return tts("Munul")
+    # This will only be reached after streaming is complete
+    # Generate TTS for the final buffer content
+    audio_path = tts(buffer)
+    return audio_path  # Return the audio file path
 
-demo = gr.ChatInterface(
-    fn=model_inference,
-    description="# oolel-vision-experimental `@video-infer for video understanding`**",
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
-    outputs=gr.Audio(label="Generated Speech"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-    cache_examples=False,
-)
+# Alternative approach: Use regular Interface instead of ChatInterface
+def combined_inference(input_dict, history):
+    """Modified function that returns both text and audio"""
+    text_response = ""
+
+    # Get the streaming response
+    for response in model_inference(input_dict, history):
+        text_response = response
+
+    # Generate audio from final text
+    audio_path = tts(text_response)
+
+    return text_response, audio_path
 
-demo.launch(debug=True)
+# Option 1: Use regular Interface (recommended)
+with gr.Blocks() as demo:
+    gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")
+
+    chatbot = gr.Chatbot()
+    msg = gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple"
+    )
+    audio_output = gr.Audio(label="Generated Speech")
+    clear = gr.Button("Clear")
+
+    def respond(message, chat_history):
+        # Get text response through streaming
+        text_response = ""
+        for response in model_inference(message, chat_history):
+            text_response = response
+
+        # Add to chat history
+        chat_history.append([message["text"], text_response])
+
+        # Generate audio
+        audio_path = tts(text_response)
+
+        return "", chat_history, audio_path
+
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
+    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+
+# Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
+# demo = gr.ChatInterface(
+#     fn=model_inference,
+#     description="# oolel-vision-experimental `@video-infer for video understanding`**",
+#     fill_height=True,
+#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+#     stop_btn="Stop Generation",
+#     multimodal=True,
+#     cache_examples=False,
+# )
+
+if __name__ == "__main__":
+    demo.launch(debug=True)
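For the new Blocks wiring: msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output]) feeds the multimodal textbox value and the chat history into respond, then maps its three return values back onto the textbox (cleared with ""), the chatbot, and the audio player. A hypothetical offline check of that contract, assuming the vision model and tts() above are loaded:

sample_message = {"text": "Describe the scene", "files": []}
history = []

cleared_text, new_history, audio_path = respond(sample_message, history)

assert cleared_text == ""                              # textbox is reset after submit
assert new_history[-1][0] == sample_message["text"]    # user turn appended to history
print("speech written to:", audio_path)                # wav path produced by tts()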