Update app.py
app.py
CHANGED
@@ -8,6 +8,7 @@ import spaces
 import cv2
 import numpy as np
 from PIL import Image
+import tempfile

 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer
@@ -22,7 +23,7 @@ tts_tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")
 def tts(text):
     output_wav_path = tempfile.mktemp(suffix=".wav")
     # Description du style vocal
-    description = "A clear and educational voice, with a flow adapted
+    description = "A clear and educational voice, with a flow adapted to learning"

     # Génération
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
@@ -32,7 +33,7 @@ def tts(text):
         input_ids=input_ids,
         prompt_input_ids=prompt_ids,
     )
-    sf.write(output_wav_path, audio.cpu().numpy().squeeze(),
+    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)  # Fixed: was 'model.config'

     return output_wav_path

@@ -99,14 +100,12 @@ def model_inference(input_dict, history):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
         if not files:
-            gr.Error("Please upload a video file along with your @video-infer query.")
-            return
+            raise gr.Error("Please upload a video file along with your @video-infer query.")  # Fixed: gr.Error syntax
         # Assume the first file is a video.
         video_path = files[0]
         frames = downsample_video(video_path)
         if not frames:
-            gr.Error("Could not process video.")
-            return
+            raise gr.Error("Could not process video.")  # Fixed: gr.Error syntax
         # Build messages: start with the text prompt.
         messages = [
             {
@@ -138,6 +137,7 @@ def model_inference(input_dict, history):
             buffer += new_text
             time.sleep(0.001)
             yield buffer
+        return  # Fixed: Added return to prevent falling through

     if len(files) > 1:
         images = [load_image(image) for image in files]
@@ -147,11 +147,9 @@ def model_inference(input_dict, history):
         images = []

     if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-        return
+        raise gr.Error("Please input a query and optionally image(s).")  # Fixed: gr.Error syntax
     if text == "" and images:
-        gr.Error("Please input a text query along with the image(s).")
-        return
+        raise gr.Error("Please input a text query along with the image(s).")  # Fixed: gr.Error syntax

     messages = [
         {
@@ -179,17 +177,65 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer

-
+    # This will only be reached after streaming is complete
+    # Generate TTS for the final buffer content
+    audio_path = tts(buffer)
+    return audio_path  # Return the audio file path

-
-
-
-
-
-
-
-
-
-
+# Alternative approach: Use regular Interface instead of ChatInterface
+def combined_inference(input_dict, history):
+    """Modified function that returns both text and audio"""
+    text_response = ""
+
+    # Get the streaming response
+    for response in model_inference(input_dict, history):
+        text_response = response
+
+    # Generate audio from final text
+    audio_path = tts(text_response)
+
+    return text_response, audio_path

-
+# Option 1: Use regular Interface (recommended)
+with gr.Blocks() as demo:
+    gr.Markdown("# oolel-vision-experimental `@video-infer for video understanding`")
+
+    chatbot = gr.Chatbot()
+    msg = gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple"
+    )
+    audio_output = gr.Audio(label="Generated Speech")
+    clear = gr.Button("Clear")
+
+    def respond(message, chat_history):
+        # Get text response through streaming
+        text_response = ""
+        for response in model_inference(message, chat_history):
+            text_response = response
+
+        # Add to chat history
+        chat_history.append([message["text"], text_response])
+
+        # Generate audio
+        audio_path = tts(text_response)
+
+        return "", chat_history, audio_path
+
+    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
+    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
+
+# Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
+# demo = gr.ChatInterface(
+#     fn=model_inference,
+#     description="# oolel-vision-experimental `@video-infer for video understanding`**",
+#     fill_height=True,
+#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+#     stop_btn="Stop Generation",
+#     multimodal=True,
+#     cache_examples=False,
+# )
+
+if __name__ == "__main__":
+    demo.launch(debug=True)
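The recurring "Fixed: gr.Error syntax" change above reflects how Gradio error handling works: calling gr.Error(...) on its own only constructs an exception object and discards it, so the user never sees the message and the handler keeps running. Raising it is what surfaces the error in the UI and stops the event. A minimal standalone sketch of the corrected pattern (not part of this Space's code; the function and labels below are illustrative only):

import gradio as gr

def check_upload(files):
    if not files:
        # gr.Error must be raised; Gradio then shows the message in the UI
        # and halts the event instead of continuing with missing input.
        raise gr.Error("Please upload at least one file.")
    return f"Received {len(files)} file(s)."

demo = gr.Interface(
    fn=check_upload,
    inputs=gr.File(file_count="multiple"),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()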