tchvc

Runtime error

App Files Files Community

yaya-sy committed on Sep 2

Commit

1fb2dc5

verified ·

1 Parent(s): f55c7e6

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -2

app.py CHANGED Viewed

@@ -9,6 +9,34 @@ import cv2
 import numpy as np
 from PIL import Image
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
@@ -108,9 +136,8 @@ def model_inference(input_dict, history):
         buffer = ""
         for new_text in streamer:
             buffer += new_text
-            time.sleep(0.01)
             yield buffer
-        return
     if len(files) > 1:
         images = [load_image(image) for image in files]
@@ -152,11 +179,14 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# oolel-vision-experimental `@video-infer for video understanding`**",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,

 import numpy as np
 from PIL import Image
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+tts_model = ParlerTTSForConditionalGeneration.from_pretrained("CONCREE/Adia_TTS").to(device)
+tts_tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")
+@spaces.GPU
+def tts(text):
+    output_wav_path = tempfile.mktemp(suffix=".wav")
+    # Description du style vocal
+    description = "A clear and educational voice, with a flow adapted to learning"
+    # Génération
+    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
+    prompt_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
+    audio = tts_model.generate(
+        input_ids=input_ids,
+        prompt_input_ids=prompt_ids,
+    )
+    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), model.config.sampling_rate)
+    return output_wav_path
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
         buffer = ""
         for new_text in streamer:
             buffer += new_text
+            time.sleep(0.001)
             yield buffer
     if len(files) > 1:
         images = [load_image(image) for image in files]
         time.sleep(0.01)
         yield buffer
+    return tts("Munul")
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# oolel-vision-experimental `@video-infer for video understanding`**",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+    outputs=gr.Audio(label="Generated Speech")
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,