yaya-sy committed
Commit 1fb2dc5 · verified · 1 Parent(s): f55c7e6

Update app.py

Files changed (1)
  1. app.py +32 -2
app.py CHANGED
@@ -9,6 +9,34 @@ import cv2
 import numpy as np
 from PIL import Image
 
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+tts_model = ParlerTTSForConditionalGeneration.from_pretrained("CONCREE/Adia_TTS").to(device)
+tts_tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")
+
+@spaces.GPU
+def tts(text):
+    output_wav_path = tempfile.mktemp(suffix=".wav")
+    # Voice style description
+    description = "A clear and educational voice, with a flow adapted to learning"
+
+    # Generation
+    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
+    prompt_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+    audio = tts_model.generate(
+        input_ids=input_ids,
+        prompt_input_ids=prompt_ids,
+    )
+    sf.write(output_wav_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)
+
+    return output_wav_path
+
+
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
@@ -108,9 +136,8 @@ def model_inference(input_dict, history):
         buffer = ""
         for new_text in streamer:
             buffer += new_text
-            time.sleep(0.01)
+            time.sleep(0.001)
             yield buffer
-        return
 
     if len(files) > 1:
         images = [load_image(image) for image in files]
@@ -152,11 +179,14 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 
+    return tts("Munul")
+
 demo = gr.ChatInterface(
     fn=model_inference,
     description="# oolel-vision-experimental `@video-infer for video understanding`**",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+    outputs=gr.Audio(label="Generated Speech"),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,
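
For readers who want to exercise the new speech path on its own, here is a minimal, hypothetical harness; it is not part of the commit. It assumes it runs inside the same app.py (so the tts() helper and the CONCREE/Adia_TTS weights loaded in the diff above are available), and the speak() wrapper, the textbox label, and the use of gr.Interface are illustrative choices only.

import gradio as gr

def speak(text: str) -> str:
    # tts() is the helper added in this commit; it writes a .wav file and returns its path.
    return tts(text)

# gr.Interface takes an explicit outputs= argument, so the returned file path can be
# played back directly through a gr.Audio component.
tts_demo = gr.Interface(
    fn=speak,
    inputs=gr.Textbox(label="Text to speak"),
    outputs=gr.Audio(label="Generated Speech"),
)

if __name__ == "__main__":
    tts_demo.launch()

Calling tts() directly and opening the returned .wav file gives the same sanity check without any UI.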