anuj-exe committed on
Commit
b1888cb
·
verified ·
1 Parent(s): 84f8bc9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -8
app.py CHANGED
@@ -9,14 +9,28 @@ import numpy as np
9
 
10
  app = FastAPI(title="SpeechT5 TTS API")
11
 
 
12
  NORMALIZATION_LEVEL = 0.1
13
  SMOOTHING_WINDOW = 3
14
  BIT_DEPTH = "32f"
15
 
 
16
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
17
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
18
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def load_speaker_embedding(url: str) -> torch.Tensor:
22
  response = requests.get(url)
@@ -25,11 +39,6 @@ def load_speaker_embedding(url: str) -> torch.Tensor:
25
  return embedding.unsqueeze(0)
26
 
27
 
28
- speaker_embeddings = load_speaker_embedding(
29
- "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
30
- )
31
-
32
-
33
  def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
34
  if window_size < 2:
35
  return audio
@@ -42,23 +51,36 @@ def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
42
 
43
 
44
  @app.get("/speak")
45
- def speak(text: str = Query(..., description="Text to convert to speech")):
 
 
 
 
 
 
 
 
46
  inputs = processor(text=text, return_tensors="pt")
47
 
48
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
 
49
  audio = speech.numpy().astype(np.float32)
50
 
 
51
  audio = smooth_audio(audio, SMOOTHING_WINDOW)
52
 
 
53
  peak = np.max(np.abs(audio))
54
  if peak > 0:
55
  audio = (audio / peak) * NORMALIZATION_LEVEL
56
 
 
57
  if BIT_DEPTH == "16":
58
  pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
59
  else:
60
- pcm = audio
61
 
 
62
  buf = io.BytesIO()
63
  subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
64
  sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)
 
9
 
10
  app = FastAPI(title="SpeechT5 TTS API")
11
 
12
+ # Adjustable parameters
13
  NORMALIZATION_LEVEL = 0.1
14
  SMOOTHING_WINDOW = 3
15
  BIT_DEPTH = "32f"
16
 
17
+ # Load models once at startup
18
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
19
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
20
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
21
 
22
+ # Map integer to speaker embedding URL
23
+ SPEAKER_EMBEDDINGS = {
24
+ 0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin", # Normal
25
+ 1: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin", # US female 1
26
+ 2: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_clb_arctic-wav-arctic_a0001.bin", # US female 2
27
+ 3: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0003.bin", # US male 1
28
+ 4: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_rms_arctic-wav-arctic_a0003.bin", # US male 2
29
+ 5: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_jmk_arctic-wav-arctic_a0002.bin", # Canadian male
30
+ 6: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_b0002.bin", # Scottish male
31
+ 7: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_ksp_arctic-wav-arctic_a0007.bin", # Indian male
32
+ }
33
+
34
 
35
  def load_speaker_embedding(url: str) -> torch.Tensor:
36
  response = requests.get(url)
 
39
  return embedding.unsqueeze(0)
40
 
41
 
 
 
 
 
 
42
  def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
43
  if window_size < 2:
44
  return audio
 
51
 
52
 
53
  @app.get("/speak")
54
+ def speak(
55
+ text: str = Query(..., description="Text to convert to speech"),
56
+ speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)")
57
+ ):
58
+ # Load the selected speaker embedding
59
+ embedding_url = SPEAKER_EMBEDDINGS[speaker]
60
+ speaker_embedding = load_speaker_embedding(embedding_url)
61
+
62
+ # Prepare input
63
  inputs = processor(text=text, return_tensors="pt")
64
 
65
+ # Generate speech
66
+ speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
67
  audio = speech.numpy().astype(np.float32)
68
 
69
+ # Smooth audio
70
  audio = smooth_audio(audio, SMOOTHING_WINDOW)
71
 
72
+ # Normalize after smoothing
73
  peak = np.max(np.abs(audio))
74
  if peak > 0:
75
  audio = (audio / peak) * NORMALIZATION_LEVEL
76
 
77
+ # Convert to bit depth
78
  if BIT_DEPTH == "16":
79
  pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
80
  else:
81
+ pcm = audio # float32
82
 
83
+ # Write WAV
84
  buf = io.BytesIO()
85
  subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
86
  sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)