Spaces:

anuj-exe
/

textSpeaker

Sleeping

App Files Community

anuj-exe committed on Sep 27

Commit

b1888cb

verified ·

1 Parent(s): 84f8bc9

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -8

app.py CHANGED Viewed

@@ -9,14 +9,28 @@ import numpy as np
 app = FastAPI(title="SpeechT5 TTS API")
 NORMALIZATION_LEVEL = 0.1
 SMOOTHING_WINDOW = 3
 BIT_DEPTH = "32f"
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 def load_speaker_embedding(url: str) -> torch.Tensor:
     response = requests.get(url)
@@ -25,11 +39,6 @@ def load_speaker_embedding(url: str) -> torch.Tensor:
     return embedding.unsqueeze(0)
-speaker_embeddings = load_speaker_embedding(
-    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
-)
 def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
     if window_size < 2:
         return audio
@@ -42,23 +51,36 @@ def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
 @app.get("/speak")
-def speak(text: str = Query(..., description="Text to convert to speech")):
     inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     audio = speech.numpy().astype(np.float32)
     audio = smooth_audio(audio, SMOOTHING_WINDOW)
     peak = np.max(np.abs(audio))
     if peak > 0:
         audio = (audio / peak) * NORMALIZATION_LEVEL
     if BIT_DEPTH == "16":
         pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
     else:
-        pcm = audio
     buf = io.BytesIO()
     subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
     sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)

 app = FastAPI(title="SpeechT5 TTS API")
+# Adjustable parameters
 NORMALIZATION_LEVEL = 0.1
 SMOOTHING_WINDOW = 3
 BIT_DEPTH = "32f"
+# Load models once at startup
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# Map integer to speaker embedding URL
+SPEAKER_EMBEDDINGS = {
+    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",  # Normal
+    1: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin",  # US female 1
+    2: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_clb_arctic-wav-arctic_a0001.bin",  # US female 2
+    3: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0003.bin",  # US male 1
+    4: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_rms_arctic-wav-arctic_a0003.bin",  # US male 2
+    5: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_jmk_arctic-wav-arctic_a0002.bin",  # Canadian male
+    6: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_b0002.bin",  # Scottish male
+    7: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_ksp_arctic-wav-arctic_a0007.bin",  # Indian male
+}
 def load_speaker_embedding(url: str) -> torch.Tensor:
     response = requests.get(url)
     return embedding.unsqueeze(0)
 def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
     if window_size < 2:
         return audio
 @app.get("/speak")
+def speak(
+    text: str = Query(..., description="Text to convert to speech"),
+    speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)")
+):
+    # Load the selected speaker embedding
+    embedding_url = SPEAKER_EMBEDDINGS[speaker]
+    speaker_embedding = load_speaker_embedding(embedding_url)
+    # Prepare input
     inputs = processor(text=text, return_tensors="pt")
+    # Generate speech
+    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
     audio = speech.numpy().astype(np.float32)
+    # Smooth audio
     audio = smooth_audio(audio, SMOOTHING_WINDOW)
+    # Normalize after smoothing
     peak = np.max(np.abs(audio))
     if peak > 0:
         audio = (audio / peak) * NORMALIZATION_LEVEL
+    # Convert to bit depth
     if BIT_DEPTH == "16":
         pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
     else:
+        pcm = audio  # float32
+    # Write WAV
     buf = io.BytesIO()
     subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
     sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)