anuj-exe committed on
Commit
c567be8
·
verified ·
1 Parent(s): a2d5f3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -29
app.py CHANGED
@@ -4,61 +4,54 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
4
  import torch
5
  import io
6
  import os
7
- import logging
8
  import numpy as np
9
- from pydub import AudioSegment
10
  import soundfile as sf
 
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
 
 
14
  os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
15
 
16
- app = FastAPI()
17
 
18
- # Load model & processor
19
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
20
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
21
 
22
- # Dummy speaker embedding
23
- speaker_embeddings = torch.zeros((1, 512))
24
 
25
  @app.get("/")
26
  def home():
27
- return {"message": "Welcome to SpeechT5 TTS API. Use /speak_wav or /speak_mp3"}
28
-
29
- def float32_to_int16(waveform: np.ndarray):
30
- """Convert float32 [-1,1] waveform to int16 PCM"""
31
- waveform = np.clip(waveform, -1.0, 1.0)
32
- waveform_int16 = (waveform * 32767).astype(np.int16)
33
- return waveform_int16
34
 
35
- @app.get("/speak_wav")
36
- def speak_wav(text: str):
 
37
  inputs = processor(text=text, return_tensors="pt")
38
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
39
-
40
- buf = io.BytesIO()
41
- waveform_int16 = float32_to_int16(speech.numpy())
42
- sf.write(buf, waveform_int16, 16000, format="WAV")
43
- buf.seek(0)
44
- return StreamingResponse(buf, media_type="audio/wav")
45
 
46
- @app.get("/speak_mp3")
47
- def speak_mp3(text: str):
48
- inputs = processor(text=text, return_tensors="pt")
49
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
50
 
51
- # Convert float32 -> int16
52
- waveform_int16 = float32_to_int16(speech.numpy())
53
 
54
- # WAV buffer
 
 
 
 
55
  wav_buf = io.BytesIO()
56
  sf.write(wav_buf, waveform_int16, 16000, format="WAV")
57
  wav_buf.seek(0)
58
 
59
- # Convert to MP3
60
  audio = AudioSegment.from_wav(wav_buf)
61
  mp3_buf = io.BytesIO()
62
  audio.export(mp3_buf, format="mp3")
63
  mp3_buf.seek(0)
 
 
64
  return StreamingResponse(mp3_buf, media_type="audio/mpeg")
 
4
  import torch
5
  import io
6
  import os
 
7
  import numpy as np
 
8
  import soundfile as sf
9
+ from pydub import AudioSegment
10
+ import logging
11
 
12
  logging.basicConfig(level=logging.INFO)
13
 
14
+ # Use a writable cache directory in Spaces
15
  os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
16
 
17
+ app = FastAPI(title="SpeechT5 TTS API")
18
 
19
+ # Load processor and model
20
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
21
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
22
 
23
+ # Dummy speaker embedding (all-zero placeholder vector)
24
+ speaker_embeddings = torch.zeros(1, 512) # shape (1, 512)
25
 
26
  @app.get("/")
27
  def home():
28
+ return {"message": "Welcome to SpeechT5 TTS API. Use /speak?text=Hello"}
 
 
 
 
 
 
29
 
30
+ @app.get("/speak")
31
+ def speak(text: str):
32
+ # 1️⃣ Tokenize input text
33
  inputs = processor(text=text, return_tensors="pt")
 
 
 
 
 
 
 
34
 
35
+ # 2️⃣ Generate speech waveform
 
 
36
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
37
 
38
+ # 3️⃣ Drop any size-1 dimensions from the waveform tensor
39
+ speech = speech.squeeze() # remove extra dimensions if any
40
 
41
+ # 4️⃣ Convert float32 waveform [-1,1] -> int16 PCM
42
+ waveform_int16 = np.clip(speech.numpy(), -1.0, 1.0) * 32767
43
+ waveform_int16 = waveform_int16.astype(np.int16)
44
+
45
+ # 5️⃣ Write to WAV buffer
46
  wav_buf = io.BytesIO()
47
  sf.write(wav_buf, waveform_int16, 16000, format="WAV")
48
  wav_buf.seek(0)
49
 
50
+ # 6️⃣ Convert WAV -> MP3 for browser-friendly playback
51
  audio = AudioSegment.from_wav(wav_buf)
52
  mp3_buf = io.BytesIO()
53
  audio.export(mp3_buf, format="mp3")
54
  mp3_buf.seek(0)
55
+
56
+ # 7️⃣ Return MP3 audio
57
  return StreamingResponse(mp3_buf, media_type="audio/mpeg")