anuj-exe committed on
Commit
a24b1a1
·
verified ·
1 Parent(s): d744d3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -9
app.py CHANGED
@@ -2,11 +2,12 @@ from fastapi import FastAPI
2
  from fastapi.responses import StreamingResponse
3
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
4
  import torch
5
- import soundfile as sf
6
  import io
7
  import os
8
  import logging
 
9
  from pydub import AudioSegment
 
10
 
11
  logging.basicConfig(level=logging.INFO)
12
 
@@ -14,24 +15,31 @@ os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
14
 
15
  app = FastAPI()
16
 
17
- # Load processor & model
18
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
19
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
20
 
21
- # Dummy speaker embedding (flat voice)
22
  speaker_embeddings = torch.zeros((1, 512))
23
 
24
  @app.get("/")
25
  def home():
26
- return {"message": "Welcome to the SpeechT5 TTS API. Use /speak?text=Hello"}
 
 
 
 
 
 
27
 
28
  @app.get("/speak_wav")
29
  def speak_wav(text: str):
30
  inputs = processor(text=text, return_tensors="pt")
31
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
 
32
  buf = io.BytesIO()
33
- # PCM16 WAV
34
- sf.write(buf, speech.numpy(), 16000, format="WAV", subtype="PCM_16")
35
  buf.seek(0)
36
  return StreamingResponse(buf, media_type="audio/wav")
37
 
@@ -39,11 +47,16 @@ def speak_wav(text: str):
39
  def speak_mp3(text: str):
40
  inputs = processor(text=text, return_tensors="pt")
41
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
42
- # Convert to WAV buffer first
 
 
 
 
43
  wav_buf = io.BytesIO()
44
- sf.write(wav_buf, speech.numpy(), 16000, format="WAV", subtype="PCM_16")
45
  wav_buf.seek(0)
46
- # Convert WAV → MP3
 
47
  audio = AudioSegment.from_wav(wav_buf)
48
  mp3_buf = io.BytesIO()
49
  audio.export(mp3_buf, format="mp3")
 
2
  from fastapi.responses import StreamingResponse
3
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
4
  import torch
 
5
  import io
6
  import os
7
  import logging
8
+ import numpy as np
9
  from pydub import AudioSegment
10
+ import soundfile as sf
11
 
12
  logging.basicConfig(level=logging.INFO)
13
 
 
15
 
16
  app = FastAPI()
17
 
18
+ # Load model & processor
19
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
20
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
21
 
22
+ # Dummy speaker embedding
23
  speaker_embeddings = torch.zeros((1, 512))
24
 
25
@app.get("/")
def home():
    """Root endpoint: returns a short JSON hint pointing callers at the TTS routes."""
    welcome = {"message": "Welcome to SpeechT5 TTS API. Use /speak_wav or /speak_mp3"}
    return welcome
28
+
29
def float32_to_int16(waveform: np.ndarray) -> np.ndarray:
    """Convert a float waveform in [-1, 1] to int16 PCM samples.

    Generalized to accept any array-like (list, tuple, ndarray of any float
    dtype) — the input is first coerced with ``np.asarray``. Values outside
    [-1, 1] are clipped before scaling so they saturate instead of wrapping.

    Args:
        waveform: Array-like of float samples, nominally in [-1.0, 1.0].

    Returns:
        ``np.ndarray`` of dtype ``int16`` with the same shape as the input.
    """
    # Coerce to ndarray so plain Python sequences also work.
    arr = np.asarray(waveform, dtype=np.float32)
    # Clip first: without this, out-of-range floats would wrap around
    # when cast to int16 (e.g. 2.0 * 32767 overflows).
    arr = np.clip(arr, -1.0, 1.0)
    # 32767 is the max positive int16 value; truncation toward zero on cast.
    return (arr * 32767).astype(np.int16)
34
 
35
@app.get("/speak_wav")
def speak_wav(text: str):
    """Synthesize `text` with SpeechT5 and stream the result as a 16 kHz PCM16 WAV.

    Args:
        text: Query-string text to synthesize.

    Returns:
        StreamingResponse with media type ``audio/wav``.
    """
    encoded = processor(text=text, return_tensors="pt")
    waveform = model.generate_speech(encoded["input_ids"], speaker_embeddings)

    # Quantize the float waveform to int16 before writing the WAV container.
    pcm16 = float32_to_int16(waveform.numpy())
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, pcm16, 16000, format="WAV")
    wav_buffer.seek(0)
    return StreamingResponse(wav_buffer, media_type="audio/wav")
45
 
 
47
  def speak_mp3(text: str):
48
  inputs = processor(text=text, return_tensors="pt")
49
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
50
+
51
+ # Convert float32 -> int16
52
+ waveform_int16 = float32_to_int16(speech.numpy())
53
+
54
+ # WAV buffer
55
  wav_buf = io.BytesIO()
56
+ sf.write(wav_buf, waveform_int16, 16000, format="WAV")
57
  wav_buf.seek(0)
58
+
59
+ # Convert to MP3
60
  audio = AudioSegment.from_wav(wav_buf)
61
  mp3_buf = io.BytesIO()
62
  audio.export(mp3_buf, format="mp3")