anuj-exe committed
Commit e37d2df · verified · 1 Parent(s): 0a85b62

Update app.py

Files changed (1):
  1. app.py +11 -43
app.py CHANGED
@@ -1,55 +1,23 @@
  from fastapi import FastAPI
  from fastapi.responses import StreamingResponse
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
- import torch
+ from transformers import pipeline
  import io
  import soundfile as sf
- from pydub import AudioSegment
- import numpy as np
- import os
-
- os.environ["HF_HOME"] = "/home/user/.cache/huggingface"

  app = FastAPI()

- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-
- # Preload a reference audio for speaker embedding
- # Download a small sample from HuggingFace or use your own
- ref_audio_file = "reference.wav"  # must exist in your repo/files
-
- # Load reference audio and compute embedding
- import torchaudio
- speech_array, sr = torchaudio.load(ref_audio_file)
- speech_array = speech_array.mean(dim=0, keepdim=True)  # mono
- speaker_embeddings = model.get_speaker_embedding(speech_array, sampling_rate=sr)
+ # Load TTS pipeline
+ tts = pipeline("text-to-speech", model="suno/bark-small")

  @app.get("/speak")
  def speak(text: str):
-     inputs = processor(text=text, return_tensors="pt")
-
-     # Generate speech
-     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-
-     # Flatten to 1D
-     speech = speech.squeeze().detach().cpu().numpy()
-     if speech.ndim > 1:
-         speech = speech.mean(axis=-1)
-
-     # Convert to int16
-     waveform_int16 = np.clip(speech, -1.0, 1.0) * 32767
-     waveform_int16 = waveform_int16.astype(np.int16)
-
-     # WAV buffer
-     wav_buf = io.BytesIO()
-     sf.write(wav_buf, waveform_int16, 16000, format="WAV", subtype="PCM_16")
-     wav_buf.seek(0)
+     # Generate speech (numpy float32 array)
+     output = tts(text)
+     audio = output["audio"]

-     # Convert to MP3
-     audio = AudioSegment.from_wav(wav_buf)
-     mp3_buf = io.BytesIO()
-     audio.export(mp3_buf, format="mp3")
-     mp3_buf.seek(0)
+     # Write WAV to in-memory buffer
+     buf = io.BytesIO()
+     sf.write(buf, audio, 24000, format="WAV")  # Bark uses 24kHz
+     buf.seek(0)

-     return StreamingResponse(mp3_buf, media_type="audio/mpeg")
+     return StreamingResponse(buf, media_type="audio/wav")
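
The transformers text-to-speech pipeline returns both the waveform and its sampling rate (`output["sampling_rate"]`), so the 24000 hardcoded in the new handler can instead be read from the pipeline output. Below is a minimal sketch of that variant, assuming the same `suno/bark-small` pipeline as the commit; the `np.squeeze` is a defensive guard in case the audio array comes back 2-D, which can vary across pipeline versions.

import io

import numpy as np
import soundfile as sf
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from transformers import pipeline

app = FastAPI()
tts = pipeline("text-to-speech", model="suno/bark-small")

@app.get("/speak")
def speak(text: str):
    output = tts(text)
    # Sampling rate as reported by the pipeline (24 kHz for Bark).
    rate = output["sampling_rate"]
    # Flatten defensively in case the array has shape (1, n_samples).
    audio = np.squeeze(output["audio"])
    buf = io.BytesIO()
    sf.write(buf, audio, rate, format="WAV")
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")

Served with `uvicorn app:app`, the endpoint can be exercised with, e.g., `curl "http://localhost:8000/speak?text=hello" --output hello.wav`.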