Spaces:
Paused
Paused
Amamrnaf
committed on
Commit
·
f023da7
1
Parent(s):
c45f1ab
changes
Browse files- coqui_tts.py +22 -3
- metaVoice.py +30 -0
coqui_tts.py
CHANGED
|
@@ -11,13 +11,32 @@ import shutil
|
|
| 11 |
import pyloudnorm as pyln
|
| 12 |
import torch
|
| 13 |
from TTS.api import TTS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
-
def run_audio_generation_v1(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
gpu = True if torch.cuda.is_available() else False
|
| 19 |
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=gpu) # gpu should be true when server (cuda)
|
| 20 |
-
|
|
|
|
| 21 |
# pre-process story audio file
|
| 22 |
# convert to 16 bit mono
|
| 23 |
# remove noise
|
|
@@ -26,7 +45,7 @@ def run_audio_generation_v1(text,accent='None'):
|
|
| 26 |
sf.write('./tmp/audio/speaker_wav.wav', speaker_wav_data_no_noise, speaker_wav_rate, subtype='PCM_16')
|
| 27 |
|
| 28 |
tts.tts_to_file(
|
| 29 |
-
|
| 30 |
speaker_wav="./tmp/audio/speaker_wav.wav",
|
| 31 |
language="en",
|
| 32 |
file_path="audio/output.wav"
|
|
|
|
| 11 |
import pyloudnorm as pyln
|
| 12 |
import torch
|
| 13 |
from TTS.api import TTS
|
| 14 |
+
import string
|
| 15 |
+
|
| 16 |
+
def remove_punctuation(sentence):
    """Strip all ASCII punctuation from *sentence* and flatten line breaks.

    Every character in ``string.punctuation`` is deleted in one pass, then
    ``'\\n'`` is replaced by a single space and ``'\\r'`` is dropped, yielding
    a single punctuation-free line.
    """
    # Single C-level pass removes every punctuation character at once.
    stripped = sentence.translate(str.maketrans('', '', string.punctuation))
    # Flatten line breaks: newline -> space, carriage return -> removed.
    return stripped.replace('\n', ' ').replace('\r', '')
|
| 24 |
|
| 25 |
|
| 26 |
+
def run_audio_generation_v1(new_text,accent='None'):
|
| 27 |
+
|
| 28 |
+
new_text = new_text.replace('\n', ' ').replace('\r', '')
|
| 29 |
+
new_text_mod = remove_punctuation(new_text)
|
| 30 |
+
|
| 31 |
+
new_text_split = new_text_mod.split()
|
| 32 |
+
for word in new_text_split:
|
| 33 |
+
if len(word)>=2 and word.isupper():
|
| 34 |
+
new_text = new_text.replace(word, " ".join([*word]))
|
| 35 |
+
|
| 36 |
gpu = True if torch.cuda.is_available() else False
|
| 37 |
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=gpu) # gpu should be true when server (cuda)
|
| 38 |
+
# if not gpu:
|
| 39 |
+
|
| 40 |
# pre-process story audio file
|
| 41 |
# convert to 16 bit mono
|
| 42 |
# remove noise
|
|
|
|
| 45 |
sf.write('./tmp/audio/speaker_wav.wav', speaker_wav_data_no_noise, speaker_wav_rate, subtype='PCM_16')
|
| 46 |
|
| 47 |
tts.tts_to_file(
|
| 48 |
+
new_text,
|
| 49 |
speaker_wav="./tmp/audio/speaker_wav.wav",
|
| 50 |
language="en",
|
| 51 |
file_path="audio/output.wav"
|
metaVoice.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fam.llm.fast_inference import TTS
|
| 2 |
+
import string
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
|
| 5 |
+
def remove_punctuation(sentence):
    """Return *sentence* with ASCII punctuation deleted and line breaks flattened.

    Punctuation (``string.punctuation``) is removed first; afterwards each
    ``'\\n'`` becomes a space and each ``'\\r'`` is removed entirely.
    """
    translation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(translation)
    cleaned = cleaned.replace('\n', ' ')
    cleaned = cleaned.replace('\r', '')
    return cleaned
|
| 13 |
+
|
| 14 |
+
def run_audio_generation_v2(new_text, accent='None'):
    """Synthesise *new_text* to ``audio/output.wav`` with the MetaVoice TTS model.

    Parameters
    ----------
    new_text : str
        Text to speak. Line breaks are flattened to spaces, and any all-caps
        token of two or more characters is spelled out letter by letter
        (e.g. "USA" -> "U S A") so the model pronounces it as an acronym.
    accent : str, optional
        Currently unused; kept for interface parity with the v1 generator.

    Side effects: reads ``./tmp/audio/speaker_wav.wav`` as the speaker
    reference and writes the synthesised audio to ``audio/output.wav``
    at 22050 Hz.
    """
    import re  # local import: only this function needs it

    tts = TTS()

    # Flatten line breaks so the model sees one continuous sentence.
    new_text = new_text.replace('\n', ' ').replace('\r', '')

    # Scan a punctuation-free copy for acronyms, but edit the original text
    # (which still carries its punctuation).
    for word in remove_punctuation(new_text).split():
        if len(word) >= 2 and word.isupper():
            # \b anchors fix the original raw-substring bug: spacing out
            # "AI" must not corrupt a longer word like "MAIL" -> "MA IL".
            spaced = " ".join(word)
            new_text = re.sub(rf'\b{re.escape(word)}\b', spaced, new_text)

    wav_file = tts.synthesise(
        text=new_text,
        spk_ref_path="./tmp/audio/speaker_wav.wav"  # any speaker reference file (WAV, OGG, MP3, FLAC, etc.)
    )
    sf.write('audio/output.wav', wav_file, samplerate=22050)
|
| 29 |
+
|
| 30 |
+
|