Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -38,9 +38,6 @@ def parse_multilingual_text(input_text):
|
|
| 38 |
def generate_segment_audio(text, lang, speaker_url, pipe):
|
| 39 |
if not isinstance(text, str):
|
| 40 |
text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
|
| 41 |
-
|
| 42 |
-
# Generating stoks (tokens<pl>) from text
|
| 43 |
-
# stoks = pipe.t2s.generate([text], lang=[lang])
|
| 44 |
audio_data = pipe.generate(text, speaker_url, lang)
|
| 45 |
resample_audio = resampler(newsr=24000)
|
| 46 |
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
|
@@ -48,33 +45,10 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
|
|
| 48 |
print("Shape after resampling:", audio_np.shape) # Debug statement
|
| 49 |
return audio_np
|
| 50 |
|
| 51 |
-
# Function to append and concatenate audio segments with padding
|
| 52 |
def concatenate_audio_segments(segments):
|
| 53 |
-
# # Determine the length of the longest segment
|
| 54 |
-
# max_length = max(seg.shape[0] for seg in segments)
|
| 55 |
-
# print("Max length of segments:", max_length) # Debug statement
|
| 56 |
-
# # Pad each segment to the length of the longest segment and stack them
|
| 57 |
-
# padded_segments = []
|
| 58 |
-
# for seg in segments:
|
| 59 |
-
# # Check if the segment is stereo; if not, convert it to stereo
|
| 60 |
-
# if seg.ndim == 1 or seg.shape[1] == 1:
|
| 61 |
-
# stereo_segment = np.stack((seg, seg), axis=-1)
|
| 62 |
-
# else:
|
| 63 |
-
# stereo_segment = seg
|
| 64 |
-
|
| 65 |
-
# Pad the segment to the max length
|
| 66 |
-
# padding_length = max_length - stereo_segment.shape[0]
|
| 67 |
-
# padded_segment = np.pad(stereo_segment, ((0, padding_length), (0, 0)), 'constant')
|
| 68 |
-
# print("Padded segment shape:", padded_segment.shape) # Debug statement
|
| 69 |
-
# padded_segments.append(padded_segment)
|
| 70 |
-
|
| 71 |
concatenated_audio = np.concatenate(segments , axis=1)
|
| 72 |
-
|
| 73 |
-
print("Concatenated audio shape:", concatenated_audio.shape) # Debug statement
|
| 74 |
-
# concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
| 75 |
return concatenated_audio
|
| 76 |
|
| 77 |
-
# The rest of the code in app.py remains the same
|
| 78 |
|
| 79 |
@spaces.GPU
|
| 80 |
def whisper_speech_demo(multilingual_text, speaker_audio):
|
|
@@ -94,10 +68,8 @@ def whisper_speech_demo(multilingual_text, speaker_audio):
|
|
| 94 |
|
| 95 |
concatenated_audio = concatenate_audio_segments(audio_segments)
|
| 96 |
print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
|
| 97 |
-
# Normalize the concatenated audio
|
| 98 |
concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
| 99 |
|
| 100 |
-
# Write the audio data to a temporary file and return the file path
|
| 101 |
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
|
| 102 |
sf.write(tmp_file.name, concatenated_audio.T, 24000, format='WAV', subtype='PCM_16')
|
| 103 |
return tmp_file.name
|
|
|
|
| 38 |
def generate_segment_audio(text, lang, speaker_url, pipe):
|
| 39 |
if not isinstance(text, str):
|
| 40 |
text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
|
|
|
|
|
|
|
|
|
|
| 41 |
audio_data = pipe.generate(text, speaker_url, lang)
|
| 42 |
resample_audio = resampler(newsr=24000)
|
| 43 |
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
|
|
|
| 45 |
print("Shape after resampling:", audio_np.shape) # Debug statement
|
| 46 |
return audio_np
|
| 47 |
|
|
|
|
| 48 |
def concatenate_audio_segments(segments):
    """Join audio segments end-to-end along the time axis.

    Parameters
    ----------
    segments : sequence of np.ndarray
        Channels-first arrays shaped (channels, samples); every segment
        must have the same channel count.  # assumes channels-first — the
        # caller later transposes with .T before writing, so axis 1 is time.

    Returns
    -------
    np.ndarray
        A single (channels, total_samples) array with the segments in order.
    """
    # axis=1 appends along the sample dimension, preserving channel layout.
    joined = np.concatenate(tuple(segments), axis=1)
    return joined
|
| 51 |
|
|
|
|
| 52 |
|
| 53 |
@spaces.GPU
|
| 54 |
def whisper_speech_demo(multilingual_text, speaker_audio):
|
|
|
|
| 68 |
|
| 69 |
concatenated_audio = concatenate_audio_segments(audio_segments)
|
| 70 |
print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
|
|
|
|
| 71 |
concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
| 72 |
|
|
|
|
| 73 |
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
|
| 74 |
sf.write(tmp_file.name, concatenated_audio.T, 24000, format='WAV', subtype='PCM_16')
|
| 75 |
return tmp_file.name
|