Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -175,40 +175,18 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
 
     audio, sr = ref_audio
-
-    # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
-    audio_np = audio.cpu().numpy().astype(np.float16)
-
-    # Convert NumPy audio array to PyDub AudioSegment
-    audio_segment = AudioSegment(
-        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
-        frame_rate=sr,
-        sample_width=2,  # 16-bit audio
-        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
-    )
-
-    # Remove silence using the custom function
-    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
-
-    # Convert trimmed audio back to a PyTorch tensor with dtype=float16
-    audio = torch.tensor(
-        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
-        dtype=torch.float16
-    ).unsqueeze(0)  # Add batch/channel dimension
-
-    # Normalize and resample
     if audio.shape[0] > 1:
-        audio =
+        audio = torch.mean(audio, dim=0, keepdim=True)
 
-    rms = torch.sqrt(
+    rms = torch.sqrt(torch.mean(torch.square(audio)))
     if rms < target_rms:
-        audio
-
+        audio = audio * target_rms / rms
     if sr != target_sample_rate:
-
-
-
+        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+        audio = resampler(audio)
+
 
+    audio = audio.to(device)
 
 
     generated_waves = []

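After this change, infer_batch no longer round-trips the reference audio through NumPy and pydub; what remains is a mono downmix, an RMS gain for quiet clips, and a resample. A minimal standalone sketch of that surviving logic, assuming illustrative defaults for target_rms and target_sample_rate (neither value appears in this hunk) and a hypothetical helper name preprocess_ref_audio:

import torch
import torchaudio


def preprocess_ref_audio(audio: torch.Tensor, sr: int,
                         target_rms: float = 0.1,
                         target_sample_rate: int = 24000) -> torch.Tensor:
    # Downmix multi-channel reference audio to mono
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Boost quiet references up to the target RMS level (assumed default; never attenuates)
    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms

    # Resample to the rate the model expects (assumed default)
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)

    return audio
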
@@ -341,6 +319,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
     gr.Info("Converting audio...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
+        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
 
         non_silent_segs = silence.split_on_silence(
             aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000