	Update app.py
app.py CHANGED
@@ -175,33 +175,40 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,


     audio, sr = ref_audio
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)

-    rms = torch.sqrt(torch.mean(torch.square(audio)))
-    if rms < target_rms:
-        audio = audio * target_rms / rms
-    if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
+    # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
+    audio_np = audio.cpu().numpy().astype(np.float16)

-    # Convert to numpy
-    audio_np = audio.squeeze().cpu().numpy()
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        audio_np.tobytes(),
-        frame_rate=sr,
-        sample_width=2,
-        channels=1
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
+        frame_rate=sr,
+        sample_width=2,  # 16-bit audio
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )

-    # Remove silence
-    audio_segment = remove_silence_edges(audio_segment)
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert trimmed audio back to a PyTorch tensor with dtype=float16
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
+        dtype=torch.float16
+    ).unsqueeze(0)  # Add batch/channel dimension
+
+    # Normalize and resample
+    if audio.shape[0] > 1:
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
+
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
+    if rms < target_rms:
+        audio *= target_rms / rms  # Adjust RMS
+
+    if sr != target_sample_rate:
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)

-    # Convert back to tensor
-    audio_np_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float16) / (2**15)
-    audio = torch.from_numpy(audio_np_trimmed).unsqueeze(0)
+    audio = audio.to(device)  # Move to target device

-    audio = audio.to(device)


     generated_waves = []
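
For context, the rewritten preprocessing path (float tensor to 16-bit PCM, silence trim at the edges, then back to a normalized and resampled tensor) can be exercised outside the Space roughly as below. This is a minimal sketch, not the Space's code: trim_silence_edges is a hypothetical stand-in for app.py's remove_silence_edges, built on pydub's detect_leading_silence; the target_rms and target_sample_rate defaults are assumptions (in app.py they are module-level values); float32 replaces the diff's float16 round trip; and the mono downmix happens before the byte conversion rather than after, as in the diff.

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_leading_silence


def trim_silence_edges(seg: AudioSegment, silence_threshold: float = -42.0) -> AudioSegment:
    # Hypothetical stand-in for app.py's remove_silence_edges:
    # drop audio quieter than the threshold from both ends of the segment.
    start = detect_leading_silence(seg, silence_threshold=silence_threshold)
    end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[start:len(seg) - end]


def preprocess_ref_audio(
    audio: torch.Tensor,              # float waveform in [-1, 1], shape (channels, samples)
    sr: int,
    target_rms: float = 0.1,          # assumed default; module-level value in app.py
    target_sample_rate: int = 24000,  # assumed default; module-level value in app.py
    device: str = "cpu",
) -> torch.Tensor:
    # Downmix to mono up front so the PCM byte round trip stays simple
    if audio.dim() > 1 and audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    # Tensor -> 16-bit PCM bytes for pydub (clip first so +/-1.0 cannot overflow int16)
    audio_np = audio.squeeze().cpu().numpy().astype(np.float32)
    pcm16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)
    seg = AudioSegment(pcm16.tobytes(), frame_rate=sr, sample_width=2, channels=1)

    # Trim silent edges, mirroring the diff's remove_silence_edges(..., silence_threshold=-42)
    seg = trim_silence_edges(seg)

    # Back to a float tensor rescaled to [-1, 1]
    samples = np.array(seg.get_array_of_samples(), dtype=np.float32) / 32768.0
    audio = torch.from_numpy(samples).unsqueeze(0)

    # Boost quiet references up to the target RMS, as in the diff
    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms

    # Resample if the reference rate differs from the model's rate
    if sr != target_sample_rate:
        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)

    return audio.to(device)


# Example: a quiet 440 Hz tone with 0.2 s of silence on each side
t = torch.linspace(0, 2.0, int(2.0 * 22050))
wave = torch.cat([torch.zeros(4410), 0.05 * torch.sin(2 * torch.pi * 440 * t), torch.zeros(4410)])
out = preprocess_ref_audio(wave.unsqueeze(0), sr=22050)
print(out.shape, out.pow(2).mean().sqrt())  # trimmed, RMS-boosted, resampled to 24 kHz

One detail worth noting: the committed code scales by 32768 on the way into pydub and divides by 32768 on the way out, so a sample at exactly 1.0 would overflow int16; the sketch multiplies by 32767 and clips first to avoid that edge case.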