	Update app.py
app.py CHANGED
@@ -176,38 +176,38 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     audio, sr = ref_audio
 
-    # Convert …
+    # Convert PyTorch tensor to NumPy array before scaling and processing
+    audio_np = audio.cpu().numpy()  # Convert to NumPy (if it's a tensor)
+
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        (audio * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
         frame_rate=sr,
         sample_width=2,  # 16-bit audio
-        channels=1 if len(audio.shape) == 1 else audio.shape[0]  # Mono or multi-channel
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )
 
-    # Remove silence using …
-    …
-
-    # Convert back to NumPy array for further processing
-    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
-
-    # Continue processing with trimmed audio
-    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
-
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert trimmed audio back to a PyTorch tensor
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float32) / 32768
+    ).unsqueeze(0)  # Add batch/channel dimension
 
+    # Normalize and resample
     if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
 
-    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
     if rms < target_rms:
-        audio = audio * target_rms / rms
+        audio *= target_rms / rms  # Adjust RMS
+
     if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
-
-    audio = audio.to(device)
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
+
+    audio = audio.to(device)  # Move to target device
 
     generated_waves = []
     spectrograms = []
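
For reference, below is a minimal, self-contained sketch of the round trip this commit sets up: float tensor -> NumPy -> 16-bit pydub AudioSegment -> silence trim -> tensor, followed by the same RMS normalization and resampling. The remove_silence_edges defined here is a hypothetical stand-in built on pydub's detect_leading_silence; the Space imports its own helper of that name. The values of target_rms, target_sample_rate, and device, and the synthetic sine-wave input, are assumptions for the sketch, not the app's configuration.

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_leading_silence


def remove_silence_edges(seg: AudioSegment, silence_threshold: float = -42) -> AudioSegment:
    # Hypothetical stand-in for the app's helper: trim silence from both ends.
    start = detect_leading_silence(seg, silence_threshold=silence_threshold)
    end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[start:len(seg) - end]


target_rms = 0.1            # assumed placeholder
target_sample_rate = 24000  # assumed placeholder
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fake mono reference audio: 0.3 s silence, 1 s of 440 Hz tone, 0.3 s silence.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
pad = np.zeros(int(0.3 * sr))
audio = torch.from_numpy(np.concatenate([pad, tone, pad]).astype(np.float32))

# Tensor -> NumPy -> 16-bit PCM AudioSegment, as in the diff.
audio_np = audio.cpu().numpy()
audio_segment = AudioSegment(
    (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
    frame_rate=sr,
    sample_width=2,  # 16-bit audio
    channels=1,
)

# Trim silence, then convert back to a float tensor in [-1, 1].
trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
audio = torch.tensor(
    np.array(trimmed.get_array_of_samples(), dtype=np.float32) / 32768
).unsqueeze(0)  # Add batch/channel dimension

# RMS normalization and resampling, matching the updated app.py.
rms = torch.sqrt((audio**2).mean())
if rms < target_rms:
    audio *= target_rms / rms
if sr != target_sample_rate:
    audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
audio = audio.to(device)

print(audio.shape)  # roughly (1, 24000): ~1 s left after trimming, resampled to 24 kHz

The tensor-to-NumPy conversion is the substance of the fix: a torch tensor has no .astype() method, so building the AudioSegment directly from audio would fail with an AttributeError before the silence trim ever ran. Converting with audio.cpu().numpy() first makes the 16-bit PCM scaling valid NumPy code.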