Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -175,40 +175,18 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
 
     audio, sr = ref_audio
-
-    # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
-    audio_np = audio.cpu().numpy().astype(np.float16)
-
-    # Convert NumPy audio array to PyDub AudioSegment
-    audio_segment = AudioSegment(
-        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
-        frame_rate=sr,
-        sample_width=2,  # 16-bit audio
-        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
-    )
-
-    # Remove silence using the custom function
-    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
-
-    # Convert trimmed audio back to a PyTorch tensor with dtype=float16
-    audio = torch.tensor(
-        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
-        dtype=torch.float16
-    ).unsqueeze(0)  # Add batch/channel dimension
-
-    # Normalize and resample
     if audio.shape[0] > 1:
-        audio =
+        audio = torch.mean(audio, dim=0, keepdim=True)
 
-    rms = torch.sqrt(
+    rms = torch.sqrt(torch.mean(torch.square(audio)))
     if rms < target_rms:
-        audio
-
+        audio = audio * target_rms / rms
     if sr != target_sample_rate:
-
-
-
+        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+        audio = resampler(audio)
+
 
+    audio = audio.to(device)
 
 
     generated_waves = []

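After this change, infer_batch no longer round-trips the reference audio through NumPy and pydub; what remains is a mono downmix, an RMS gain for quiet clips, and a resample. A minimal standalone sketch of that surviving logic, assuming illustrative defaults for target_rms and target_sample_rate (neither value appears in this hunk) and a hypothetical helper name preprocess_ref_audio:

import torch
import torchaudio


def preprocess_ref_audio(audio: torch.Tensor, sr: int,
                         target_rms: float = 0.1,
                         target_sample_rate: int = 24000) -> torch.Tensor:
    # Downmix multi-channel reference audio to mono
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Boost quiet references up to the target RMS level (assumed default; never attenuates)
    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms

    # Resample to the rate the model expects (assumed default)
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)

    return audio
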
@@ -341,6 +319,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
     gr.Info("Converting audio...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
+        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
 
         non_silent_segs = silence.split_on_silence(
             aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000