Gregniuki committed
Commit 8380400 · verified · Parent: 21a3092

Update app.py

Files changed (1)
  1. app.py +27 -20
app.py CHANGED
@@ -175,33 +175,40 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,


    audio, sr = ref_audio
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)

-    rms = torch.sqrt(torch.mean(torch.square(audio)))
-    if rms < target_rms:
-        audio = audio * target_rms / rms
-    if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
+    # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
+    audio_np = audio.cpu().numpy().astype(np.float16)

-    # Convert PyTorch tensor to PyDub's AudioSegment for silence removal
-    audio_np = audio.squeeze().cpu().numpy()
+    # Convert NumPy audio array to PyDub AudioSegment
    audio_segment = AudioSegment(
-        audio_np.tobytes(),
-        frame_rate=sr,
-        sample_width=2,
-        channels=1
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
+        frame_rate=sr,
+        sample_width=2,  # 16-bit audio
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
    )

-    # Apply the custom silence removal function
-    audio_segment = remove_silence_edges(audio_segment, silence_threshold=-42)
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert trimmed audio back to a PyTorch tensor with dtype=float16
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
+        dtype=torch.float16
+    ).unsqueeze(0)  # Add batch/channel dimension
+
+    # Normalize and resample
+    if audio.shape[0] > 1:
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
+
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
+    if rms < target_rms:
+        audio *= target_rms / rms  # Adjust RMS
+
+    if sr != target_sample_rate:
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)

-    # Convert back to PyTorch tensor
-    audio_np_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float16) / (2**15)
-    audio = torch.from_numpy(audio_np_trimmed).unsqueeze(0)
+    audio = audio.to(device)  # Move to target device

-    audio = audio.to(device)


    generated_waves = []
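
The heart of this change is the round trip between a PyTorch float waveform and a 16-bit PCM AudioSegment, which PyDub needs before remove_silence_edges can run. Below is a minimal sketch of that round trip, assuming a mono waveform normalized to [-1, 1]; the helper names tensor_to_segment and segment_to_tensor are illustrative only and are not part of app.py:

import numpy as np
import torch
from pydub import AudioSegment

def tensor_to_segment(audio: torch.Tensor, sr: int) -> AudioSegment:
    # Squeeze to 1-D, clip to [-1, 1], and scale to the int16 range.
    samples = audio.squeeze().cpu().numpy()
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    return AudioSegment(
        pcm.tobytes(),
        frame_rate=sr,
        sample_width=2,  # 2 bytes per sample = 16-bit PCM
        channels=1,      # sketch assumes mono
    )

def segment_to_tensor(segment: AudioSegment) -> torch.Tensor:
    # get_array_of_samples() yields int16 values when sample_width=2.
    samples = np.array(segment.get_array_of_samples(), dtype=np.int16)
    # float32 represents every int16 sample exactly; float16 would
    # round integer sample values above 2048.
    audio = torch.from_numpy(samples.astype(np.float32) / 32768)
    return audio.unsqueeze(0)  # shape (1, num_samples)

One design note: the committed code keeps intermediates in float16, which carries only 11 significant bits and so cannot represent every 16-bit sample value exactly; the sketch above uses float32 on the decode path instead. Whether the float16 quantization is audible in practice would need to be verified against the app's actual reference audio.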