	Update app.py
app.py CHANGED
@@ -175,33 +175,40 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,


     audio, sr = ref_audio
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)

-    rms = torch.sqrt(torch.mean(torch.square(audio)))
-    if rms < target_rms:
-        audio = audio * target_rms / rms
-    if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
+    # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
+    audio_np = audio.cpu().numpy().astype(np.float16)

-    # Convert to numpy
-    audio_np = audio.squeeze().cpu().numpy()
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        audio_np.tobytes(),
-        frame_rate=sr,
-        sample_width=2,
-        channels=1
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
+        frame_rate=sr,
+        sample_width=2,  # 16-bit audio
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )

-    # Remove silence
-    audio_segment = remove_silence_edges(audio_segment)
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert trimmed audio back to a PyTorch tensor with dtype=float16
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
+        dtype=torch.float16
+    ).unsqueeze(0)  # Add batch/channel dimension
+
+    # Normalize and resample
+    if audio.shape[0] > 1:
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
+
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
+    if rms < target_rms:
+        audio *= target_rms / rms  # Adjust RMS
+
+    if sr != target_sample_rate:
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)

-    # Convert back to tensor
-    audio_np_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float16) / (2**15)
-    audio = torch.from_numpy(audio_np_trimmed).unsqueeze(0)
+    audio = audio.to(device)  # Move to target device

-    audio = audio.to(device)


     generated_waves = []
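
For context, the rewritten preprocessing path (float tensor to 16-bit PCM, silence trim at the edges, then back to a normalized and resampled tensor) can be exercised outside the Space roughly as below. This is a minimal sketch, not the Space's code: trim_silence_edges is a hypothetical stand-in for app.py's remove_silence_edges, built on pydub's detect_leading_silence; the target_rms and target_sample_rate defaults are assumptions (in app.py they are module-level values); float32 replaces the diff's float16 round trip; and the mono downmix happens before the byte conversion rather than after, as in the diff.

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_leading_silence


def trim_silence_edges(seg: AudioSegment, silence_threshold: float = -42.0) -> AudioSegment:
    # Hypothetical stand-in for app.py's remove_silence_edges:
    # drop audio quieter than the threshold from both ends of the segment.
    start = detect_leading_silence(seg, silence_threshold=silence_threshold)
    end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[start:len(seg) - end]


def preprocess_ref_audio(
    audio: torch.Tensor,              # float waveform in [-1, 1], shape (channels, samples)
    sr: int,
    target_rms: float = 0.1,          # assumed default; module-level value in app.py
    target_sample_rate: int = 24000,  # assumed default; module-level value in app.py
    device: str = "cpu",
) -> torch.Tensor:
    # Downmix to mono up front so the PCM byte round trip stays simple
    if audio.dim() > 1 and audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    # Tensor -> 16-bit PCM bytes for pydub (clip first so +/-1.0 cannot overflow int16)
    audio_np = audio.squeeze().cpu().numpy().astype(np.float32)
    pcm16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)
    seg = AudioSegment(pcm16.tobytes(), frame_rate=sr, sample_width=2, channels=1)

    # Trim silent edges, mirroring the diff's remove_silence_edges(..., silence_threshold=-42)
    seg = trim_silence_edges(seg)

    # Back to a float tensor rescaled to [-1, 1]
    samples = np.array(seg.get_array_of_samples(), dtype=np.float32) / 32768.0
    audio = torch.from_numpy(samples).unsqueeze(0)

    # Boost quiet references up to the target RMS, as in the diff
    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms

    # Resample if the reference rate differs from the model's rate
    if sr != target_sample_rate:
        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)

    return audio.to(device)


# Example: a quiet 440 Hz tone with 0.2 s of silence on each side
t = torch.linspace(0, 2.0, int(2.0 * 22050))
wave = torch.cat([torch.zeros(4410), 0.05 * torch.sin(2 * torch.pi * 440 * t), torch.zeros(4410)])
out = preprocess_ref_audio(wave.unsqueeze(0), sr=22050)
print(out.shape, out.pow(2).mean().sqrt())  # trimmed, RMS-boosted, resampled to 24 kHz

One detail worth noting: the committed code scales by 32768 on the way into pydub and divides by 32768 on the way out, so a sample at exactly 1.0 would overflow int16; the sketch multiplies by 32767 and clips first to avoid that edge case.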