	Update app.py
app.py CHANGED
@@ -176,38 +176,38 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     audio, sr = ref_audio
 
-    # Convert …
+    # Convert PyTorch tensor to NumPy array before scaling and processing
+    audio_np = audio.cpu().numpy()  # Convert to NumPy (if it's a tensor)
+
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        (audio * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
         frame_rate=sr,
         sample_width=2,  # 16-bit audio
-        channels=1 if len(audio.shape) == 1 else audio.shape[0]  # Mono or multi-channel
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )
 
-    # Remove silence using …
-    …
-
-    # Convert back to NumPy array for further processing
-    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
-
-    # Continue processing with trimmed audio
-    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
-
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert trimmed audio back to a PyTorch tensor
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float32) / 32768
+    ).unsqueeze(0)  # Add batch/channel dimension
 
+    # Normalize and resample
     if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
 
-    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
     if rms < target_rms:
-        audio = audio * target_rms / rms
+        audio *= target_rms / rms  # Adjust RMS
+
     if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
-
-    audio = audio.to(device)
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
+
+    audio = audio.to(device)  # Move to target device
 
     generated_waves = []
     spectrograms = []
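
For reference, below is a minimal, self-contained sketch of the round trip this commit sets up: float tensor -> NumPy -> 16-bit pydub AudioSegment -> silence trim -> tensor, followed by the same RMS normalization and resampling. The remove_silence_edges defined here is a hypothetical stand-in built on pydub's detect_leading_silence; the Space imports its own helper of that name. The values of target_rms, target_sample_rate, and device, and the synthetic sine-wave input, are assumptions for the sketch, not the app's configuration.

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_leading_silence


def remove_silence_edges(seg: AudioSegment, silence_threshold: float = -42) -> AudioSegment:
    # Hypothetical stand-in for the app's helper: trim silence from both ends.
    start = detect_leading_silence(seg, silence_threshold=silence_threshold)
    end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[start:len(seg) - end]


target_rms = 0.1            # assumed placeholder
target_sample_rate = 24000  # assumed placeholder
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fake mono reference audio: 0.3 s silence, 1 s of 440 Hz tone, 0.3 s silence.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
pad = np.zeros(int(0.3 * sr))
audio = torch.from_numpy(np.concatenate([pad, tone, pad]).astype(np.float32))

# Tensor -> NumPy -> 16-bit PCM AudioSegment, as in the diff.
audio_np = audio.cpu().numpy()
audio_segment = AudioSegment(
    (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
    frame_rate=sr,
    sample_width=2,  # 16-bit audio
    channels=1,
)

# Trim silence, then convert back to a float tensor in [-1, 1].
trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
audio = torch.tensor(
    np.array(trimmed.get_array_of_samples(), dtype=np.float32) / 32768
).unsqueeze(0)  # Add batch/channel dimension

# RMS normalization and resampling, matching the updated app.py.
rms = torch.sqrt((audio**2).mean())
if rms < target_rms:
    audio *= target_rms / rms
if sr != target_sample_rate:
    audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
audio = audio.to(device)

print(audio.shape)  # roughly (1, 24000): ~1 s left after trimming, resampled to 24 kHz

The tensor-to-NumPy conversion is the substance of the fix: a torch tensor has no .astype() method, so building the AudioSegment directly from audio would fail with an AttributeError before the silence trim ever ran. Converting with audio.cpu().numpy() first makes the 16-bit PCM scaling valid NumPy code.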