	Update app.py
app.py CHANGED

@@ -60,7 +60,7 @@ print(f"Using device: {device}, dtype: {dtype}")
 
 pipe = pipeline(
     "automatic-speech-recognition",
-    model="
+    model="openai/whisper-large-v3-turbo",
     torch_dtype=torch.float16,
     device=device,
 )
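The replacement checkpoint is loaded through the standard transformers ASR pipeline. For reference, a minimal self-contained sketch of the same call shape; the audio file name and the CPU fallback are illustrative assumptions, not part of this commit:

import torch
from transformers import pipeline

# Assumed device setup for the sketch; app.py derives device/dtype earlier.
device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device=device,
)

# Hypothetical input file; the pipeline returns a dict with a "text" key.
result = pipe("sample.wav")
print(result["text"])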
@@ -110,8 +110,8 @@ def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
     return load_model(DiT, model_cfg, ckpt_path, vocab_file=vocab_path)
 
 
-F2TTS_ema_model3 = load_f5tts()
-E2TTS_ema_model4 = load_e2tts() if USING_SPACES else None
+#F2TTS_ema_model3 = load_f5tts()
+#E2TTS_ema_model4 = load_e2tts() if USING_SPACES else None
 custom_ema_model, pre_custom_path = None, ""
 
 chat_model_state = None
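With these two assignments commented out, app.py no longer builds the F5-TTS and E2-TTS models at import time, which trims startup cost. If either model is needed again later, a lazy-load pattern keeps that saving; an illustrative sketch only, assuming the existing load_f5tts() loader stays in app.py (the cache dict and accessor name are hypothetical, not from the repo):

# Hypothetical lazy-loading pattern: defer load_f5tts() until first use.
_model_cache = {}

def get_f5tts():
    # load_f5tts() is the loader already defined in app.py.
    if "f5tts" not in _model_cache:
        _model_cache["f5tts"] = load_f5tts()
    return _model_cache["f5tts"]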
@@ -212,7 +212,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
         gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
 
         # Calculate duration based on the lengths of ref_text and gen_text
-        duration = min(2000, max(270, int(
+        duration = min(2000, max(270, int(ref_audio_len + ref_audio_len / ref_text_len * gen_text_len / speed)))
 
         # Print the calculated duration
         print(f"Duration: {duration} seconds")
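The new line keeps the duration estimate (reference length plus a share proportional to the generated text, divided by speed) and bounds it to the range [270, 2000] with the min(2000, max(270, x)) clamp idiom. A small worked check, with made-up numbers rather than values from the app:

def clamp_duration(x):
    # Bound x to the closed range [270, 2000], as in the updated line.
    return min(2000, max(270, int(x)))

# Illustrative inputs only:
ref_audio_len, ref_text_len, gen_text_len, speed = 500, 40, 120, 1.0
estimate = ref_audio_len + ref_audio_len / ref_text_len * gen_text_len / speed
print(clamp_duration(estimate))  # 500 + 500/40*120/1.0 = 2000 -> 2000
print(clamp_duration(100))       # below the floor -> 270
print(clamp_duration(5000))      # above the ceiling -> 2000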