Spaces: Running on Zero

Sarah Solito committed
Commit · a7c6058
1 Parent(s): 2b7ff2f

Update: code cleaning

Files changed: whisper_cs_dev.py (+1 -51)

whisper_cs_dev.py CHANGED
@@ -55,9 +55,6 @@ def load_cudnn():
 
 def get_settings():
 
-    if DEBUG_MODE:
-        print(f"Entering get_settings function...")
-
     is_cuda_available = torch.cuda.is_available()
     if is_cuda_available:
         device = "cuda"
@@ -68,7 +65,6 @@ def get_settings():
     compute_type = "default"
 
     if DEBUG_MODE: print(f"[SETTINGS] Device: {device}")
-    if DEBUG_MODE: print(f"Exited get_settings function.")
 
     return device, compute_type
 
@@ -77,7 +73,6 @@ def get_settings():
 def load_model(use_v2_fast, device, compute_type):
 
     if DEBUG_MODE:
-        print(f"Entering load_model function...")
         print(f"[MODEL LOADING] use_v2_fast: {use_v2_fast}")
 
     if use_v2_fast:
@@ -94,16 +89,12 @@ def load_model(use_v2_fast, device, compute_type):
         device=device,
         token=os.getenv("HF_TOKEN")
     )
-
-    if DEBUG_MODE: print(f"Exiting load_model function...")
-
+
     return model
 
 
 def split_input_stereo_channels(audio_path):
 
-    if DEBUG_MODE: print(f"Entering split_input_stereo_channels function...")
-
     ext = os.path.splitext(audio_path)[1].lower()
 
     if ext == ".wav":
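The context lines above close a truncated call that passes device=device and token=os.getenv("HF_TOKEN"). Purely as an illustrative sketch, this is the shape of a Hugging Face ASR pipeline built with those two arguments; the task string and model id are assumptions, not values taken from this diff.

# Illustrative sketch only -- not part of this commit.
# The model id below is a placeholder assumption.
import os
from transformers import pipeline

def load_pipeline_sketch(device, model_id="openai/whisper-large-v3"):
    return pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device=device,
        token=os.getenv("HF_TOKEN"),  # needed for gated or private models
    )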
@@ -121,10 +112,8 @@ def split_input_stereo_channels(audio_path):
     channels[0].export(RIGHT_CHANNEL_TEMP_PATH, format="wav") # Right
     channels[1].export(LEFT_CHANNEL_TEMP_PATH, format="wav") # Left
 
-    if DEBUG_MODE: print(f"Exited split_input_stereo_channels function.")
 
 def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
-    if DEBUG_MODE: print(f"Entering compute_type_to_audio_dtype function.")
 
     compute_type = compute_type.lower()
 
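The channels[...].export(..., format="wav") calls in the hunk above match pydub's AudioSegment API. A minimal sketch of how such a stereo split could be produced upstream, for illustration only; the helper name and paths are hypothetical, and the right/left ordering simply mirrors the comments in the diff.

# Illustrative sketch only -- not part of this commit.
from pydub import AudioSegment

def split_stereo_sketch(audio_path, right_path="right_tmp.wav", left_path="left_tmp.wav"):
    stereo = AudioSegment.from_file(audio_path)
    channels = stereo.split_to_mono()             # one mono segment per channel
    channels[0].export(right_path, format="wav")  # mapped to "Right" in the diff
    channels[1].export(left_path, format="wav")   # mapped to "Left" in the diff
    return left_path, right_path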
@@ -136,12 +125,10 @@ def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
     else:
         audio_np_dtype = np.float32
 
-    if DEBUG_MODE: print(f"Exited compute_type_to_audio_dtype function.")
     return audio_np_dtype
 
 
 def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
-    if DEBUG_MODE: print(f"Entering format_audio function...")
 
     input_audio, sample_rate = torchaudio.load(audio_path)
 
@@ -158,44 +145,31 @@ def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
 
     if DEBUG_MODE:
         print(f"[FORMAT AUDIO] Audio dtype for actual_compute_type: {input_audio.dtype}")
-        print(f"Exited format_audio function.")
     return input_audio
 
 
 
 def process_waveforms(device: str, compute_type: str):
 
-    if DEBUG_MODE: print(f"Entering process_waveforms function...")
-
     left_waveform = format_audio(LEFT_CHANNEL_TEMP_PATH, compute_type, device)
     right_waveform = format_audio(RIGHT_CHANNEL_TEMP_PATH, compute_type, device)
 
-    if DEBUG_MODE: print(f"Exited process_waveforms function.")
     return left_waveform, right_waveform
 
 
 def transcribe_pipeline(audio, model):
-    if DEBUG_MODE: print(f"Entering transcribe_pipeline function.")
-
     text = model(audio, batch_size=BATCH_SIZE, generate_kwargs={"task": TASK}, return_timestamps=True)["text"]
-
-    if DEBUG_MODE: print(f"Exited transcribe_pipeline function.")
-
     return text
 
 
 def transcribe_channels(left_waveform, right_waveform, model):
 
-    if DEBUG_MODE: print(f"Entering transcribe_channels function...")
-
     left_result, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
     right_result, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")
 
     left_result = list(left_result)
     right_result = list(right_result)
 
-    if DEBUG_MODE: print(f"Exited transcribe_channels function.")
-
     return left_result, right_result
 
 
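The model.transcribe(..., beam_size=5, task="transcribe") calls and the list(...) materialization above follow the faster-whisper API. A self-contained sketch of that per-channel pattern is shown below; the model size, device, and compute type are assumptions rather than values from this Space.

# Illustrative sketch only -- not part of this commit.
from faster_whisper import WhisperModel

def transcribe_channels_sketch(left_waveform, right_waveform):
    model = WhisperModel("large-v3", device="cuda", compute_type="float16")

    # transcribe() returns a lazy segment generator plus an info object
    left_segments, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
    right_segments, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")

    # Materialize the generators so both channels are fully decoded
    return list(left_segments), list(right_segments)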
@@ -255,23 +229,17 @@ def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
 
 def get_segments(result, speaker_label):
 
-    if DEBUG_MODE: print(f"Entering get_segments function...")
-
     segments = result
     final_segments = [
         (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
         for seg in segments if seg.text
     ]
 
-    if DEBUG_MODE: print(f"EXited get_segments function.")
-
     return final_segments
 
 
 def post_process_transcripts(left_result, right_result):
 
-    if DEBUG_MODE: print(f"Entering post_process_transcripts function...")
-
     left_segs = get_segments(left_result, "Speaker 1")
     right_segs = get_segments(right_result, "Speaker 2")
 
@@ -285,29 +253,20 @@ def post_process_transcripts(left_result, right_result):
         clean_output += f"[{speaker}]: {text}\n"
     clean_output = clean_output.strip()
 
-    if DEBUG_MODE: print(f"Exited post_process_transcripts function.")
-
    return clean_output
 
 
 def cleanup_temp_files(*file_paths):
-
-    if DEBUG_MODE: print(f"Entered cleanup_temp_files function...")
 
     for path in file_paths:
         if path and os.path.exists(path):
             if DEBUG_MODE: print(f"Removing path: {path}")
             os.remove(path)
 
-    if DEBUG_MODE: print(f"Exited cleanup_temp_files function.")
-
 
 
 
 def generate(audio_path, use_v2_fast):
-    if DEBUG_MODE: print(f"Entering generate function...")
-
-    start = time.time()
 
     load_cudnn()
     device, requested_compute_type = get_settings()
@@ -333,13 +292,4 @@ def generate(audio_path, use_v2_fast):
     merged_results = transcribe_pipeline(audio, model)
     output = post_process_transcription(merged_results)
 
-    end = time.time()
-
-    audio_duration = torchaudio.info(audio_path).num_frames / torchaudio.info(audio_path).sample_rate
-    rtf = (end - start) / audio_duration
-
-    if DEBUG_MODE: print(f"[LATENCY]: {end - start}")
-    if DEBUG_MODE: print(f"[RTF]: {rtf:.2f}")
-    if DEBUG_MODE: print(f"Exited generate function.")
-
     return output
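The final hunk drops the inline latency and real-time-factor instrumentation from generate. For reference, a minimal sketch of the same measurement factored into a standalone helper, reusing the torchaudio-based duration formula from the removed lines; the helper itself is illustrative and not part of this commit.

# Illustrative sketch only -- not part of this commit.
import time
import torchaudio

def measure_rtf(audio_path, fn):
    """Run fn(audio_path) and report latency and real-time factor (RTF).

    RTF = processing time / audio duration, so values below 1.0 mean
    faster-than-real-time transcription.
    """
    start = time.time()
    output = fn(audio_path)
    elapsed = time.time() - start

    info = torchaudio.info(audio_path)
    audio_duration = info.num_frames / info.sample_rate
    rtf = elapsed / audio_duration

    print(f"[LATENCY]: {elapsed:.2f}s")
    print(f"[RTF]: {rtf:.2f}")
    return output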