import os

from faster_whisper import WhisperModel
from transformers import pipeline

from settings import (
    MODEL_PATH_AGE_GENDER,
    MODEL_PATH_METEO,
    MODEL_PATH_V2_FAST,
    LEFT_CHANNEL_TEMP_PATH,
    RIGHT_CHANNEL_TEMP_PATH,
)
from audio_utils import (
    debug_print,
    get_settings,
    split_input_stereo_channels,
    process_waveforms,
    post_process_transcripts,
    post_merge_consecutive_segments_from_text,
    cleanup_temp_files,
)
from shout_detector import shout
from silence_detector import silence
from meteo_detector import classify_meteo_event
from age_gender_detector import age_gender, WavLMWrapper

# Token for gated Hugging Face models; may be None if the env var is unset.
hf_token = os.getenv("HF_AUTH_TOKEN")

# Lazily-initialized module-level singletons. Models are loaded on first use
# so importing this module stays cheap.
ASR_MODEL = None
AGE_GENDER_MODEL = None
METEO_MODEL = None


def get_asr_model(DEVICE, COMPUTE_TYPE):
    """Return the shared faster-whisper ASR model, loading it on first call.

    Args:
        DEVICE: device string passed to WhisperModel (e.g. "cuda" or "cpu").
        COMPUTE_TYPE: compute type passed to WhisperModel (e.g. "float16").

    Returns:
        The cached WhisperModel instance.
    """
    global ASR_MODEL
    if ASR_MODEL is None:
        debug_print("[MODEL LOADING]Loading ASR model...")
        ASR_MODEL = WhisperModel(
            MODEL_PATH_V2_FAST,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
        )
        debug_print("[MODEL LOADING]ASR model loaded")
    return ASR_MODEL


def get_age_gender_model(DEVICE):
    """Return the shared age/gender model in eval mode, loading it on first call.

    Args:
        DEVICE: torch device string the model is moved to.

    Returns:
        The cached WavLMWrapper instance.
    """
    global AGE_GENDER_MODEL
    if AGE_GENDER_MODEL is None:
        debug_print("[MODEL LOADING]Loading Age/Gender model...")
        AGE_GENDER_MODEL = WavLMWrapper.from_pretrained(MODEL_PATH_AGE_GENDER).to(DEVICE)
        AGE_GENDER_MODEL.eval()
        debug_print("[MODEL LOADING]Age/Gender model loaded")
    return AGE_GENDER_MODEL


def get_meteo_model(DEVICE):
    """Return the shared meteo text-classification pipeline, loading it on first call.

    Args:
        DEVICE: "cuda" selects GPU 0 for the pipeline; anything else runs on CPU.

    Returns:
        The cached transformers text-classification pipeline (top_k=None, so
        scores for all labels are returned).
    """
    global METEO_MODEL
    if METEO_MODEL is None:
        debug_print("[MODEL LOADING]Loading Meteo model...")
        METEO_MODEL = pipeline(
            task="text-classification",
            model=MODEL_PATH_METEO,
            tokenizer=MODEL_PATH_METEO,
            top_k=None,
            device=0 if DEVICE == "cuda" else -1,
            token=hf_token,
        )
        debug_print("[MODEL LOADING]Meteo model loaded")
    return METEO_MODEL


def transcribe_faster_asr(left_waveform, right_waveform, model):
    """Transcribe both stereo channels with the given faster-whisper model.

    Args:
        left_waveform: audio for the left channel, in a form accepted by
            WhisperModel.transcribe.
        right_waveform: audio for the right channel.
        model: a WhisperModel instance.

    Returns:
        Tuple (left_segments, right_segments), each a fully-materialized list
        of transcription segments (the generators are exhausted here so the
        caller gets concrete results).
    """
    left_result, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
    right_result, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")
    return list(left_result), list(right_result)


def generate_fase_2(audio_path, model_version, civil_channel):
    """Run the full phase-2 analysis pipeline on a stereo call recording.

    Splits the stereo input into per-channel temp files, transcribes both
    channels, and runs silence, shout, age/gender and meteo-event detection.
    The channel named by `civil_channel` ("Left" or anything else meaning
    right) is treated as the civilian speaker.

    Args:
        audio_path: path to the stereo input audio file.
        model_version: currently unused — kept for caller compatibility.
            TODO(review): confirm whether callers still pass a meaningful value.
        civil_channel: "Left" or "Right"; selects the civilian channel.

    Returns:
        Tuple (text, sex, age, silence_event, shout_event, meteo_event) where
        `text` is the cleaned transcript prefixed with a newline and `age` is a
        human-readable string like "adulto (aprox. 34 aƱos)".
    """
    DEVICE, COMPUTE_TYPE = get_settings()
    asr_model = get_asr_model(DEVICE, COMPUTE_TYPE)
    age_gender_model = get_age_gender_model(DEVICE)
    meteo_model = get_meteo_model(DEVICE)

    # The loaded model may have downgraded the requested compute type (e.g.
    # float16 -> float32 on CPU), so read back what it actually uses.
    actual_compute_type = asr_model.model.compute_type
    debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")

    split_input_stereo_channels(audio_path)
    try:
        left_waveform, right_waveform = process_waveforms(DEVICE, actual_compute_type)
        debug_print(f"[SETTINGS] Civil channel: {civil_channel}")

        left_result, right_result = transcribe_faster_asr(
            left_waveform, right_waveform, asr_model
        )
        silence_event = silence(audio_path)

        civil_waveform = left_waveform if civil_channel == "Left" else right_waveform
        civil_path = (
            LEFT_CHANNEL_TEMP_PATH if civil_channel == "Left" else RIGHT_CHANNEL_TEMP_PATH
        )
        shout_event = shout(civil_path)

        age, sex, age_group = age_gender(civil_waveform, age_gender_model, DEVICE)
        # Fixed mojibake: "aƱos" was previously garbled by a bad encoding round-trip.
        age = f"{age_group} (aprox. {age} aƱos)"

        clean_output_asr, clean_output_meteo = post_process_transcripts(
            left_result, right_result, civil_channel
        )
        text = '\n' + clean_output_asr
        meteo_event = classify_meteo_event(clean_output_meteo, meteo_model, threshold=0.0)
    finally:
        # Always remove the per-channel temp files, even if any pipeline step
        # above raised — previously a failure leaked them on disk.
        cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)

    return text, sex, age, silence_event, shout_event, meteo_event