asr-inference

Running on Zero

App Files Files Community

AbirMessaoudi committed 4 days ago

Commit

1619dcb

verified ·

1 Parent(s): 3d88604

fase_1, fase_2 releases (#46)

Browse files

- Fase_1 and Fase_2 releases, code cleaned (d6fb6a283d102ccaf8f654e51575987d4045b6d6)

Files changed (13) hide show

README.md +2 -2
age_gender_detector.py +299 -0
app.py +94 -25
whisper_cs_dev.py → audio_utils.py +50 -158
meteo_detector.py +12 -0
requirements.txt +2 -2
requirements_dev.txt +0 -171
settings.py +5 -0
shout_detector.py +148 -0
silence_detector.py +44 -0
whisper_cs.py +0 -382
whisper_cs_fase_1.py +75 -0
whisper_cs_fase_2.py +89 -0

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🤫
 colorFrom: indigo
 colorTo: red
 sdk: gradio
-sdk_version: 5.41.1
 app_file: app.py
 pinned: false
 tags:
@@ -89,7 +89,7 @@ Per descarregar i córrer la imatge de docker:
 ```
 docker run -d -p 7860:7860 --name asr-inference --platform=linux/amd64 \
-	registry.hf.space/bsc-lt-asr-inference:latest python app.py
 ```

 colorFrom: indigo
 colorTo: red
 sdk: gradio
+sdk_version: 4.20.0
 app_file: app.py
 pinned: false
 tags:
 ```
 docker run -d -p 7860:7860 --name asr-inference --platform=linux/amd64 \
+	registry.hf.space/projecte-aina-asr-inference:latest python app.py
 ```

age_gender_detector.py ADDED Viewed

	@@ -0,0 +1,299 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch import tensor
+from transformers import Wav2Vec2FeatureExtractor, WavLMModel
+import transformers.models.wavlm.modeling_wavlm as wavlm
+from huggingface_hub import PyTorchModelHubMixin
+from speechbrain.lobes.models.huggingface_transformers.huggingface import make_padding_masks
+class RevGrad(Function):
+    # Gradient-reversal autograd function: the forward pass returns the input
+    # unchanged, the backward pass returns the negated gradient scaled by alpha_.
+    @staticmethod
+    def forward(ctx, input_, alpha_):
+        # Both tensors are saved, but only alpha_ is read back in backward().
+        ctx.save_for_backward(input_, alpha_)
+        return input_
+    @staticmethod
+    def backward(ctx, grad_output):
+        _, alpha_ = ctx.saved_tensors
+        # Flip the sign and scale; yield None when input_ does not need a gradient.
+        grad_input = -grad_output * alpha_ if ctx.needs_input_grad[0] else None
+        # Second slot is the gradient for alpha_, which is always None.
+        return grad_input, None
+# Functional alias used by RevGradLayer.forward.
+revgrad = RevGrad.apply
+class RevGradLayer(nn.Module):
+    # Module wrapper that applies revgrad with a fixed scaling factor alpha.
+    def __init__(self, alpha=1.):
+        super().__init__()
+        # NOTE(review): _alpha is a plain tensor attribute, not registered with
+        # register_buffer, so it presumably stays on its original device when
+        # the module is moved — confirm this is fine for GPU training.
+        self._alpha = tensor(alpha, requires_grad=False)
+    def forward(self, x):
+        return revgrad(x, self._alpha)
+class WavLMEncoderLayer(nn.Module):
+    # Encoder layer that applies layer normalisation AFTER each residual branch:
+    # attention -> dropout -> residual add -> layer_norm -> feed-forward residual -> final_layer_norm.
+    # layer_idx is accepted but not used anywhere in this class.
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
+        # Returns (hidden_states, position_bias) and, when output_attentions is
+        # true, the attention weights as a third element.
+        attn_residual = hidden_states
+        # `index` is forwarded unchanged to the attention module.
+        hidden_states, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+            index=index,
+        )
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = attn_residual + hidden_states
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states + self.feed_forward(hidden_states)
+        hidden_states = self.final_layer_norm(hidden_states)
+        outputs = (hidden_states, position_bias)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+class WavLMEncoderLayerStableLayerNorm(nn.Module):
+    # Encoder layer that applies layer normalisation BEFORE attention and before
+    # the feed-forward block; the residual paths themselves are left un-normalised.
+    # layer_idx is accepted but not used anywhere in this class.
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
+        # Returns (hidden_states, position_bias) and, when output_attentions is
+        # true, the attention weights as a third element.
+        # Unlike WavLMEncoderLayer.forward, this signature has no `index` argument.
+        attn_residual = hidden_states
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = attn_residual + hidden_states
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+        outputs = (hidden_states, position_bias)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+class WavLMWrapper(nn.Module, PyTorchModelHubMixin):
+    # WavLM backbone with a learned weighted sum over hidden layers, a 1x1 Conv1d
+    # projection stack, temporal mean pooling, and two heads (age and sex).
+    def __init__(
+        self,
+        pretrain_model="wavlm_large",
+        hidden_dim=256,
+        freeze_params=True,
+        output_class_num=4,
+        use_conv_output=True,
+        apply_reg=False
+    ):
+        # output_class_num is accepted but not used in this constructor.
+        super().__init__()
+        self.pretrain_model = pretrain_model
+        self.use_conv_output = use_conv_output
+        # Load backbone
+        # NOTE(review): only "wavlm" and "wavlm_large" are handled; any other value
+        # leaves self.backbone_model unset and the state_dict() call below fails.
+        if self.pretrain_model == "wavlm":
+            self.backbone_model = WavLMModel.from_pretrained(
+                "microsoft/wavlm-base-plus",
+                output_hidden_states=True,
+            )
+        elif self.pretrain_model == "wavlm_large":
+            # The feature extractor is only created for the large model.
+            self.processor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-large')
+            self.backbone_model = WavLMModel.from_pretrained(
+                "microsoft/wavlm-large",
+                output_hidden_states=True,
+            )
+        # Keep original encoder layers (no LoRA)
+        # The pretrained weights are captured first, the encoder layers are replaced
+        # by the local layer classes, and the weights are then loaded back in.
+        state_dict = self.backbone_model.state_dict()
+        self.model_config = self.backbone_model.config
+        if self.pretrain_model == "wavlm":
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WavLMEncoderLayer(i, self.model_config, has_relative_position_bias=(i == 0))
+                 for i in range(self.model_config.num_hidden_layers)]
+            )
+        else:
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WavLMEncoderLayerStableLayerNorm(i, self.model_config, has_relative_position_bias=(i == 0))
+                 for i in range(self.model_config.num_hidden_layers)]
+            )
+        # strict=False: missing or unexpected keys do not raise here.
+        self.backbone_model.load_state_dict(state_dict, strict=False)
+        # Freeze weights if requested
+        if freeze_params:
+            for p in self.backbone_model.parameters():
+                p.requires_grad = False
+        # Conv projection layers
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1)
+        )
+        # Layer weights
+        # One weight per stacked hidden state; one extra entry when use_conv_output
+        # keeps the first hidden state (see the stacking step in forward()).
+        num_layers = self.model_config.num_hidden_layers + 1 if use_conv_output else self.model_config.num_hidden_layers
+        self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        # Output heads
+        # apply_reg=True: a single sigmoid output; otherwise 7 unnormalised outputs.
+        if apply_reg:
+            self.age_dist_layer = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 1),
+                nn.Sigmoid()
+            )
+        else:
+            self.age_dist_layer = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 7)
+            )
+        self.sex_layer = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, 2)
+        )
+    def forward(self, x, length=None, return_feature=False, pred="age_dist_sex"):
+        # Returns (age_pred, sex_pred), plus the pooled features when return_feature
+        # is true. `pred` is accepted but not read in this method.
+        # Feature extraction
+        if self.pretrain_model == "wavlm_large":
+            with torch.no_grad():
+                # The [] bound to attention_mask here is overwritten immediately below.
+                signal, attention_mask = [], []
+                if length is not None:
+                    attention_mask = make_padding_masks(x, wav_len=length/length.max()).to(x.device)
+                else:
+                    attention_mask = make_padding_masks(x, wav_len=torch.tensor([1]).to(x.device)).to(x.device)
+                # Each waveform goes through the feature extractor one at a time.
+                for idx in range(len(x)):
+                    input_vals = self.processor(x[idx], sampling_rate=16_000, return_tensors="pt", padding=True)
+                    signal.append(input_vals["input_values"][0].to(x.device))
+                signal = torch.stack(signal)
+        if length is not None:
+            # NOTE(review): .cuda() is hard-coded, so passing `length` on a CPU-only
+            # host fails here; the rest of the method follows x.device instead.
+            length = self.get_feat_extract_output_lengths(length.detach().cpu()).cuda()
+        if self.pretrain_model == "wavlm":
+            x = self.backbone_model(x, output_hidden_states=True).hidden_states
+        else:
+            x = self.backbone_model(signal, attention_mask=attention_mask, output_hidden_states=True).hidden_states
+        # Weighted sum of layers
+        # The first hidden state is dropped when use_conv_output is false.
+        stacked_feature = torch.stack(x, dim=0) if self.use_conv_output else torch.stack(x, dim=0)[1:]
+        _, *origin_shape = stacked_feature.shape
+        # Flatten everything but the layer axis, mix with softmax-normalised
+        # weights, then restore the per-layer shape.
+        stacked_feature = stacked_feature.view(stacked_feature.shape[0], -1)
+        norm_weights = F.softmax(self.weights, dim=-1)
+        weighted_feature = (norm_weights.unsqueeze(-1) * stacked_feature).sum(dim=0)
+        features = weighted_feature.view(*origin_shape)
+        # Conv projection
+        # Conv1d acts on dim 1, hence the transpose before and after.
+        features = self.model_seq(features.transpose(1, 2)).transpose(1, 2)
+        # Pooling
+        if length is not None:
+            # Average only the first length[snt_id] frames of each item.
+            mean = []
+            for snt_id in range(features.shape[0]):
+                actual_size = length[snt_id]
+                mean.append(torch.mean(features[snt_id, 0:actual_size, ...], dim=0))
+            features = torch.stack(mean)
+        else:
+            features = torch.mean(features, dim=1)
+        # Predictions
+        age_pred = self.age_dist_layer(features)
+        sex_pred = self.sex_layer(features)
+        if return_feature:
+            return age_pred, sex_pred, features
+        return age_pred, sex_pred
+    # Huggingface conv output length helper
+    def get_feat_extract_output_lengths(self, input_length):
+        # Applies (length - kernel_size) // stride + 1 once per (kernel, stride)
+        # pair in the backbone config's conv_kernel / conv_stride lists.
+        def _conv_out_length(input_length, kernel_size, stride):
+            return (input_length - kernel_size) // stride + 1
+        for kernel_size, stride in zip(self.backbone_model.config.conv_kernel, self.backbone_model.config.conv_stride):
+            input_length = _conv_out_length(input_length, kernel_size, stride)
+        return input_length
+def age_gender(audio_waveform_np, model, device):
+    # Run `model` on one waveform and return a 3-tuple:
+    # (age as a string or "N/A", sex label, age-group label).
+    #numpy2tensor
+    # NOTE(review): the local name `tensor` shadows the module-level
+    # `from torch import tensor` inside this function. If the input is neither an
+    # ndarray nor a Tensor, `tensor` is never bound and the .dim() call below raises.
+    if isinstance(audio_waveform_np, np.ndarray):
+        tensor = torch.from_numpy(audio_waveform_np)
+    elif isinstance(audio_waveform_np, torch.Tensor):
+        tensor = audio_waveform_np
+    # A 1-D input gets a leading dimension of size 1.
+    if tensor.dim() == 1:
+        tensor = tensor.unsqueeze(0)
+    tensor = tensor.to(torch.device(device))
+    # Anything other than float32/float16 is cast to float32.
+    if tensor.dtype not in (torch.float32, torch.float16):
+        tensor = tensor.float()
+    with torch.no_grad():
+        wavlm_outputs, wavlm_sex_outputs = model(tensor)
+    # presumably the age head emits a 0-1 value that is rescaled to years here — TODO confirm
+    age_pred = wavlm_outputs.detach().cpu().numpy().flatten() * 100.0
+    sex_prob = F.softmax(wavlm_sex_outputs, dim=1)
+    sex_labels_es = ["Femenino", "Masculino"]
+    # NOTE(review): argmax is called without `dim`, so it indexes the flattened
+    # tensor; the label lookup below is only valid for a single-item batch.
+    sex_idx = int(torch.argmax(sex_prob).detach().cpu().item())
+    sex_pred = sex_labels_es[sex_idx]
+    # Only the first flattened age value is used.
+    try:
+        age_value = int(round(float(age_pred[0])))
+        if age_value < 20:
+            age_group = "joven (menor de 20)"
+        elif age_value < 35:
+            age_group = "adulto (20–35)"
+        elif age_value < 60:
+            age_group = "mediana edad (35–60)"
+        else:
+            age_group = "mayor (60+)"
+    # NOTE(review): broad catch; any failure above is reported as "N/A" / "desconocido".
+    except Exception:
+        age_value = None
+        age_group = "desconocido"
+    return str(age_value) if age_value is not None else "N/A", sex_pred, age_group

app.py CHANGED Viewed

@@ -1,40 +1,109 @@
 import gradio as gr
-from  whisper_cs_dev import generate
-from AinaTheme import theme
 import spaces
-@spaces.GPU
-def transcribe(inputs, model_version):
     if inputs is None:
-        raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer "\
-                       "o enregistreu un àudio abans d'enviar la vostra sol·licitud")
-    use_v2_fast = model_version == "v2_fast"
-    return generate(audio_path=inputs, use_v2_fast=use_v2_fast)
-description_string = "Transcripció automàtica de micròfon o de fitxers d'àudio.\n Aquest demostrador s'ha desenvolupat per"\
-              " comprovar els models de reconeixement de parla per a enregistraments estèreo de mòbils."
-def clear():
-    return None, "v2_fast"
-with gr.Blocks() as demo:
-    gr.Markdown(description_string)
-    with gr.Row():
-        with gr.Column(scale=1):
-            model_version = gr.Dropdown(label="Model Version", choices=["v2_fast", "v1.0"], value="v2_fast")
-            input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
-        with gr.Column(scale=1):
-            output = gr.Textbox(label="Output", lines=8)
-    with gr.Row(variant="panel"):
-        clear_btn = gr.Button("Clear")
-        submit_btn = gr.Button("Submit", variant="primary")
-    submit_btn.click(fn=transcribe, inputs=[input, model_version], outputs=[output])
-    clear_btn.click(fn=clear, inputs=[], outputs=[input, model_version], queue=False)
 if __name__ == "__main__":
     demo.launch()

+import os
 import gradio as gr
 import spaces
+from whisper_cs_fase_1 import generate_fase_1
+from whisper_cs_fase_2 import generate_fase_2
+from AinaTheme import theme
+@spaces.GPU()
+def transcribe_fase_1(inputs: str, model_version: str, civil_channel: str):
+    # Click handler for the "Fase 1" tab: rejects a missing audio path with a
+    # gr.Error, otherwise returns the result of generate_fase_1.
+    if inputs is None:
+        raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud")
+    return generate_fase_1(audio_path=inputs, model_version=model_version, civil_channel=civil_channel)
+@spaces.GPU()
+def transcribe_fase_2_display(inputs: str, model_version: str, civil_channel: str):
+    # Click handler for the "Fase 2" tab: rejects a missing audio path with a
+    # gr.Error, otherwise returns the result of generate_fase_2.
+    # The click wiring maps that result onto six output textboxes.
     if inputs is None:
+        raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud")
+    return generate_fase_2(audio_path=inputs, model_version=model_version, civil_channel=civil_channel)
+def clear_fase_1(model_version, civil_channel):
+    # Clear handler for "Fase 1": returns None for the audio input and passes the
+    # two dropdown values back unchanged.
+    return None, model_version, civil_channel
+def clear_fase_2(model_version, civil_channel):
+    # Clear handler for "Fase 2": returns None for the audio input, the two dropdown
+    # values unchanged, and six empty strings for the six output textboxes.
+    return None, model_version, civil_channel, "", "", "", "", "", ""
+with gr.Blocks(theme=theme) as demo:
+    # Two-tab UI: "Fase 1" has a single transcription output; "Fase 2" has a
+    # transcription output plus five analysis textboxes.
+    gr.Markdown("## 🗣️ Transcripció automàtica d'àudio — Mode amb dues fases")
+    with gr.Tabs():
+        with gr.Tab("Fase 1"):
+            description_string = (
+                "### 🎧 Transcripció de trucades multilingüe de bona qualitat per a transcripció fiable\n"
+                "- **v2_fast**: Inclou separació de canals i inferència ràpida.\n"
+                "- **v1.0**: Inclou inferència moderada sense separació de canals."
+            )
+            gr.Markdown(description_string)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    model_version_1 = gr.Dropdown(
+                        label="Model Version",
+                        choices=["v2_fast", "v1.0"],
+                        value="v2_fast",
+                        elem_id="fase1-model-version",
+                    )
+                    civil_channel_1 = gr.Dropdown(
+                        label="Canal del Civil (persona que truca)",
+                        choices=["Left", "Right"],
+                        value="Left",
+                    )
+                    input_1 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
+                with gr.Column(scale=1):
+                    output_1 = gr.Textbox(label="Output", lines=8)
+            with gr.Row(variant="panel"):
+                clear_btn = gr.Button("Clear")
+                submit_btn = gr.Button("Submit", variant="primary")
+            submit_btn.click(fn=transcribe_fase_1, inputs=[input_1, model_version_1, civil_channel_1], outputs=[output_1])
+            # The clear handler's three return values map onto these three components.
+            clear_btn.click(fn=clear_fase_1, inputs=[model_version_1, civil_channel_1], outputs=[input_1, model_version_1, civil_channel_1], queue=False)
+        with gr.Tab("Fase 2"):
+            description_string = (
+                "### 🧠 Transcripció de trucades multilingüe de bona qualitat per a anàlisi d'informe\n"
+                "- **v2_fast_and_detection_v1**: Inclou inferència ràpida, separació de parlants i explotació de nova informació per processos analítics i informes avançats."
+            )
+            gr.Markdown(description_string)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    model_version_2 = gr.Dropdown(
+                        label="Model Version",
+                        choices=["v2_fast_and_detection_v1"],
+                        value="v2_fast_and_detection_v1",
+                        elem_id="fase2-model-version",
+                    )
+                    civil_channel_2 = gr.Dropdown(
+                        label="Canal del Civil (persona que truca)",
+                        choices=["Left", "Right"],
+                        value="Left",
+                    )
+                    input_2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
+                with gr.Column(scale=1):
+                    output_text = gr.Textbox(label="Transcripció ASR", lines=8)
+                    output_sex = gr.Textbox(label="Gènere", lines=1)
+                    output_age = gr.Textbox(label="Edat", lines=1)
+                    output_silence = gr.Textbox(label="Detecció de silenci", lines=2)
+                    output_shout = gr.Textbox(label="Detecció de crits", lines=2)
+                    output_meteo = gr.Textbox(label="Detecció d'esdeveniment meteorològic", lines=2)
+            with gr.Row(variant="panel"):
+                clear_btn2 = gr.Button("Clear")
+                submit_btn2 = gr.Button("Submit", variant="primary")
+            # Six outputs: generate_fase_2 is presumably expected to return six values
+            # in this order — confirm against whisper_cs_fase_2.
+            submit_btn2.click(
+                fn=transcribe_fase_2_display,
+                inputs=[input_2, model_version_2, civil_channel_2],
+                outputs=[output_text, output_sex, output_age, output_silence, output_shout, output_meteo]
+            )
+            # Nine outputs, matching the nine values returned by clear_fase_2.
+            clear_btn2.click(fn=clear_fase_2, inputs=[model_version_2, civil_channel_2], outputs=[input_2, model_version_2, civil_channel_2, output_text, output_sex, output_age, output_silence, output_shout, output_meteo], queue=False)
 if __name__ == "__main__":
     demo.launch()

whisper_cs_dev.py → audio_utils.py RENAMED Viewed

@@ -1,98 +1,28 @@
-from faster_whisper import WhisperModel
-from transformers import pipeline
-from pydub import AudioSegment
 import os
-import torchaudio
 import torch
-import re
-import time
-import sys
-from pathlib import Path
-import glob
-import ctypes
 import numpy as np
-from settings import DEBUG_MODE, MODEL_PATH_V2_FAST, MODEL_PATH_V1, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH, RESAMPLING_FREQ, BATCH_SIZE, TASK
-def load_cudnn():
-    if not torch.cuda.is_available():
-        if DEBUG_MODE: print("[INFO] CUDA is not available, skipping cuDNN setup.")
-        return
-    if DEBUG_MODE: print(f"[INFO] sys.platform: {sys.platform}")
-    if sys.platform == "win32":
-        torch_lib_dir = Path(torch.__file__).parent / "lib"
-        if torch_lib_dir.exists():
-            os.add_dll_directory(str(torch_lib_dir))
-            if DEBUG_MODE: print(f"[INFO] Added DLL directory: {torch_lib_dir}")
-        else:
-            if DEBUG_MODE: print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
-    elif sys.platform == "linux":
-        site_packages = Path(torch.__file__).resolve().parents[1]
-        cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib"
-        if not cudnn_dir.exists():
-            if DEBUG_MODE: print(f"[ERROR] cudnn dir not found: {cudnn_dir}")
-            return
-        pattern = str(cudnn_dir / "libcudnn_cnn*.so*")
-        matching_files = sorted(glob.glob(pattern))
-        if not matching_files:
-            if DEBUG_MODE: print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}")
-            return
-        for so_path in matching_files:
-            try:
-                ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL)
-                if DEBUG_MODE: print(f"[INFO] Loaded: {so_path}")
-            except OSError as e:
-                if DEBUG_MODE: print(f"[WARNING] Failed to load {so_path}: {e}")
-    else:
-        if DEBUG_MODE: print(f"[WARNING] sys.platform is not win32 or linux")
 def get_settings():
-    is_cuda_available = torch.cuda.is_available()
-    if is_cuda_available:
-        device = "cuda"
-        compute_type = "default"
-    else:
-        device = "cpu"
-        compute_type = "default"
     if DEBUG_MODE: print(f"[SETTINGS] Device: {device}")
     return device, compute_type
-def load_model(use_v2_fast, device, compute_type):
-    if DEBUG_MODE:
-        print(f"[MODEL LOADING] use_v2_fast: {use_v2_fast}")
-    if use_v2_fast:
-        model = WhisperModel(
-            MODEL_PATH_V2_FAST,
-            device = device,
-            compute_type = compute_type,
-        )
-    else:
-        model = pipeline(
-            task="automatic-speech-recognition",
-            model=MODEL_PATH_V1,
-            chunk_length_s=30,
-            device=device,
-            token=os.getenv("HF_TOKEN")
-            )
-    return model
 def split_input_stereo_channels(audio_path):
     ext = os.path.splitext(audio_path)[1].lower()
@@ -109,8 +39,8 @@ def split_input_stereo_channels(audio_path):
     if len(channels) != 2:
         raise ValueError(f"[FORMAT AUDIO] Audio {audio_path} has {len(channels)} channels (instead of 2).")
-    channels[0].export(RIGHT_CHANNEL_TEMP_PATH, format="wav")  # Right
-    channels[1].export(LEFT_CHANNEL_TEMP_PATH, format="wav")  # Left
 def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
@@ -127,11 +57,10 @@ def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
     return audio_np_dtype
 def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
     input_audio, sample_rate = torchaudio.load(audio_path)
     if input_audio.shape[0] == 2:
         input_audio = torch.mean(input_audio, dim=0, keepdim=True)
@@ -148,7 +77,6 @@ def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
     return input_audio
 def process_waveforms(device: str, compute_type: str):
     left_waveform  = format_audio(LEFT_CHANNEL_TEMP_PATH, compute_type, device)
@@ -157,23 +85,42 @@ def process_waveforms(device: str, compute_type: str):
     return left_waveform, right_waveform
-def transcribe_pipeline(audio, model):
-    text = model(audio, batch_size=BATCH_SIZE, generate_kwargs={"task": TASK}, return_timestamps=True)["text"]
-    return text
-def transcribe_channels(left_waveform, right_waveform, model):
-    left_result, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
-    right_result, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")
-    left_result = list(left_result)
-    right_result = list(right_result)
-    return left_result, right_result
-# TODO refactor and rename this function
 def post_process_transcription(transcription, max_repeats=2):
     tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
@@ -226,70 +173,15 @@ def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
     return merged_transcription.strip()
-def get_segments(result, speaker_label):
-    segments = result
-    final_segments = [
-        (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
-        for seg in segments if seg.text
-    ]
-    return final_segments
-def post_process_transcripts(left_result, right_result):
-    left_segs = get_segments(left_result, "Speaker 1")
-    right_segs = get_segments(right_result, "Speaker 2")
-    merged_transcript = sorted(
-        left_segs + right_segs,
-        key=lambda x: float(x[0]) if x[0] is not None else float("inf")
-    )
-    clean_output = ""
-    for start, end, speaker, text in merged_transcript:
-        clean_output += f"[{speaker}]: {text}\n"
-    clean_output = clean_output.strip()
-    return clean_output
 def cleanup_temp_files(*file_paths):
     for path in file_paths:
         if path and os.path.exists(path):
-            if DEBUG_MODE: print(f"Removing path: {path}")
             os.remove(path)
-def generate(audio_path, use_v2_fast):
-    load_cudnn()
-    device, requested_compute_type = get_settings()
-    model = load_model(use_v2_fast, device, requested_compute_type)
-    if use_v2_fast:
-        actual_compute_type = model.model.compute_type
-    else:
-        actual_compute_type = "float32" #HF pipeline safe default
-    if DEBUG_MODE:
-        print(f"[SETTINGS] Requested compute_type: {requested_compute_type}")
-        print(f"[SETTINGS] Actual compute_type: {actual_compute_type}")
-    if use_v2_fast:
-        split_input_stereo_channels(audio_path)
-        left_waveform, right_waveform = process_waveforms(device, actual_compute_type)
-        left_result, right_result = transcribe_channels(left_waveform, right_waveform, model)
-        output = post_process_transcripts(left_result, right_result)
-        cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)
-    else:
-        audio = format_audio(audio_path, actual_compute_type, device)
-        merged_results = transcribe_pipeline(audio, model)
-        output = post_process_transcription(merged_results)
-    return output

 import os
 import torch
+import torchaudio
 import numpy as np
+import re
+from pydub import AudioSegment
+from settings import DEBUG_MODE, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH, RESAMPLING_FREQ
+import soundfile as sf
+# ------------------ DEBUG UTILITIES ------------------
+def debug_print(*args, **kwargs):
+    # Forward all arguments to print() only when DEBUG_MODE is truthy.
+    if DEBUG_MODE:
+        print(*args, **kwargs)
+# ------------------ Device Settings ------------------
 def get_settings():
+    # Return (device, compute_type): "cuda" when torch reports CUDA as available,
+    # otherwise "cpu"; compute_type is always "default".
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "default"
     if DEBUG_MODE: print(f"[SETTINGS] Device: {device}")
     return device, compute_type
+# ------------------ Audio Utilities ------------------
 def split_input_stereo_channels(audio_path):
     ext = os.path.splitext(audio_path)[1].lower()
     if len(channels) != 2:
         raise ValueError(f"[FORMAT AUDIO] Audio {audio_path} has {len(channels)} channels (instead of 2).")
+    channels[0].export(LEFT_CHANNEL_TEMP_PATH, format="wav")
+    channels[1].export(RIGHT_CHANNEL_TEMP_PATH, format="wav")
 def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
     return audio_np_dtype
 def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
     input_audio, sample_rate = torchaudio.load(audio_path)
     if input_audio.shape[0] == 2:
         input_audio = torch.mean(input_audio, dim=0, keepdim=True)
     return input_audio
 def process_waveforms(device: str, compute_type: str):
     left_waveform  = format_audio(LEFT_CHANNEL_TEMP_PATH, compute_type, device)
     return left_waveform, right_waveform
+# ------------------ Post-processing ------------------
+def get_segments(result, speaker_label):
+    # Build (start, end, speaker_label, cleaned_text) tuples from the segments in
+    # `result`, skipping segments whose text is empty.
+    # Each segment must expose .start, .end and .text.
+    segments = result
+    final_segments = [
+        (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
+        for seg in segments if seg.text
+    ]
+    return final_segments
+def post_process_transcripts(left_result, right_result, civil_channel):
+    # Merge both channels into one time-ordered transcript and return a pair:
+    # (speaker-labelled transcript, unlabelled text of all segments).
+    # civil_channel == "Left" labels the left channel "Civil"; any other value
+    # labels the right channel "Civil".
+    if civil_channel == "Left":
+        civil_segs = get_segments(left_result, "Civil")
+        operador_segs = get_segments(right_result, "Operador")
+    else:
+        civil_segs = get_segments(right_result, "Civil")
+        operador_segs = get_segments(left_result, "Operador")
+    # Sort by start time; segments whose start is None go last.
+    merged_transcript = sorted(
+        operador_segs + civil_segs,
+        key=lambda x: float(x[0]) if x[0] is not None else float("inf")
+    )
+    clean_output_asr = ""
+    clean_output_meteo = ""
+    for start, end, speaker, text in merged_transcript:
+        clean_output_asr += f"[{speaker}]: {text}\n"
+        # NOTE(review): segments are appended with no separator, so the last word
+        # of one segment runs into the first word of the next — confirm intended.
+        clean_output_meteo += f"{text}"
+    clean_output_asr = clean_output_asr.strip()
+    clean_output_meteo = clean_output_meteo.strip()
+    return clean_output_asr, clean_output_meteo
 def post_process_transcription(transcription, max_repeats=2):
     tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
     return merged_transcription.strip()
 def cleanup_temp_files(*file_paths):
+    # Delete each given path that is truthy and currently exists; other entries
+    # are skipped silently.
     for path in file_paths:
         if path and os.path.exists(path):
             os.remove(path)
+def sec_to_hhmmss(seconds):
+    # Format a duration in seconds as "HH:MM:SS". Fractional seconds are
+    # truncated, and hours are not wrapped at 24.
+    h = int(seconds // 3600)
+    m = int((seconds % 3600) // 60)
+    s = int(seconds % 60)
+    return f"{h:02d}:{m:02d}:{s:02d}"

meteo_detector.py ADDED Viewed

	@@ -0,0 +1,12 @@

+def classify_meteo_event(text, model, threshold=0.0):
+    # Return the label of the first entry in the model output for `text`, or
+    # "none" when that entry's rounded score is at or below `threshold`.
+    # presumably `model` is a text-classification pipeline that returns one list of
+    # {"label", "score"} dicts per input — TODO confirm against the caller
+    result = model(text, truncation=True, max_length=512)[0]
+    label = result[0]["label"]
+    score = result[0]["score"]
+    # The score is rounded to two decimals before the comparison, so with the
+    # default threshold of 0.0 only scores that round to 0.00 become "none".
+    if label != "none" and round(score, 2) <= threshold:
+        label = "none"
+    event = label
+    return event

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 torch
 torchaudio
-transformers==4.55.0
 ctranslate2==4.6.0
 faster_whisper==1.2.0
 hf_transfer==0.1.9
@@ -13,4 +13,4 @@ aina-gradio-theme==2.3
 spaces==0.39.0
 peft==0.11.1
 whisper_timestamped==1.15.8
-typing==3.7.4.3

 torch
 torchaudio
+transformers==4.40.2 #gated models
 ctranslate2==4.6.0
 faster_whisper==1.2.0
 hf_transfer==0.1.9
 spaces==0.39.0
 peft==0.11.1
 whisper_timestamped==1.15.8
+typing==3.7.4.3

requirements_dev.txt DELETED Viewed

@@ -1,171 +0,0 @@
-accelerate==1.10.0
-aina-gradio-theme==2.3
-aiofiles==24.1.0
-aiohappyeyeballs==2.6.1
-aiohttp==3.12.15
-aiosignal==1.4.0
-alembic==1.16.4
-annotated-types==0.7.0
-antlr4-python3-runtime==4.9.3
-anyio==4.10.0
-asteroid-filterbanks==0.4.0
-async-timeout==5.0.1
-attrs==25.3.0
-audioread==3.0.1
-av==15.0.0
-Brotli==1.1.0
-certifi==2025.8.3
-cffi==1.17.1
-charset-normalizer==3.4.2
-click==8.2.1
-coloredlogs==15.0.1
-colorlog==6.9.0
-contourpy==1.3.2
-ctranslate2==4.6.0
-cycler==0.12.1
-Cython==3.1.2
-decorator==5.2.1
-docopt==0.6.2
-dtw-python==1.5.3
-einops==0.8.1
-exceptiongroup==1.3.0
-fastapi==0.116.1
-faster-whisper==1.2.0
-ffmpeg-python==0.2.0
-ffmpy==0.6.1
-filelock==3.18.0
-flatbuffers==25.2.10
-fonttools==4.59.0
-frozenlist==1.7.0
-fsspec==2025.7.0
-future==1.0.0
-gradio==5.41.1
-gradio_client==1.11.0
-greenlet==3.2.3
-groovy==0.1.2
-h11==0.16.0
-hf-xet==1.1.7
-hf_transfer==0.1.9
-httpcore==1.0.9
-httpx==0.28.1
-huggingface-hub==0.34.3
-humanfriendly==10.0
-HyperPyYAML==1.2.2
-idna==3.10
-Jinja2==3.1.6
-joblib==1.5.1
-julius==0.2.7
-kiwisolver==1.4.8
-lazy_loader==0.4
-librosa==0.10.1
-lightning==2.5.2
-lightning-utilities==0.15.2
-llvmlite==0.44.0
-Mako==1.3.10
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-matplotlib==3.10.5
-mdurl==0.1.2
-more-itertools==10.7.0
-mpmath==1.3.0
-msgpack==1.1.1
-multidict==6.6.3
-networkx==3.4.2
-numba==0.61.2
-numpy==2.2.6
-nvidia-cublas-cu12==12.8.4.1
-nvidia-cuda-cupti-cu12==12.8.90
-nvidia-cuda-nvrtc-cu12==12.8.93
-nvidia-cuda-runtime-cu12==12.8.90
-nvidia-cudnn-cu12==9.10.2.21
-nvidia-cufft-cu12==11.3.3.83
-nvidia-cufile-cu12==1.13.1.3
-nvidia-curand-cu12==10.3.9.90
-nvidia-cusolver-cu12==11.7.3.90
-nvidia-cusparse-cu12==12.5.8.93
-nvidia-cusparselt-cu12==0.7.1
-nvidia-nccl-cu12==2.27.3
-nvidia-nvjitlink-cu12==12.8.93
-nvidia-nvtx-cu12==12.8.90
-omegaconf==2.3.0
-onnxruntime==1.22.1
-openai-whisper==20250625
-optuna==4.4.0
-orjson==3.11.1
-packaging==25.0
-pandas==2.3.1
-peft==0.11.1
-pillow==11.3.0
-platformdirs==4.3.8
-pooch==1.8.2
-primePy==1.3
-propcache==0.3.2
-protobuf==6.31.1
-psutil==5.9.8
-pyannote.audio==3.3.2
-pyannote.core==5.0.0
-pyannote.database==5.1.3
-pyannote.metrics==3.2.1
-pyannote.pipeline==3.0.1
-pycparser==2.22
-pydantic==2.11.7
-pydantic_core==2.33.2
-pydub==0.25.1
-Pygments==2.19.2
-pyparsing==3.2.3
-python-dateutil==2.9.0.post0
-python-multipart==0.0.20
-pytorch-lightning==2.5.2
-pytorch-metric-learning==2.8.1
-pytz==2025.2
-PyYAML==6.0.2
-regex==2025.7.34
-requests==2.32.4
-rich==14.1.0
-ruamel.yaml==0.18.14
-ruamel.yaml.clib==0.2.12
-ruff==0.12.7
-safehttpx==0.1.6
-safetensors==0.6.1
-scikit-learn==1.7.1
-scipy==1.15.3
-semantic-version==2.10.0
-semver==3.0.4
-sentencepiece==0.2.0
-shellingham==1.5.4
-six==1.17.0
-sniffio==1.3.1
-sortedcontainers==2.4.0
-soundfile==0.13.1
-soxr==0.5.0.post1
-spaces==0.39.0
-speechbrain==1.0.3
-SQLAlchemy==2.0.42
-starlette==0.47.2
-sympy==1.14.0
-tabulate==0.9.0
-tensorboardX==2.6.4
-threadpoolctl==3.6.0
-tiktoken==0.10.0
-tokenizers==0.21.4
-tomli==2.2.1
-tomlkit==0.13.3
-torch==2.8.0
-torch-audiomentations==0.12.0
-torch_pitch_shift==1.2.5
-torchaudio==2.8.0
-torchmetrics==1.8.0
-tqdm==4.67.1
-transformers==4.55.0
-triton==3.4.0
-typer==0.16.0
-typing==3.7.4.3
-typing-inspection==0.4.1
-typing_extensions==4.14.1
-tzdata==2025.2
-urllib3==2.5.0
-uvicorn==0.35.0
-websockets==15.0.1
-whisper-timestamped==1.15.8
-yarl==1.20.1
-yt-dlp==2025.7.21

settings.py CHANGED Viewed

@@ -1,8 +1,13 @@
 DEBUG_MODE = True
 MODEL_PATH_V1 = "projecte-aina/whisper-large-v3-tiny-caesar"
 MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
 LEFT_CHANNEL_TEMP_PATH = "temp_mono_speaker2.wav"
 RIGHT_CHANNEL_TEMP_PATH = "temp_mono_speaker1.wav"
 RESAMPLING_FREQ = 16000
 BATCH_SIZE = 1
 TASK = "transcribe"

 DEBUG_MODE = True
 MODEL_PATH_V1 = "projecte-aina/whisper-large-v3-tiny-caesar"
 MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
+MODEL_PATH_AGE_GENDER = "tiantiaf/wavlm-large-age-sex"  # loaded by whisper_cs_fase_2.get_age_gender_model
+MODEL_PATH_METEO = "jayebaku/XLMRoberta-twitter-crexdata-flood-wildfire-detector"  # text-classification model used by whisper_cs_fase_2.get_meteo_model
 LEFT_CHANNEL_TEMP_PATH = "temp_mono_speaker2.wav"
 RIGHT_CHANNEL_TEMP_PATH = "temp_mono_speaker1.wav"
 RESAMPLING_FREQ = 16000
+ORIGINAL_FREQ = 8000  # sample rate (Hz) silence_detector loads audio at before resampling to RESAMPLING_FREQ
+MIN_SIL_DURATION = 3.0  # minimum length, in seconds, for a silent stretch to be reported
+SIL_THRESHOLD = -35  # frames whose RMS level (dB relative to the loudest frame) is below this count as silent
 BATCH_SIZE = 1
 TASK = "transcribe"

shout_detector.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import librosa
+from scipy.signal import butter, sosfilt
+import numpy as np
+from settings import DEBUG_MODE, RESAMPLING_FREQ
+from audio_utils import sec_to_hhmmss
+def bandpass_filter(audio_path, RESAMPLING_FREQ, low=300, high=3400):
+    """Band-pass filter a signal with a Butterworth filter (``butter(4, ...)``).
+
+    Args:
+        audio_path: Despite the name, this is handed directly to ``sosfilt``
+            as the data to filter (``shout`` passes the loaded waveform).
+        RESAMPLING_FREQ: Sample rate in Hz; the cut-offs are normalised by
+            half of this value.
+        low: Lower cut-off frequency in Hz.
+        high: Upper cut-off frequency in Hz.
+
+    Returns:
+        The filtered signal as returned by ``scipy.signal.sosfilt``.
+    """
+    nyquist = RESAMPLING_FREQ / 2
+    band_edges = [low / nyquist, high / nyquist]
+    sections = butter(4, band_edges, btype="band", output="sos")
+    return sosfilt(sections, audio_path)
+def extract_features(audio_path, RESAMPLING_FREQ, frame=0.05):
+    """Compute the frame-level features used for intensity scoring.
+
+    Args:
+        audio_path: Despite the name, the waveform samples; it is passed as
+            ``y=`` to the librosa feature functions.
+        RESAMPLING_FREQ: Sample rate in Hz.
+        frame: Hop between frames, in seconds.
+
+    Returns:
+        ``(rms, flux, rolloff, hnr, times)``: RMS energy, onset strength,
+        spectral roll-off, the ratio of harmonic RMS to residual RMS, and
+        the time in seconds of each RMS frame.
+    """
+    hop = int(RESAMPLING_FREQ * frame)
+
+    def frame_rms(signal):
+        # First row of librosa's RMS output for the given signal.
+        return librosa.feature.rms(y=signal, hop_length=hop)[0]
+
+    rms = frame_rms(audio_path)
+    flux = librosa.onset.onset_strength(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop)
+    rolloff = librosa.feature.spectral_rolloff(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop)[0]
+    harmonic = librosa.effects.harmonic(audio_path)
+    # Whatever is left after removing the harmonic component.
+    percussive = audio_path - harmonic
+    # The small constant keeps the ratio finite when the residual is silent.
+    hnr = frame_rms(harmonic) / (frame_rms(percussive) + 1e-6)
+    frame_indices = np.arange(len(rms))
+    times = librosa.frames_to_time(frame_indices, sr=RESAMPLING_FREQ, hop_length=hop)
+    return rms, flux, rolloff, hnr, times
+def compute_intensity(rms, flux, rolloff, hnr):
+    """Combine frame-level features into one normalised intensity curve.
+
+    The score is a weighted sum of: RMS standardised against the first 30
+    frames (negative values clipped to 0), onset strength scaled by its 90th
+    percentile, roll-off scaled by its maximum, and ``1 - hnr / max(hnr)``.
+    The sum is floored at 0 and passed through ``librosa.util.normalize``.
+
+    Args:
+        rms, flux, rolloff, hnr: Per-frame feature arrays as produced by
+            ``extract_features``.
+
+    Returns:
+        The per-frame intensity array (empty when ``rms`` is empty).
+    """
+    rms_w, flux_w, roll_w, hnr_w = 3.0, 1.3, 1.0, 0.8
+    if len(rms) == 0:
+        # No frames: the reductions below are undefined on empty arrays.
+        return np.zeros(0)
+    baseline = rms[:30]
+    r = (rms - np.mean(baseline)) / (np.std(baseline) + 1e-5)
+    f = flux / (np.percentile(flux, 90) + 1e-6)
+    # A silent signal has an all-zero roll-off / ratio; dividing by that zero
+    # maximum would produce NaNs, so fall back to a neutral contribution of 0
+    # for both terms instead.
+    roll_peak = np.max(rolloff)
+    hnr_peak = np.max(hnr)
+    ro = rolloff / roll_peak if roll_peak > 0 else np.zeros_like(rolloff, dtype=float)
+    hn = hnr / hnr_peak if hnr_peak > 0 else np.ones_like(hnr, dtype=float)
+    intensity = (
+        rms_w * np.clip(r, 0, None)
+        + flux_w * f
+        + roll_w * ro
+        + hnr_w * (1 - hn)
+    )
+    intensity = np.maximum(intensity, 0)
+    return librosa.util.normalize(intensity)
+def segment_intensity(times, intensity, thr=0.25):
+    """Split a smoothed intensity curve into active events using hysteresis.
+
+    The curve is smoothed with an exponential moving average. An event opens
+    when the smoothed value reaches ``thr`` and closes once it has stayed
+    below ``0.6 * thr`` for more frames than the hang-over allows (0.15 s
+    expressed in frames).
+
+    Args:
+        times: Frame times in seconds.
+        intensity: Per-frame intensity values.
+        thr: Activation threshold.
+
+    Returns:
+        ``(events, smooth)`` where ``events`` is a list of ``(start, end)``
+        times and ``smooth`` is the smoothed curve.
+    """
+    ema_alpha = 0.45
+    # The frame spacing is read from the first two frames; with fewer than
+    # two frames there is no spacing to measure, so use no hang-over rather
+    # than failing on times[1].
+    if len(times) > 1:
+        hangover = int(0.15 / (times[1] - times[0]))
+    else:
+        hangover = 0
+    smooth = np.copy(intensity)
+    for i in range(1, len(intensity)):
+        smooth[i] = ema_alpha * intensity[i] + (1 - ema_alpha) * smooth[i - 1]
+    on_thr, off_thr = thr, thr * 0.6
+    active = False
+    counter = 0
+    events = []
+    start = None
+    for i, val in enumerate(smooth):
+        if not active and val >= on_thr:
+            active = True
+            start = times[i]
+        if active and val >= off_thr:
+            # Still loud enough: re-arm the hang-over countdown.
+            counter = hangover
+        elif active:
+            counter -= 1
+            if counter <= 0:
+                active = False
+                events.append((start, times[i]))
+                start = None
+    # An event still open at the end of the signal runs to the last frame.
+    if active and start is not None:
+        events.append((start, times[-1]))
+    return events, smooth
+def assign_levels(events, intensity, times):
+    """Label each event with a loudness level based on its median intensity.
+
+    Args:
+        events: Iterable of ``(start, end)`` times.
+        intensity: Per-frame intensity values.
+        times: Frame times aligned with ``intensity``.
+
+    Returns:
+        A list of ``(start, end, level, median, maximum)`` tuples. Events
+        that cover no frame are omitted.
+    """
+    # Checked from loudest to quietest; the first floor exceeded wins.
+    level_table = (
+        (0.8, "4 gritando"),
+        (0.6, "3 elevado"),
+        (0.4, "2 intermedio"),
+    )
+    labelled = []
+    for st, en in events:
+        window = intensity[(times >= st) & (times <= en)]
+        if window.size == 0:
+            continue
+        med = np.median(window)
+        peak = np.max(window)
+        lvl = next((name for floor, name in level_table if med > floor), "1 bajo")
+        labelled.append((st, en, lvl, med, peak))
+    return labelled
+def merge_adjacent_segments(results, gap_threshold=0.3):
+    """Fuse consecutive segments that share a level and are close in time.
+
+    Two neighbouring segments are merged when they have the same level and
+    the gap between them is at most ``gap_threshold``. The merged segment
+    keeps the first start, takes the later end, averages the running median
+    with the new one, and keeps the larger maximum.
+
+    Args:
+        results: List of ``(start, end, level, median, maximum)`` tuples.
+        gap_threshold: Largest gap for which segments are merged.
+
+    Returns:
+        A new list of tuples in the same format.
+    """
+    if not results:
+        return []
+    first_st, first_en, first_lvl, first_med, first_max = results[0]
+    merged = [[first_st, first_en, first_lvl, first_med, first_max]]
+    for st, en, lvl, med, mx in results[1:]:
+        current = merged[-1]
+        same_level = lvl == current[2]
+        if same_level and st - current[1] <= gap_threshold:
+            current[1] = en
+            current[3] = (current[3] + med) / 2
+            current[4] = max(current[4], mx)
+        else:
+            merged.append([st, en, lvl, med, mx])
+    return [tuple(segment) for segment in merged]
+def shout(audio_path):
+    """Report the time ranges of an audio file where the voice is raised.
+
+    The file is loaded as mono at ``RESAMPLING_FREQ``, band-pass filtered,
+    scored frame by frame, segmented (threshold 0.18), labelled, and merged
+    (gap of 1 second). Only segments labelled "elevado" or "gritando" are
+    reported.
+
+    Args:
+        audio_path: Path of the audio file to analyse.
+
+    Returns:
+        One line per reported segment, or a fixed message when none is found.
+    """
+    if DEBUG_MODE:
+        print("[MODEL LOADING] Loading shout model")
+    samples, rate = librosa.load(audio_path, sr=RESAMPLING_FREQ, mono=True)
+    samples = bandpass_filter(samples, rate)
+    rms, flux, rolloff, hnr, times = extract_features(samples, rate)
+    intensity = compute_intensity(rms, flux, rolloff, hnr)
+    events, _ = segment_intensity(times, intensity, thr=0.18)
+    labelled = assign_levels(events, intensity, times)
+    labelled = merge_adjacent_segments(labelled, gap_threshold=1)
+    lines = [
+        f"{sec_to_hhmmss(st)} – {sec_to_hhmmss(en)} | volumen de voz: {lvl}"
+        for st, en, lvl, _med, _max_val in labelled
+        if "elevado" in lvl or "gritando" in lvl
+    ]
+    if not lines:
+        return "No se detectaron gritos o voces elevadas"
+    return "\n".join(lines)

silence_detector.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import librosa
+import numpy as np
+from settings import DEBUG_MODE, RESAMPLING_FREQ, ORIGINAL_FREQ, MIN_SIL_DURATION, SIL_THRESHOLD
+from audio_utils import sec_to_hhmmss
+def silence(audio_path):
+    """Report prolonged silences in an audio file.
+
+    The audio is loaded as mono at ``ORIGINAL_FREQ``, resampled to
+    ``RESAMPLING_FREQ``, peak-normalised, and split into non-overlapping
+    0.1 s frames. A frame is silent when its RMS level, in dB relative to
+    the loudest frame, is below ``SIL_THRESHOLD``. Runs of silent frames
+    lasting at least ``MIN_SIL_DURATION`` seconds are reported.
+
+    Args:
+        audio_path: Path of the audio file to analyse.
+
+    Returns:
+        A message listing the detected silences, or a fixed message when
+        none is found.
+    """
+    if DEBUG_MODE:
+        print("[MODEL LOADING] Loading silence model")
+    no_silence_msg = "No se detectaron silencios prolongados"
+    # mono=True downmixes a stereo file to a single channel.
+    y, _ = librosa.load(audio_path, sr=ORIGINAL_FREQ, mono=True)
+    if y.size == 0:
+        # Nothing to analyse; np.max below would raise on an empty array.
+        return no_silence_msg
+    y = librosa.resample(y, orig_sr=ORIGINAL_FREQ, target_sr=RESAMPLING_FREQ)
+    frame_length = int(0.1 * RESAMPLING_FREQ)
+    hop_length = frame_length
+    peak = np.max(np.abs(y))
+    if peak > 0:
+        y = y / peak
+    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
+    if peak > 0:
+        rms_db = librosa.amplitude_to_db(rms, ref=np.max)
+        silence_mask = rms_db < SIL_THRESHOLD
+    else:
+        # Digitally silent input: normalising would divide by zero and a
+        # level relative to the loudest frame is meaningless, so treat
+        # every frame as silent.
+        silence_mask = np.ones(rms.shape, dtype=bool)
+    frame_duration = hop_length / RESAMPLING_FREQ
+    silence_segments = []
+    start = None
+    for i, silent in enumerate(silence_mask):
+        if silent and start is None:
+            start = i * frame_duration
+        elif not silent and start is not None:
+            end = i * frame_duration
+            if end - start >= MIN_SIL_DURATION:
+                silence_segments.append((start, end))
+            start = None
+    # Close a silence that runs to the end of the file.
+    if start is not None:
+        end = len(silence_mask) * frame_duration
+        if end - start >= MIN_SIL_DURATION:
+            silence_segments.append((start, end))
+    if not silence_segments:
+        return no_silence_msg
+    events = [f"{sec_to_hhmmss(s)} – {sec_to_hhmmss(e)}" for s, e in silence_segments]
+    return "Silencios detectados en: " + ", ".join(events)

whisper_cs.py DELETED Viewed

@@ -1,382 +0,0 @@
-import spaces
-from pydub import AudioSegment
-import os
-import torchaudio
-import torch
-import re
-import whisper_timestamped as whisper_ts
-from typing import Dict
-from faster_whisper import WhisperModel
-device = 0 if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float32
-DEBUG_MODE = True
-MODEL_PATH_V2 = "langtech-veu/whisper-timestamped-cs"
-MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
-#DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-#print("[INFO] CUDA available:", torch.cuda.is_available())
-def clean_text(input_text):
-    remove_chars = ['.', ',', ';', ':', '¿', '?', '«', '»', '-', '¡', '!', '@',
-                    '*', '{', '}', '[', ']', '=', '/', '\\', '&', '#', '…']
-    output_text = ''.join(char if char not in remove_chars else ' ' for char in input_text)
-    return ' '.join(output_text.split()).lower()
-def split_stereo_channels(audio_path):
-    ext = os.path.splitext(audio_path)[1].lower()
-    if ext == ".wav":
-        audio = AudioSegment.from_wav(audio_path)
-    elif ext == ".mp3":
-        audio = AudioSegment.from_file(audio_path, format="mp3")
-    else:
-        raise ValueError(f"Unsupported file format: {audio_path}")
-    channels = audio.split_to_mono()
-    if len(channels) != 2:
-        raise ValueError(f"Audio {audio_path} does not have 2 channels.")
-    channels[0].export(f"temp_mono_speaker1.wav", format="wav")  # Right
-    channels[1].export(f"temp_mono_speaker2.wav", format="wav")  # Left
-def format_audio(audio_path):
-    input_audio, sample_rate = torchaudio.load(audio_path)
-    if input_audio.shape[0] == 2:
-        input_audio = torch.mean(input_audio, dim=0, keepdim=True)
-    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-    input_audio = resampler(input_audio)
-    return input_audio.squeeze(), 16000
-def post_process_transcription(transcription, max_repeats=2):
-    tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
-    cleaned_tokens = []
-    repetition_count = 0
-    previous_token = None
-    for token in tokens:
-        reduced_token = re.sub(r"(\w{1,3})(\1{2,})", "", token)
-        if reduced_token == previous_token:
-            repetition_count += 1
-            if repetition_count <= max_repeats:
-                cleaned_tokens.append(reduced_token)
-        else:
-            repetition_count = 1
-            cleaned_tokens.append(reduced_token)
-        previous_token = reduced_token
-    cleaned_transcription = " ".join(cleaned_tokens)
-    cleaned_transcription = re.sub(r'\s+', ' ', cleaned_transcription).strip()
-    return cleaned_transcription
-def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
-    segments = re.split(r'(\[SPEAKER_\d{2}\])', transcription_text)
-    merged_transcription = ''
-    current_speaker = None
-    current_segment = []
-    for i in range(1, len(segments) - 1, 2):
-        speaker_tag = segments[i]
-        text = segments[i + 1].strip()
-        speaker = re.search(r'\d{2}', speaker_tag).group()
-        if speaker == current_speaker:
-            current_segment.append(text)
-        else:
-            if current_speaker is not None:
-                merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
-            current_speaker = speaker
-            current_segment = [text]
-    if current_speaker is not None:
-        merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
-    return merged_transcription.strip()
-def cleanup_temp_files(*file_paths):
-    if DEBUG_MODE: print(f"Entered cleanup_temp_files function...")
-    if DEBUG_MODE: print(f"file_paths: {file_paths}")
-    for path in file_paths:
-        if path and os.path.exists(path):
-            if DEBUG_MODE: print(f"Removing path: {path}")
-            os.remove(path)
-    if DEBUG_MODE: print(f"Exited cleanup_temp_files function.")
-'''
-try:
-    faster_model = WhisperModel(
-        MODEL_PATH_V2_FAST,
-        device="cuda" if torch.cuda.is_available() else "cpu",
-        compute_type="float16" if torch.cuda.is_available() else "int8"
-    )
-except RuntimeError as e:
-    print(f"[WARNING] Failed to load model on GPU: {e}")
-    faster_model = WhisperModel(
-        MODEL_PATH_V2_FAST,
-        device="cpu",
-        compute_type="int8"
-    )
-'''
-#faster_model = WhisperModel(MODEL_PATH_V2_FAST, device=DEVICE, compute_type="int8")
-def load_whisper_model(model_path: str):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = whisper_ts.load_model(model_path, device=device)
-    return model
-def transcribe_audio(model, audio_path: str) -> Dict:
-    try:
-        result = whisper_ts.transcribe(
-            model,
-            audio_path,
-            beam_size=5,
-            best_of=5,
-            temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
-            vad=False,
-            detect_disfluencies=True,
-        )
-        words = []
-        for segment in result.get('segments', []):
-            for word in segment.get('words', []):
-                word_text = word.get('word', '').strip()
-                if word_text.startswith(' '):
-                    word_text = word_text[1:]
-                words.append({
-                    'word': word_text,
-                    'start': word.get('start', 0),
-                    'end': word.get('end', 0),
-                    'confidence': word.get('confidence', 0)
-                })
-        return {
-            'audio_path': audio_path,
-            'text': result['text'].strip(),
-            'segments': result.get('segments', []),
-            'words': words,
-            'duration': result.get('duration', 0),
-            'success': True
-        }
-    except Exception as e:
-        return {
-            'audio_path': audio_path,
-            'error': str(e),
-            'success': False
-        }
-def generate(audio_path, use_v2_fast):
-    if DEBUG_MODE: print(f"Entering generate function...")
-    if DEBUG_MODE: print(f"use_v2_fast: {use_v2_fast}")
-    faster_model = None
-    if use_v2_fast:
-        if torch.cuda.is_available():
-            try:
-                if DEBUG_MODE: print("[INFO] GPU detected. Loading model on GPU with float16...")
-                faster_model = WhisperModel(
-                    MODEL_PATH_V2_FAST,
-                    device="cuda",
-                    compute_type="float16"
-                )
-            except RuntimeError as e:
-                print(f"[WARNING] Failed to load model on GPU: {e}")
-                if DEBUG_MODE: print("[INFO] Falling back to CPU with int8...")
-                faster_model = WhisperModel(
-                    MODEL_PATH_V2_FAST,
-                    device="cpu",
-                    compute_type="int8"
-                )
-        else:
-            if DEBUG_MODE: print("[INFO] No GPU detected. Loading model on CPU with int8...")
-            faster_model = WhisperModel(
-                MODEL_PATH_V2_FAST,
-                device="cpu",
-                compute_type="int8"
-            )
-        split_stereo_channels(audio_path)
-        left_channel_path = "temp_mono_speaker2.wav"
-        right_channel_path = "temp_mono_speaker1.wav"
-        left_waveform, _ = format_audio(left_channel_path)
-        right_waveform, _ = format_audio(right_channel_path)
-        left_waveform = left_waveform.numpy().astype("float32")
-        right_waveform = right_waveform.numpy().astype("float32")
-        left_result, _ = faster_model.transcribe(left_waveform, beam_size=5, task="transcribe")
-        right_result, _ = faster_model.transcribe(right_waveform, beam_size=5, task="transcribe")
-        left_result = list(left_result)
-        right_result = list(right_result)
-        def get_faster_segments(segments, speaker_label):
-            return [
-                (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
-                for seg in segments if seg.text
-            ]
-        left_segs = get_faster_segments(left_result, "Speaker 1")
-        right_segs = get_faster_segments(right_result, "Speaker 2")
-        merged_transcript = sorted(
-            left_segs + right_segs,
-            key=lambda x: float(x[0]) if x[0] is not None else float("inf")
-        )
-        clean_output = ""
-        for start, end, speaker, text in merged_transcript:
-            clean_output += f"[{speaker}]: {text}\n"
-        if DEBUG_MODE: print(f"clean_output: {clean_output}")
-    else:
-        model = load_whisper_model(MODEL_PATH_V2)
-        split_stereo_channels(audio_path)
-        left_channel_path = "temp_mono_speaker2.wav"
-        right_channel_path = "temp_mono_speaker1.wav"
-        left_waveform, _ = format_audio(left_channel_path)
-        right_waveform, _ = format_audio(right_channel_path)
-        left_result = transcribe_audio(model, left_waveform)
-        right_result = transcribe_audio(model, right_waveform)
-        def get_segments(result, speaker_label):
-            segments = result.get("segments", [])
-            if not segments:
-                return []
-            return [
-                (seg.get("start", 0.0), seg.get("end", 0.0), speaker_label,
-                 post_process_transcription(seg.get("text", "").strip()))
-                for seg in segments if seg.get("text")
-            ]
-        left_segs = get_segments(left_result, "Speaker 1")
-        right_segs = get_segments(right_result, "Speaker 2")
-        merged_transcript = sorted(
-            left_segs + right_segs,
-            key=lambda x: float(x[0]) if x[0] is not None else float("inf")
-        )
-        clean_output = ""
-        for start, end, speaker, text in merged_transcript:
-            clean_output += f"[{speaker}]: {text}\n"
-    cleanup_temp_files("temp_mono_speaker1.wav", "temp_mono_speaker2.wav")
-    if DEBUG_MODE: print(f"Exiting generate function...")
-    return clean_output.strip()
-'''
-def generate(audio_path, use_v2_fast):
-    if DEBUG_MODE: print(f"Entering generate function...")
-    if DEBUG_MODE: print(f"use_v2_fast: {use_v2_fast}")
-    if use_v2_fast:
-        split_stereo_channels(audio_path)
-        left_channel_path = "temp_mono_speaker2.wav"
-        right_channel_path = "temp_mono_speaker1.wav"
-        left_waveform, left_sr = format_audio(left_channel_path)
-        right_waveform, right_sr = format_audio(right_channel_path)
-        left_waveform = left_waveform.numpy().astype("float32")
-        right_waveform = right_waveform.numpy().astype("float32")
-        left_result, info = faster_model.transcribe(left_waveform, beam_size=5, task="transcribe")
-        right_result, info = faster_model.transcribe(right_waveform, beam_size=5, task="transcribe")
-        left_result = list(left_result)
-        right_result = list(right_result)
-        def get_faster_segments(segments, speaker_label):
-            return [
-                (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
-                for seg in segments if seg.text
-            ]
-        left_segs = get_faster_segments(left_result, "Speaker 1")
-        right_segs = get_faster_segments(right_result, "Speaker 2")
-        merged_transcript = sorted(
-            left_segs + right_segs,
-            key=lambda x: float(x[0]) if x[0] is not None else float("inf")
-        )
-        clean_output = ""
-        for start, end, speaker, text in merged_transcript:
-            clean_output += f"[{speaker}]: {text}\n"
-        # FIX Seems that post_merge_consecutive_segments_from_text returns an empty string
-        #clean_output = post_merge_consecutive_segments_from_text(clean_output)
-        #print('clean_output',clean_output)
-        if DEBUG_MODE: print(f"clean_output: {clean_output}")
-    else:
-        model = load_whisper_model(MODEL_PATH_V2)
-        split_stereo_channels(audio_path)
-        left_channel_path = "temp_mono_speaker2.wav"
-        right_channel_path = "temp_mono_speaker1.wav"
-        left_waveform, left_sr = format_audio(left_channel_path)
-        right_waveform, right_sr = format_audio(right_channel_path)
-        left_result = transcribe_audio(model, left_waveform)
-        right_result = transcribe_audio(model, right_waveform)
-        def get_segments(result, speaker_label):
-            segments = result.get("segments", [])
-            if not segments:
-                return []
-            return [
-                (seg.get("start", 0.0), seg.get("end", 0.0), speaker_label, post_process_transcription(seg.get("text", "").strip()))
-                for seg in segments if seg.get("text")
-            ]
-        left_segs = get_segments(left_result, "Speaker 1")
-        right_segs = get_segments(right_result, "Speaker 2")
-        merged_transcript = sorted(
-            left_segs + right_segs,
-            key=lambda x: float(x[0]) if x[0] is not None else float("inf")
-        )
-        output = ""
-        for start, end, speaker, text in merged_transcript:
-            output += f"[{speaker}]: {text}\n"
-        clean_output = output.strip()
-    if DEBUG_MODE: print(f"Clean output generated.")
-    cleanup_temp_files(
-        "temp_mono_speaker1.wav",
-        "temp_mono_speaker2.wav"
-    )
-    if DEBUG_MODE: print(f"Exiting generate function...")
-    return clean_output
-'''

whisper_cs_fase_1.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from faster_whisper import WhisperModel
+from transformers import pipeline
+import os
+import time
+from settings import MODEL_PATH_V2_FAST, MODEL_PATH_V1, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH, BATCH_SIZE, TASK
+from audio_utils import debug_print, get_settings, split_input_stereo_channels, format_audio, process_waveforms, post_process_transcripts, post_process_transcription, post_merge_consecutive_segments_from_text, cleanup_temp_files
+hf_token = os.getenv("HF_TOKEN")  # passed as token= when building the v1 pipeline; None if the variable is unset
+ASR_MODEL_V2 = None  # cached WhisperModel, created on first call to get_asr_model_v2
+ASR_MODEL_V1 = None  # cached transformers pipeline, created on first call to get_asr_model_v1
+def get_asr_model_v2(DEVICE, COMPUTE_TYPE):
+    """Return the shared v2_fast ``WhisperModel``, creating it on first use.
+
+    The model is stored in the module-level ``ASR_MODEL_V2``; once it exists
+    it is returned as-is, so ``DEVICE`` and ``COMPUTE_TYPE`` only take effect
+    on the first call.
+    """
+    global ASR_MODEL_V2
+    if ASR_MODEL_V2 is not None:
+        return ASR_MODEL_V2
+    debug_print("[MODEL LOADING] Loading ASR v2_fast model...")
+    ASR_MODEL_V2 = WhisperModel(MODEL_PATH_V2_FAST, device=DEVICE, compute_type=COMPUTE_TYPE)
+    debug_print("[MODEL LOADING]v2_fast model loaded")
+    return ASR_MODEL_V2
+def get_asr_model_v1(DEVICE):
+    """Return the shared v1 speech-recognition pipeline, creating it on first use.
+
+    The pipeline is stored in the module-level ``ASR_MODEL_V1``; once it
+    exists it is returned as-is, so ``DEVICE`` only takes effect on the
+    first call.
+    """
+    global ASR_MODEL_V1
+    if ASR_MODEL_V1 is not None:
+        return ASR_MODEL_V1
+    debug_print("[MODEL LOADING]Loading ASR v1 pipeline model...")
+    # The pipeline takes a device index: 0 when DEVICE is "cuda", -1 otherwise.
+    device_index = 0 if DEVICE == "cuda" else -1
+    ASR_MODEL_V1 = pipeline(
+        task="automatic-speech-recognition",
+        model=MODEL_PATH_V1,
+        chunk_length_s=30,
+        device=device_index,
+        token=hf_token,
+    )
+    debug_print("[MODEL LOADING]ASR v1 model loaded")
+    return ASR_MODEL_V1
+def transcribe_asr(audio, model):
+    """Run ``model`` on ``audio`` and return the ``"text"`` field of its output.
+
+    The model is called with ``BATCH_SIZE``, the ``TASK`` generation option
+    and ``return_timestamps=True``; only the text is returned.
+    """
+    output = model(
+        audio,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": TASK},
+        return_timestamps=True,
+    )
+    return output["text"]
+def transcribe_faster_asr(left_waveform, right_waveform, model):
+    """Transcribe both channels with ``model`` and return their segment lists.
+
+    ``model.transcribe`` is called for the left channel and then the right
+    one (``beam_size=5``, ``task="transcribe"``); the first element of each
+    result is then materialised into a list.
+
+    Returns:
+        ``(left_segments, right_segments)``.
+    """
+    streams = []
+    for waveform in (left_waveform, right_waveform):
+        segments, _info = model.transcribe(waveform, beam_size=5, task="transcribe")
+        streams.append(segments)
+    left_segments, right_segments = map(list, streams)
+    return left_segments, right_segments
+def generate_fase_1(audio_path, model_version, civil_channel):
+    """Transcribe an audio file with the phase-1 pipeline.
+
+    Args:
+        audio_path: Path of the input audio file.
+        model_version: ``"v2_fast"`` selects the per-channel faster-whisper
+            path; any other value uses the v1 transformers pipeline on the
+            whole file.
+        civil_channel: Forwarded to ``post_process_transcripts`` to decide
+            which channel is the civilian (v2_fast path only).
+
+    Returns:
+        The transcript text.
+    """
+    DEVICE, COMPUTE_TYPE = get_settings()
+    debug_print(f"[Fase1] Starting inference with model version: {model_version}")
+    if model_version == "v2_fast":
+        asr_model = get_asr_model_v2(DEVICE, COMPUTE_TYPE)
+        actual_compute_type = asr_model.model.compute_type
+        debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")
+        try:
+            split_input_stereo_channels(audio_path)
+            left_waveform, right_waveform = process_waveforms(DEVICE, actual_compute_type)
+            debug_print(f"[SETTINGS] Civil channel: {civil_channel}")
+            left_result, right_result = transcribe_faster_asr(left_waveform, right_waveform, asr_model)
+            text, _ = post_process_transcripts(left_result, right_result, civil_channel)
+        finally:
+            # Remove the per-channel temp files even when a step above
+            # raises, so a failed request does not leave them behind.
+            cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)
+    else:
+        actual_compute_type = "float32"  # HF pipeline safe default
+        debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")
+        asr_model = get_asr_model_v1(DEVICE)
+        audio = format_audio(audio_path, actual_compute_type, DEVICE)
+        result = transcribe_asr(audio, asr_model)
+        text = post_process_transcription(result)
+    return text

whisper_cs_fase_2.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from faster_whisper import WhisperModel
+from transformers import pipeline
+import os
+from settings import MODEL_PATH_AGE_GENDER, MODEL_PATH_METEO, MODEL_PATH_V2_FAST, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH
+from audio_utils import debug_print, get_settings, split_input_stereo_channels, process_waveforms, post_process_transcripts, post_merge_consecutive_segments_from_text, cleanup_temp_files
+from shout_detector import shout
+from silence_detector import silence
+from meteo_detector import classify_meteo_event
+from age_gender_detector import age_gender, WavLMWrapper
+# Token passed as token= when building the meteo pipeline. HF_AUTH_TOKEN is
+# read first; HF_TOKEN (the variable whisper_cs_fase_1 reads) is the fallback
+# so a deployment that defines only one of them works for both modules.
+hf_token = os.getenv("HF_AUTH_TOKEN") or os.getenv("HF_TOKEN")
+ASR_MODEL = None  # cached WhisperModel, created on first call to get_asr_model
+AGE_GENDER_MODEL = None  # cached WavLMWrapper, created on first call to get_age_gender_model
+METEO_MODEL = None  # cached text-classification pipeline, created on first call to get_meteo_model
+def get_asr_model(DEVICE, COMPUTE_TYPE):
+    """Return the shared ``WhisperModel``, creating it on first use.
+
+    The model is stored in the module-level ``ASR_MODEL``; once it exists it
+    is returned as-is, so ``DEVICE`` and ``COMPUTE_TYPE`` only take effect on
+    the first call.
+    """
+    global ASR_MODEL
+    if ASR_MODEL is not None:
+        return ASR_MODEL
+    debug_print("[MODEL LOADING]Loading ASR model...")
+    ASR_MODEL = WhisperModel(MODEL_PATH_V2_FAST, device=DEVICE, compute_type=COMPUTE_TYPE)
+    debug_print("[MODEL LOADING]ASR model loaded")
+    return ASR_MODEL
+def get_age_gender_model(DEVICE):
+    """Return the shared age/gender model, creating it on first use.
+
+    On the first call the model is loaded with
+    ``WavLMWrapper.from_pretrained``, moved to ``DEVICE``, switched to eval
+    mode and stored in the module-level ``AGE_GENDER_MODEL``. Later calls
+    return the stored model regardless of ``DEVICE``.
+    """
+    global AGE_GENDER_MODEL
+    if AGE_GENDER_MODEL is not None:
+        return AGE_GENDER_MODEL
+    debug_print("[MODEL LOADING]Loading Age/Gender model...")
+    loaded = WavLMWrapper.from_pretrained(MODEL_PATH_AGE_GENDER)
+    AGE_GENDER_MODEL = loaded.to(DEVICE)
+    AGE_GENDER_MODEL.eval()
+    debug_print("[MODEL LOADING]Age/Gender model loaded")
+    return AGE_GENDER_MODEL
+def get_meteo_model(DEVICE):
+    """Return the shared meteo text-classification pipeline, creating it on first use.
+
+    The pipeline is stored in the module-level ``METEO_MODEL``; once it
+    exists it is returned as-is, so ``DEVICE`` only takes effect on the
+    first call.
+    """
+    global METEO_MODEL
+    if METEO_MODEL is not None:
+        return METEO_MODEL
+    debug_print("[MODEL LOADING]Loading Meteo model...")
+    # The pipeline takes a device index: 0 when DEVICE is "cuda", -1 otherwise.
+    device_index = 0 if DEVICE == "cuda" else -1
+    METEO_MODEL = pipeline(
+        task="text-classification",
+        model=MODEL_PATH_METEO,
+        tokenizer=MODEL_PATH_METEO,
+        top_k=None,
+        device=device_index,
+        token=hf_token,
+    )
+    debug_print("[MODEL LOADING]Meteo model loaded")
+    return METEO_MODEL
+def transcribe_faster_asr(left_waveform, right_waveform, model):
+    """Transcribe both channels with ``model`` and return their segment lists.
+
+    ``model.transcribe`` is called for the left channel and then the right
+    one (``beam_size=5``, ``task="transcribe"``); the first element of each
+    result is then materialised into a list.
+
+    Returns:
+        ``(left_segments, right_segments)``.
+    """
+    streams = []
+    for waveform in (left_waveform, right_waveform):
+        segments, _info = model.transcribe(waveform, beam_size=5, task="transcribe")
+        streams.append(segments)
+    left_segments, right_segments = map(list, streams)
+    return left_segments, right_segments
+def generate_fase_2(audio_path, model_version, civil_channel):
+    """Run the phase-2 pipeline: transcription plus audio and text detectors.
+
+    Args:
+        audio_path: Path of the input audio file.
+        model_version: Accepted for signature compatibility; not used here.
+        civil_channel: ``"Left"`` selects the left channel as the civilian;
+            any other value selects the right channel.
+
+    Returns:
+        ``(text, sex, age, silence_event, shout_event, meteo_event)``.
+    """
+    DEVICE, COMPUTE_TYPE = get_settings()
+    asr_model = get_asr_model(DEVICE, COMPUTE_TYPE)
+    age_gender_model = get_age_gender_model(DEVICE)
+    meteo_model = get_meteo_model(DEVICE)
+    actual_compute_type = asr_model.model.compute_type
+    debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")
+    try:
+        split_input_stereo_channels(audio_path)
+        left_waveform, right_waveform = process_waveforms(DEVICE, actual_compute_type)
+        debug_print(f"[SETTINGS] Civil channel: {civil_channel}")
+        left_result, right_result = transcribe_faster_asr(left_waveform, right_waveform, asr_model)
+        # Silence is measured on the original file; shouting and age/gender
+        # only on the civilian channel.
+        silence_event = silence(audio_path)
+        civil_is_left = civil_channel == "Left"
+        civil_waveform = left_waveform if civil_is_left else right_waveform
+        civil_path = LEFT_CHANNEL_TEMP_PATH if civil_is_left else RIGHT_CHANNEL_TEMP_PATH
+        shout_event = shout(civil_path)
+        age, sex, age_group = age_gender(civil_waveform, age_gender_model, DEVICE)
+        age = f"{age_group} (aprox. {age} años)"
+        clean_output_asr, clean_output_meteo = post_process_transcripts(left_result, right_result, civil_channel)
+        text = '\n' + clean_output_asr
+        meteo_event = classify_meteo_event(clean_output_meteo, meteo_model, threshold=0.0)
+    finally:
+        # shout() reads the civilian temp file, so cleanup must come after
+        # it; the finally block also removes the files when a step raises.
+        cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)
+    return text, sex, age, silence_event, shout_event, meteo_event