AbirMessaoudi committed on
Commit 1619dcb · verified · 1 Parent(s): 3d88604

fase_1, fase_2 releases (#46)

- Fase_1 and Fase_2 releases, code cleaned (d6fb6a283d102ccaf8f654e51575987d4045b6d6)

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🤫
  colorFrom: indigo
  colorTo: red
  sdk: gradio
- sdk_version: 5.41.1
+ sdk_version: 4.20.0
  app_file: app.py
  pinned: false
  tags:
@@ -89,7 +89,7 @@ Per descarregar i córrer la imatge de docker:
 
  ```
  docker run -d -p 7860:7860 --name asr-inference --platform=linux/amd64 \
- registry.hf.space/bsc-lt-asr-inference:latest python app.py
+ registry.hf.space/projecte-aina-asr-inference:latest python app.py
  ```
 
age_gender_detector.py ADDED
@@ -0,0 +1,299 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.autograd import Function
6
+ from torch import tensor
7
+ from transformers import Wav2Vec2FeatureExtractor, WavLMModel
8
+ import transformers.models.wavlm.modeling_wavlm as wavlm
9
+ from huggingface_hub import PyTorchModelHubMixin
10
+ from speechbrain.lobes.models.huggingface_transformers.huggingface import make_padding_masks
11
+
12
+
13
+ class RevGrad(Function):
14
+ @staticmethod
15
+ def forward(ctx, input_, alpha_):
16
+ ctx.save_for_backward(input_, alpha_)
17
+ return input_
18
+
19
+ @staticmethod
20
+ def backward(ctx, grad_output):
21
+ _, alpha_ = ctx.saved_tensors
22
+ grad_input = -grad_output * alpha_ if ctx.needs_input_grad[0] else None
23
+ return grad_input, None
24
+
25
+
26
+ revgrad = RevGrad.apply
27
+
28
+
29
+ class RevGradLayer(nn.Module):
30
+ def __init__(self, alpha=1.):
31
+ super().__init__()
32
+ self._alpha = tensor(alpha, requires_grad=False)
33
+
34
+ def forward(self, x):
35
+ return revgrad(x, self._alpha)
36
+
37
+
38
+ class WavLMEncoderLayer(nn.Module):
39
+ def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
40
+ super().__init__()
41
+ self.attention = wavlm.WavLMAttention(
42
+ embed_dim=config.hidden_size,
43
+ num_heads=config.num_attention_heads,
44
+ dropout=config.attention_dropout,
45
+ num_buckets=config.num_buckets,
46
+ max_distance=config.max_bucket_distance,
47
+ has_relative_position_bias=has_relative_position_bias,
48
+ )
49
+ self.dropout = nn.Dropout(config.hidden_dropout)
50
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
51
+ self.feed_forward = wavlm.WavLMFeedForward(config)
52
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
53
+ self.config = config
54
+
55
+
56
+ def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
57
+ attn_residual = hidden_states
58
+ hidden_states, attn_weights, position_bias = self.attention(
59
+ hidden_states,
60
+ attention_mask=attention_mask,
61
+ position_bias=position_bias,
62
+ output_attentions=output_attentions,
63
+ index=index,
64
+ )
65
+ hidden_states = self.dropout(hidden_states)
66
+ hidden_states = attn_residual + hidden_states
67
+
68
+ hidden_states = self.layer_norm(hidden_states)
69
+ hidden_states = hidden_states + self.feed_forward(hidden_states)
70
+ hidden_states = self.final_layer_norm(hidden_states)
71
+ outputs = (hidden_states, position_bias)
72
+
73
+ if output_attentions:
74
+ outputs += (attn_weights,)
75
+
76
+ return outputs
77
+
78
+
79
+ class WavLMEncoderLayerStableLayerNorm(nn.Module):
80
+ def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
81
+ super().__init__()
82
+ self.attention = wavlm.WavLMAttention(
83
+ embed_dim=config.hidden_size,
84
+ num_heads=config.num_attention_heads,
85
+ dropout=config.attention_dropout,
86
+ num_buckets=config.num_buckets,
87
+ max_distance=config.max_bucket_distance,
88
+ has_relative_position_bias=has_relative_position_bias,
89
+ )
90
+ self.dropout = nn.Dropout(config.hidden_dropout)
91
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
92
+ self.feed_forward = wavlm.WavLMFeedForward(config)
93
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
94
+ self.config = config
95
+
96
+ def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
97
+ attn_residual = hidden_states
98
+ hidden_states = self.layer_norm(hidden_states)
99
+ hidden_states, attn_weights, position_bias = self.attention(
100
+ hidden_states,
101
+ attention_mask=attention_mask,
102
+ position_bias=position_bias,
103
+ output_attentions=output_attentions,
104
+ )
105
+ hidden_states = self.dropout(hidden_states)
106
+ hidden_states = attn_residual + hidden_states
107
+ hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
108
+
109
+ outputs = (hidden_states, position_bias)
110
+
111
+ if output_attentions:
112
+ outputs += (attn_weights,)
113
+
114
+ return outputs
115
+
116
+
117
+ class WavLMWrapper(nn.Module, PyTorchModelHubMixin):
118
+
119
+ def __init__(
120
+ self,
121
+ pretrain_model="wavlm_large",
122
+ hidden_dim=256,
123
+ freeze_params=True,
124
+ output_class_num=4,
125
+ use_conv_output=True,
126
+ apply_reg=False
127
+ ):
128
+ super().__init__()
129
+ self.pretrain_model = pretrain_model
130
+ self.use_conv_output = use_conv_output
131
+
132
+ # Load backbone
133
+ if self.pretrain_model == "wavlm":
134
+ self.backbone_model = WavLMModel.from_pretrained(
135
+ "microsoft/wavlm-base-plus",
136
+ output_hidden_states=True,
137
+ )
138
+ elif self.pretrain_model == "wavlm_large":
139
+ self.processor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-large')
140
+ self.backbone_model = WavLMModel.from_pretrained(
141
+ "microsoft/wavlm-large",
142
+ output_hidden_states=True,
143
+ )
144
+
145
+ # Keep original encoder layers (no LoRA)
146
+ state_dict = self.backbone_model.state_dict()
147
+ self.model_config = self.backbone_model.config
148
+ if self.pretrain_model == "wavlm":
149
+ self.backbone_model.encoder.layers = nn.ModuleList(
150
+ [WavLMEncoderLayer(i, self.model_config, has_relative_position_bias=(i == 0))
151
+ for i in range(self.model_config.num_hidden_layers)]
152
+ )
153
+ else:
154
+ self.backbone_model.encoder.layers = nn.ModuleList(
155
+ [WavLMEncoderLayerStableLayerNorm(i, self.model_config, has_relative_position_bias=(i == 0))
156
+ for i in range(self.model_config.num_hidden_layers)]
157
+ )
158
+ self.backbone_model.load_state_dict(state_dict, strict=False)
159
+
160
+ # Freeze weights if requested
161
+ if freeze_params:
162
+ for p in self.backbone_model.parameters():
163
+ p.requires_grad = False
164
+
165
+ # Conv projection layers
166
+ self.model_seq = nn.Sequential(
167
+ nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1),
168
+ nn.ReLU(),
169
+ nn.Dropout(0.1),
170
+ nn.Conv1d(hidden_dim, hidden_dim, 1),
171
+ nn.ReLU(),
172
+ nn.Dropout(0.1),
173
+ nn.Conv1d(hidden_dim, hidden_dim, 1)
174
+ )
175
+
176
+ # Layer weights
177
+ num_layers = self.model_config.num_hidden_layers + 1 if use_conv_output else self.model_config.num_hidden_layers
178
+ self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
179
+
180
+ # Output heads
181
+ if apply_reg:
182
+ self.age_dist_layer = nn.Sequential(
183
+ nn.Linear(hidden_dim, hidden_dim),
184
+ nn.ReLU(),
185
+ nn.Linear(hidden_dim, 1),
186
+ nn.Sigmoid()
187
+ )
188
+ else:
189
+ self.age_dist_layer = nn.Sequential(
190
+ nn.Linear(hidden_dim, hidden_dim),
191
+ nn.ReLU(),
192
+ nn.Linear(hidden_dim, 7)
193
+ )
194
+
195
+ self.sex_layer = nn.Sequential(
196
+ nn.Linear(hidden_dim, hidden_dim),
197
+ nn.ReLU(),
198
+ nn.Linear(hidden_dim, 2)
199
+ )
200
+
201
+ def forward(self, x, length=None, return_feature=False, pred="age_dist_sex"):
202
+ # Feature extraction
203
+ if self.pretrain_model == "wavlm_large":
204
+ with torch.no_grad():
205
+ signal, attention_mask = [], []
206
+ if length is not None:
207
+ attention_mask = make_padding_masks(x, wav_len=length/length.max()).to(x.device)
208
+ else:
209
+ attention_mask = make_padding_masks(x, wav_len=torch.tensor([1]).to(x.device)).to(x.device)
210
+
211
+ for idx in range(len(x)):
212
+ input_vals = self.processor(x[idx], sampling_rate=16_000, return_tensors="pt", padding=True)
213
+ signal.append(input_vals["input_values"][0].to(x.device))
214
+ signal = torch.stack(signal)
215
+
216
+ if length is not None:
217
+ length = self.get_feat_extract_output_lengths(length.detach().cpu()).cuda()
218
+
219
+ if self.pretrain_model == "wavlm":
220
+ x = self.backbone_model(x, output_hidden_states=True).hidden_states
221
+ else:
222
+ x = self.backbone_model(signal, attention_mask=attention_mask, output_hidden_states=True).hidden_states
223
+
224
+ # Weighted sum of layers
225
+ stacked_feature = torch.stack(x, dim=0) if self.use_conv_output else torch.stack(x, dim=0)[1:]
226
+ _, *origin_shape = stacked_feature.shape
227
+ stacked_feature = stacked_feature.view(stacked_feature.shape[0], -1)
228
+ norm_weights = F.softmax(self.weights, dim=-1)
229
+ weighted_feature = (norm_weights.unsqueeze(-1) * stacked_feature).sum(dim=0)
230
+ features = weighted_feature.view(*origin_shape)
231
+
232
+ # Conv projection
233
+ features = self.model_seq(features.transpose(1, 2)).transpose(1, 2)
234
+
235
+ # Pooling
236
+ if length is not None:
237
+ mean = []
238
+ for snt_id in range(features.shape[0]):
239
+ actual_size = length[snt_id]
240
+ mean.append(torch.mean(features[snt_id, 0:actual_size, ...], dim=0))
241
+ features = torch.stack(mean)
242
+ else:
243
+ features = torch.mean(features, dim=1)
244
+
245
+ # Predictions
246
+ age_pred = self.age_dist_layer(features)
247
+ sex_pred = self.sex_layer(features)
248
+
249
+ if return_feature:
250
+ return age_pred, sex_pred, features
251
+ return age_pred, sex_pred
252
+
253
+ # Huggingface conv output length helper
254
+ def get_feat_extract_output_lengths(self, input_length):
255
+ def _conv_out_length(input_length, kernel_size, stride):
256
+ return (input_length - kernel_size) // stride + 1
257
+ for kernel_size, stride in zip(self.backbone_model.config.conv_kernel, self.backbone_model.config.conv_stride):
258
+ input_length = _conv_out_length(input_length, kernel_size, stride)
259
+ return input_length
260
+
261
+ def age_gender(audio_waveform_np, model, device):
262
+ #numpy2tensor
263
+ if isinstance(audio_waveform_np, np.ndarray):
264
+ tensor = torch.from_numpy(audio_waveform_np)
265
+ elif isinstance(audio_waveform_np, torch.Tensor):
266
+ tensor = audio_waveform_np
267
+
268
+ if tensor.dim() == 1:
269
+ tensor = tensor.unsqueeze(0)
270
+
271
+ tensor = tensor.to(torch.device(device))
272
+
273
+ if tensor.dtype not in (torch.float32, torch.float16):
274
+ tensor = tensor.float()
275
+
276
+ with torch.no_grad():
277
+ wavlm_outputs, wavlm_sex_outputs = model(tensor)
278
+
279
+ age_pred = wavlm_outputs.detach().cpu().numpy().flatten() * 100.0
280
+ sex_prob = F.softmax(wavlm_sex_outputs, dim=1)
281
+ sex_labels_es = ["Femenino", "Masculino"]
282
+ sex_idx = int(torch.argmax(sex_prob).detach().cpu().item())
283
+ sex_pred = sex_labels_es[sex_idx]
284
+
285
+ try:
286
+ age_value = int(round(float(age_pred[0])))
287
+ if age_value < 20:
288
+ age_group = "joven (menor de 20)"
289
+ elif age_value < 35:
290
+ age_group = "adulto (20–35)"
291
+ elif age_value < 60:
292
+ age_group = "mediana edad (35–60)"
293
+ else:
294
+ age_group = "mayor (60+)"
295
+ except Exception:
296
+ age_value = None
297
+ age_group = "desconocido"
298
+
299
+ return str(age_value) if age_value is not None else "N/A", sex_pred, age_group
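
For quick manual testing, here is a minimal sketch of driving the new detector outside the Gradio app. It assumes the `MODEL_PATH_AGE_GENDER` checkpoint from `settings.py` and a mono clip resampled to 16 kHz; the file name is a placeholder.

```python
import librosa
import torch

from age_gender_detector import WavLMWrapper, age_gender
from settings import MODEL_PATH_AGE_GENDER, RESAMPLING_FREQ

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained wrapper once and switch it to inference mode.
model = WavLMWrapper.from_pretrained(MODEL_PATH_AGE_GENDER).to(device)
model.eval()

# "example_call.wav" is a placeholder; any mono clip resampled to 16 kHz works.
waveform, _ = librosa.load("example_call.wav", sr=RESAMPLING_FREQ, mono=True)

age, sex, age_group = age_gender(waveform, model, device)
print("age:", age, "| group:", age_group, "| sex:", sex)
```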
app.py CHANGED
@@ -1,40 +1,109 @@
 
1
  import gradio as gr
2
- from whisper_cs_dev import generate
3
- from AinaTheme import theme
4
  import spaces
5
 
6
- @spaces.GPU
7
- def transcribe(inputs, model_version):
8
  if inputs is None:
9
- raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer "\
10
- "o enregistreu un àudio abans d'enviar la vostra sol·licitud")
 
 
 
 
 
 
 
11
 
12
- use_v2_fast = model_version == "v2_fast"
13
- return generate(audio_path=inputs, use_v2_fast=use_v2_fast)
14
 
15
- description_string = "Transcripció automàtica de micròfon o de fitxers d'àudio.\n Aquest demostrador s'ha desenvolupat per"\
16
- " comprovar els models de reconeixement de parla per a enregistraments estèreo de mòbils."
17
 
18
- def clear():
19
- return None, "v2_fast"
 
 
 
 
 
 
20
 
21
- with gr.Blocks() as demo:
22
- gr.Markdown(description_string)
23
- with gr.Row():
24
- with gr.Column(scale=1):
25
- model_version = gr.Dropdown(label="Model Version", choices=["v2_fast", "v1.0"], value="v2_fast")
26
 
27
- input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
 
 
28
 
29
- with gr.Column(scale=1):
30
- output = gr.Textbox(label="Output", lines=8)
31
 
32
- with gr.Row(variant="panel"):
33
- clear_btn = gr.Button("Clear")
34
- submit_btn = gr.Button("Submit", variant="primary")
 
35
 
36
- submit_btn.click(fn=transcribe, inputs=[input, model_version], outputs=[output])
37
- clear_btn.click(fn=clear, inputs=[], outputs=[input, model_version], queue=False)
38
 
39
  if __name__ == "__main__":
40
  demo.launch()
 
 
1
+ import os
2
  import gradio as gr
 
 
3
  import spaces
4
 
5
+ from whisper_cs_fase_1 import generate_fase_1
6
+ from whisper_cs_fase_2 import generate_fase_2
7
+ from AinaTheme import theme
8
+
9
+ @spaces.GPU()
10
+ def transcribe_fase_1(inputs: str, model_version: str, civil_channel: str):
11
+ if inputs is None:
12
+ raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud")
13
+ return generate_fase_1(audio_path=inputs, model_version=model_version, civil_channel=civil_channel)
14
+
15
+ @spaces.GPU()
16
+ def transcribe_fase_2_display(inputs: str, model_version: str, civil_channel: str):
17
  if inputs is None:
18
+ raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer o enregistreu un àudio abans d'enviar la vostra sol·licitud")
19
+ return generate_fase_2(audio_path=inputs, model_version=model_version, civil_channel=civil_channel)
20
+
21
+
22
+ def clear_fase_1(model_version, civil_channel):
23
+ return None, model_version, civil_channel
24
+
25
+ def clear_fase_2(model_version, civil_channel):
26
+ return None, model_version, civil_channel, "", "", "", "", "", ""
27
 
 
 
28
 
29
+ with gr.Blocks(theme=theme) as demo:
30
+ gr.Markdown("## 🗣️ Transcripció automàtica d'àudio Mode amb dues fases")
31
 
32
+ with gr.Tabs():
33
+ with gr.Tab("Fase 1"):
34
+ description_string = (
35
+ "### 🎧 Transcripció de trucades multilingüe de bona qualitat per a transcripció fiable\n"
36
+ "- **v2_fast**: Inclou separació de canals i inferència ràpida.\n"
37
+ "- **v1.0**: Inclou inferència moderada sense separació de canals."
38
+ )
39
+ gr.Markdown(description_string)
40
 
41
+ with gr.Row():
42
+ with gr.Column(scale=1):
43
+ model_version_1 = gr.Dropdown(
44
+ label="Model Version",
45
+ choices=["v2_fast", "v1.0"],
46
+ value="v2_fast",
47
+ elem_id="fase1-model-version",
48
+ )
49
+ civil_channel_1 = gr.Dropdown(
50
+ label="Canal del Civil (persona que truca)",
51
+ choices=["Left", "Right"],
52
+ value="Left",
53
+ )
54
+ input_1 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
55
+ with gr.Column(scale=1):
56
+ output_1 = gr.Textbox(label="Output", lines=8)
57
 
58
+ with gr.Row(variant="panel"):
59
+ clear_btn = gr.Button("Clear")
60
+ submit_btn = gr.Button("Submit", variant="primary")
61
 
62
+ submit_btn.click(fn=transcribe_fase_1, inputs=[input_1, model_version_1, civil_channel_1], outputs=[output_1])
63
+ clear_btn.click(fn=clear_fase_1, inputs=[model_version_1, civil_channel_1], outputs=[input_1, model_version_1, civil_channel_1], queue=False)
64
 
65
+ with gr.Tab("Fase 2"):
66
+ description_string = (
67
+ "### 🧠 Transcripció de trucades multilingüe de bona qualitat per a anàlisi d'informe\n"
68
+ "- **v2_fast_and_detection_v1**: Inclou inferència ràpida, separació de parlants i explotació de nova informació per processos analítics i informes avançats."
69
+ )
70
+ gr.Markdown(description_string)
71
+
72
+ with gr.Row():
73
+ with gr.Column(scale=1):
74
+ model_version_2 = gr.Dropdown(
75
+ label="Model Version",
76
+ choices=["v2_fast_and_detection_v1"],
77
+ value="v2_fast_and_detection_v1",
78
+ elem_id="fase2-model-version",
79
+ )
80
+ civil_channel_2 = gr.Dropdown(
81
+ label="Canal del Civil (persona que truca)",
82
+ choices=["Left", "Right"],
83
+ value="Left",
84
+ )
85
+ input_2 = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
86
+ with gr.Column(scale=1):
87
+ output_text = gr.Textbox(label="Transcripció ASR", lines=8)
88
+ output_sex = gr.Textbox(label="Gènere", lines=1)
89
+ output_age = gr.Textbox(label="Edat", lines=1)
90
+ output_silence = gr.Textbox(label="Detecció de silenci", lines=2)
91
+ output_shout = gr.Textbox(label="Detecció de crits", lines=2)
92
+ output_meteo = gr.Textbox(label="Detecció d'esdeveniment meteorològic", lines=2)
93
+
94
+ with gr.Row(variant="panel"):
95
+ clear_btn2 = gr.Button("Clear")
96
+ submit_btn2 = gr.Button("Submit", variant="primary")
97
+
98
+ submit_btn2.click(
99
+ fn=transcribe_fase_2_display,
100
+ inputs=[input_2, model_version_2, civil_channel_2],
101
+ outputs=[output_text, output_sex, output_age, output_silence, output_shout, output_meteo]
102
+ )
103
+
104
+ clear_btn2.click(fn=clear_fase_2, inputs=[model_version_2, civil_channel_2], outputs=[input_2, model_version_2, civil_channel_2, output_text, output_sex, output_age, output_silence, output_shout, output_meteo], queue=False)
105
 
 
 
106
 
107
  if __name__ == "__main__":
108
  demo.launch()
109
+
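
As a rough headless smoke test of the wiring above, the tab callbacks can be called directly (a sketch only; the audio path is hypothetical, and passing `None` raises the `gr.Error` defined in both handlers):

```python
# Minimal headless check of the Fase 1 and Fase 2 callbacks (sketch only).
from app import transcribe_fase_1, transcribe_fase_2_display

AUDIO = "stereo_call.wav"  # placeholder path to a 2-channel recording

print(transcribe_fase_1(AUDIO, "v2_fast", "Left"))

text, sex, age, silence_ev, shout_ev, meteo_ev = transcribe_fase_2_display(
    AUDIO, "v2_fast_and_detection_v1", "Left"
)
print(text, sex, age, silence_ev, shout_ev, meteo_ev, sep="\n")
```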
whisper_cs_dev.py → audio_utils.py RENAMED
@@ -1,98 +1,28 @@
1
- from faster_whisper import WhisperModel
2
- from transformers import pipeline
3
- from pydub import AudioSegment
4
  import os
5
- import torchaudio
6
  import torch
7
- import re
8
- import time
9
- import sys
10
- from pathlib import Path
11
- import glob
12
- import ctypes
13
  import numpy as np
 
 
 
 
14
 
15
- from settings import DEBUG_MODE, MODEL_PATH_V2_FAST, MODEL_PATH_V1, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH, RESAMPLING_FREQ, BATCH_SIZE, TASK
16
-
17
- def load_cudnn():
18
-
19
- if not torch.cuda.is_available():
20
- if DEBUG_MODE: print("[INFO] CUDA is not available, skipping cuDNN setup.")
21
- return
22
-
23
- if DEBUG_MODE: print(f"[INFO] sys.platform: {sys.platform}")
24
- if sys.platform == "win32":
25
- torch_lib_dir = Path(torch.__file__).parent / "lib"
26
- if torch_lib_dir.exists():
27
- os.add_dll_directory(str(torch_lib_dir))
28
- if DEBUG_MODE: print(f"[INFO] Added DLL directory: {torch_lib_dir}")
29
- else:
30
- if DEBUG_MODE: print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
31
-
32
- elif sys.platform == "linux":
33
- site_packages = Path(torch.__file__).resolve().parents[1]
34
- cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib"
35
-
36
- if not cudnn_dir.exists():
37
- if DEBUG_MODE: print(f"[ERROR] cudnn dir not found: {cudnn_dir}")
38
- return
39
-
40
- pattern = str(cudnn_dir / "libcudnn_cnn*.so*")
41
- matching_files = sorted(glob.glob(pattern))
42
- if not matching_files:
43
- if DEBUG_MODE: print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}")
44
- return
45
-
46
- for so_path in matching_files:
47
- try:
48
- ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL)
49
- if DEBUG_MODE: print(f"[INFO] Loaded: {so_path}")
50
- except OSError as e:
51
- if DEBUG_MODE: print(f"[WARNING] Failed to load {so_path}: {e}")
52
- else:
53
- if DEBUG_MODE: print(f"[WARNING] sys.platform is not win32 or linux")
54
-
55
 
 
56
  def get_settings():
57
 
58
- is_cuda_available = torch.cuda.is_available()
59
- if is_cuda_available:
60
- device = "cuda"
61
- compute_type = "default"
62
-
63
- else:
64
- device = "cpu"
65
- compute_type = "default"
66
 
67
  if DEBUG_MODE: print(f"[SETTINGS] Device: {device}")
68
 
69
  return device, compute_type
70
 
71
-
72
-
73
- def load_model(use_v2_fast, device, compute_type):
74
-
75
- if DEBUG_MODE:
76
- print(f"[MODEL LOADING] use_v2_fast: {use_v2_fast}")
77
-
78
- if use_v2_fast:
79
- model = WhisperModel(
80
- MODEL_PATH_V2_FAST,
81
- device = device,
82
- compute_type = compute_type,
83
- )
84
- else:
85
- model = pipeline(
86
- task="automatic-speech-recognition",
87
- model=MODEL_PATH_V1,
88
- chunk_length_s=30,
89
- device=device,
90
- token=os.getenv("HF_TOKEN")
91
- )
92
-
93
- return model
94
-
95
-
96
  def split_input_stereo_channels(audio_path):
97
 
98
  ext = os.path.splitext(audio_path)[1].lower()
@@ -109,8 +39,8 @@ def split_input_stereo_channels(audio_path):
109
  if len(channels) != 2:
110
  raise ValueError(f"[FORMAT AUDIO] Audio {audio_path} has {len(channels)} channels (instead of 2).")
111
 
112
- channels[0].export(RIGHT_CHANNEL_TEMP_PATH, format="wav") # Right
113
- channels[1].export(LEFT_CHANNEL_TEMP_PATH, format="wav") # Left
114
 
115
 
116
  def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
@@ -127,11 +57,10 @@ def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
127
 
128
  return audio_np_dtype
129
 
130
-
131
  def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
132
 
133
  input_audio, sample_rate = torchaudio.load(audio_path)
134
-
135
  if input_audio.shape[0] == 2:
136
  input_audio = torch.mean(input_audio, dim=0, keepdim=True)
137
 
@@ -148,7 +77,6 @@ def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
148
  return input_audio
149
 
150
 
151
-
152
  def process_waveforms(device: str, compute_type: str):
153
 
154
  left_waveform = format_audio(LEFT_CHANNEL_TEMP_PATH, compute_type, device)
@@ -157,23 +85,42 @@ def process_waveforms(device: str, compute_type: str):
157
  return left_waveform, right_waveform
158
 
159
 
160
- def transcribe_pipeline(audio, model):
161
- text = model(audio, batch_size=BATCH_SIZE, generate_kwargs={"task": TASK}, return_timestamps=True)["text"]
162
- return text
163
 
 
 
 
 
 
164
 
165
- def transcribe_channels(left_waveform, right_waveform, model):
166
 
167
- left_result, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
168
- right_result, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")
169
 
170
- left_result = list(left_result)
171
- right_result = list(right_result)
 
 
 
 
172
 
173
- return left_result, right_result
 
 
175
 
176
- # TODO refactor and rename this function
177
  def post_process_transcription(transcription, max_repeats=2):
178
 
179
  tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
@@ -226,70 +173,15 @@ def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
226
 
227
  return merged_transcription.strip()
228
 
229
-
230
- def get_segments(result, speaker_label):
231
-
232
- segments = result
233
- final_segments = [
234
- (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
235
- for seg in segments if seg.text
236
- ]
237
-
238
- return final_segments
239
-
240
-
241
- def post_process_transcripts(left_result, right_result):
242
-
243
- left_segs = get_segments(left_result, "Speaker 1")
244
- right_segs = get_segments(right_result, "Speaker 2")
245
-
246
- merged_transcript = sorted(
247
- left_segs + right_segs,
248
- key=lambda x: float(x[0]) if x[0] is not None else float("inf")
249
- )
250
-
251
- clean_output = ""
252
- for start, end, speaker, text in merged_transcript:
253
- clean_output += f"[{speaker}]: {text}\n"
254
- clean_output = clean_output.strip()
255
-
256
- return clean_output
257
-
258
-
259
  def cleanup_temp_files(*file_paths):
260
 
261
  for path in file_paths:
262
  if path and os.path.exists(path):
263
- if DEBUG_MODE: print(f"Removing path: {path}")
264
  os.remove(path)
265
 
 
 
 
 
 
266
 
267
-
268
-
269
- def generate(audio_path, use_v2_fast):
270
-
271
- load_cudnn()
272
- device, requested_compute_type = get_settings()
273
- model = load_model(use_v2_fast, device, requested_compute_type)
274
-
275
- if use_v2_fast:
276
- actual_compute_type = model.model.compute_type
277
- else:
278
- actual_compute_type = "float32" #HF pipeline safe default
279
-
280
- if DEBUG_MODE:
281
- print(f"[SETTINGS] Requested compute_type: {requested_compute_type}")
282
- print(f"[SETTINGS] Actual compute_type: {actual_compute_type}")
283
-
284
- if use_v2_fast:
285
- split_input_stereo_channels(audio_path)
286
- left_waveform, right_waveform = process_waveforms(device, actual_compute_type)
287
- left_result, right_result = transcribe_channels(left_waveform, right_waveform, model)
288
- output = post_process_transcripts(left_result, right_result)
289
- cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)
290
- else:
291
- audio = format_audio(audio_path, actual_compute_type, device)
292
- merged_results = transcribe_pipeline(audio, model)
293
- output = post_process_transcription(merged_results)
294
-
295
- return output
 
 
 
 
1
  import os
 
2
  import torch
3
+ import torchaudio
 
 
 
 
 
4
  import numpy as np
5
+ import re
6
+ from pydub import AudioSegment
7
+ from settings import DEBUG_MODE, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH, RESAMPLING_FREQ
8
+ import soundfile as sf
9
 
10
+ # ------------------ DEBUG UTILITIES ------------------
11
+ def debug_print(*args, **kwargs):
12
+ if DEBUG_MODE:
13
+ print(*args, **kwargs)
14
 
15
+ # ------------------ Device Settings ------------------
16
  def get_settings():
17
 
18
+ device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ compute_type = "default"
 
 
 
 
 
 
20
 
21
  if DEBUG_MODE: print(f"[SETTINGS] Device: {device}")
22
 
23
  return device, compute_type
24
 
25
+ # ------------------ Audio Utilities ------------------
26
  def split_input_stereo_channels(audio_path):
27
 
28
  ext = os.path.splitext(audio_path)[1].lower()
 
39
  if len(channels) != 2:
40
  raise ValueError(f"[FORMAT AUDIO] Audio {audio_path} has {len(channels)} channels (instead of 2).")
41
 
42
+ channels[0].export(LEFT_CHANNEL_TEMP_PATH, format="wav")
43
+ channels[1].export(RIGHT_CHANNEL_TEMP_PATH, format="wav")
44
 
45
 
46
  def compute_type_to_audio_dtype(compute_type: str, device: str) -> np.dtype:
 
57
 
58
  return audio_np_dtype
59
 
 
60
  def format_audio(audio_path: str, compute_type: str, device: str) -> np.ndarray:
61
 
62
  input_audio, sample_rate = torchaudio.load(audio_path)
63
+
64
  if input_audio.shape[0] == 2:
65
  input_audio = torch.mean(input_audio, dim=0, keepdim=True)
66
 
 
77
  return input_audio
78
 
79
 
 
80
  def process_waveforms(device: str, compute_type: str):
81
 
82
  left_waveform = format_audio(LEFT_CHANNEL_TEMP_PATH, compute_type, device)
 
85
  return left_waveform, right_waveform
86
 
87
 
88
+ # ------------------ Post-processing ------------------
89
+ def get_segments(result, speaker_label):
 
90
 
91
+ segments = result
92
+ final_segments = [
93
+ (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
94
+ for seg in segments if seg.text
95
+ ]
96
 
97
+ return final_segments
98
 
99
+ def post_process_transcripts(left_result, right_result, civil_channel):
 
100
 
101
+ if civil_channel == "Left":
102
+ civil_segs = get_segments(left_result, "Civil")
103
+ operador_segs = get_segments(right_result, "Operador")
104
+ else:
105
+ civil_segs = get_segments(right_result, "Civil")
106
+ operador_segs = get_segments(left_result, "Operador")
107
 
108
+ merged_transcript = sorted(
109
+ operador_segs + civil_segs,
110
+ key=lambda x: float(x[0]) if x[0] is not None else float("inf")
111
+ )
112
+
113
+ clean_output_asr = ""
114
+ clean_output_meteo = ""
115
+ for start, end, speaker, text in merged_transcript:
116
+ clean_output_asr += f"[{speaker}]: {text}\n"
117
+ clean_output_meteo += f"{text}"
118
+ clean_output_asr = clean_output_asr.strip()
119
+ clean_output_meteo = clean_output_meteo.strip()
120
+
121
+ return clean_output_asr, clean_output_meteo
122
 
123
 
 
124
  def post_process_transcription(transcription, max_repeats=2):
125
 
126
  tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
 
173
 
174
  return merged_transcription.strip()
175
 
176
  def cleanup_temp_files(*file_paths):
177
 
178
  for path in file_paths:
179
  if path and os.path.exists(path):
 
180
  os.remove(path)
181
 
182
+ def sec_to_hhmmss(seconds):
183
+ h = int(seconds // 3600)
184
+ m = int((seconds % 3600) // 60)
185
+ s = int(seconds % 60)
186
+ return f"{h:02d}:{m:02d}:{s:02d}"
187
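
A small sketch of how the shared helpers behave, using stand-in segment objects that expose only the `.start`, `.end` and `.text` attributes read by `get_segments`; the utterances are invented:

```python
from collections import namedtuple

from audio_utils import post_process_transcripts, sec_to_hhmmss

# Stand-in for faster-whisper segments; only .start/.end/.text are used here.
Seg = namedtuple("Seg", ["start", "end", "text"])

left = [Seg(0.0, 2.1, "Hola, necessito ajuda.")]
right = [Seg(2.3, 4.0, "Emergències, digui'm.")]

# With civil_channel="Left" the left channel is labelled "Civil" and the right "Operador".
asr_text, meteo_text = post_process_transcripts(left, right, civil_channel="Left")
print(asr_text)

print(sec_to_hhmmss(3725))  # 3725 s -> "01:02:05"
```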
meteo_detector.py ADDED
@@ -0,0 +1,12 @@
+ def classify_meteo_event(text, model, threshold=0.0):
+     result = model(text, truncation=True, max_length=512)[0]
+ 
+     label = result[0]["label"]
+     score = result[0]["score"]
+ 
+     if label != "none" and round(score, 2) <= threshold:
+         label = "none"
+ 
+     event = label
+ 
+     return event
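
A sketch of how this classifier is meant to be driven, mirroring the `top_k=None` text-classification pipeline configured in `whisper_cs_fase_2.py`; the Spanish sentence is an invented example and the gated checkpoint may require an HF token:

```python
from transformers import pipeline

from meteo_detector import classify_meteo_event
from settings import MODEL_PATH_METEO

# top_k=None returns scores for every label, which matches the result[0][...]
# indexing in classify_meteo_event (the top-scoring label comes first).
meteo_model = pipeline(
    task="text-classification",
    model=MODEL_PATH_METEO,
    tokenizer=MODEL_PATH_METEO,
    top_k=None,
)

print(classify_meteo_event("Hay una inundación en la carretera principal.", meteo_model))
```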
requirements.txt CHANGED
@@ -1,6 +1,6 @@
  torch
  torchaudio
- transformers==4.55.0
+ transformers==4.40.2 #gated models
  ctranslate2==4.6.0
  faster_whisper==1.2.0
  hf_transfer==0.1.9
@@ -13,4 +13,4 @@ aina-gradio-theme==2.3
  spaces==0.39.0
  peft==0.11.1
  whisper_timestamped==1.15.8
- typing==3.7.4.3
+ typing==3.7.4.3
requirements_dev.txt DELETED
@@ -1,171 +0,0 @@
1
- accelerate==1.10.0
2
- aina-gradio-theme==2.3
3
- aiofiles==24.1.0
4
- aiohappyeyeballs==2.6.1
5
- aiohttp==3.12.15
6
- aiosignal==1.4.0
7
- alembic==1.16.4
8
- annotated-types==0.7.0
9
- antlr4-python3-runtime==4.9.3
10
- anyio==4.10.0
11
- asteroid-filterbanks==0.4.0
12
- async-timeout==5.0.1
13
- attrs==25.3.0
14
- audioread==3.0.1
15
- av==15.0.0
16
- Brotli==1.1.0
17
- certifi==2025.8.3
18
- cffi==1.17.1
19
- charset-normalizer==3.4.2
20
- click==8.2.1
21
- coloredlogs==15.0.1
22
- colorlog==6.9.0
23
- contourpy==1.3.2
24
- ctranslate2==4.6.0
25
- cycler==0.12.1
26
- Cython==3.1.2
27
- decorator==5.2.1
28
- docopt==0.6.2
29
- dtw-python==1.5.3
30
- einops==0.8.1
31
- exceptiongroup==1.3.0
32
- fastapi==0.116.1
33
- faster-whisper==1.2.0
34
- ffmpeg-python==0.2.0
35
- ffmpy==0.6.1
36
- filelock==3.18.0
37
- flatbuffers==25.2.10
38
- fonttools==4.59.0
39
- frozenlist==1.7.0
40
- fsspec==2025.7.0
41
- future==1.0.0
42
- gradio==5.41.1
43
- gradio_client==1.11.0
44
- greenlet==3.2.3
45
- groovy==0.1.2
46
- h11==0.16.0
47
- hf-xet==1.1.7
48
- hf_transfer==0.1.9
49
- httpcore==1.0.9
50
- httpx==0.28.1
51
- huggingface-hub==0.34.3
52
- humanfriendly==10.0
53
- HyperPyYAML==1.2.2
54
- idna==3.10
55
- Jinja2==3.1.6
56
- joblib==1.5.1
57
- julius==0.2.7
58
- kiwisolver==1.4.8
59
- lazy_loader==0.4
60
- librosa==0.10.1
61
- lightning==2.5.2
62
- lightning-utilities==0.15.2
63
- llvmlite==0.44.0
64
- Mako==1.3.10
65
- markdown-it-py==3.0.0
66
- MarkupSafe==3.0.2
67
- matplotlib==3.10.5
68
- mdurl==0.1.2
69
- more-itertools==10.7.0
70
- mpmath==1.3.0
71
- msgpack==1.1.1
72
- multidict==6.6.3
73
- networkx==3.4.2
74
- numba==0.61.2
75
- numpy==2.2.6
76
- nvidia-cublas-cu12==12.8.4.1
77
- nvidia-cuda-cupti-cu12==12.8.90
78
- nvidia-cuda-nvrtc-cu12==12.8.93
79
- nvidia-cuda-runtime-cu12==12.8.90
80
- nvidia-cudnn-cu12==9.10.2.21
81
- nvidia-cufft-cu12==11.3.3.83
82
- nvidia-cufile-cu12==1.13.1.3
83
- nvidia-curand-cu12==10.3.9.90
84
- nvidia-cusolver-cu12==11.7.3.90
85
- nvidia-cusparse-cu12==12.5.8.93
86
- nvidia-cusparselt-cu12==0.7.1
87
- nvidia-nccl-cu12==2.27.3
88
- nvidia-nvjitlink-cu12==12.8.93
89
- nvidia-nvtx-cu12==12.8.90
90
- omegaconf==2.3.0
91
- onnxruntime==1.22.1
92
- openai-whisper==20250625
93
- optuna==4.4.0
94
- orjson==3.11.1
95
- packaging==25.0
96
- pandas==2.3.1
97
- peft==0.11.1
98
- pillow==11.3.0
99
- platformdirs==4.3.8
100
- pooch==1.8.2
101
- primePy==1.3
102
- propcache==0.3.2
103
- protobuf==6.31.1
104
- psutil==5.9.8
105
- pyannote.audio==3.3.2
106
- pyannote.core==5.0.0
107
- pyannote.database==5.1.3
108
- pyannote.metrics==3.2.1
109
- pyannote.pipeline==3.0.1
110
- pycparser==2.22
111
- pydantic==2.11.7
112
- pydantic_core==2.33.2
113
- pydub==0.25.1
114
- Pygments==2.19.2
115
- pyparsing==3.2.3
116
- python-dateutil==2.9.0.post0
117
- python-multipart==0.0.20
118
- pytorch-lightning==2.5.2
119
- pytorch-metric-learning==2.8.1
120
- pytz==2025.2
121
- PyYAML==6.0.2
122
- regex==2025.7.34
123
- requests==2.32.4
124
- rich==14.1.0
125
- ruamel.yaml==0.18.14
126
- ruamel.yaml.clib==0.2.12
127
- ruff==0.12.7
128
- safehttpx==0.1.6
129
- safetensors==0.6.1
130
- scikit-learn==1.7.1
131
- scipy==1.15.3
132
- semantic-version==2.10.0
133
- semver==3.0.4
134
- sentencepiece==0.2.0
135
- shellingham==1.5.4
136
- six==1.17.0
137
- sniffio==1.3.1
138
- sortedcontainers==2.4.0
139
- soundfile==0.13.1
140
- soxr==0.5.0.post1
141
- spaces==0.39.0
142
- speechbrain==1.0.3
143
- SQLAlchemy==2.0.42
144
- starlette==0.47.2
145
- sympy==1.14.0
146
- tabulate==0.9.0
147
- tensorboardX==2.6.4
148
- threadpoolctl==3.6.0
149
- tiktoken==0.10.0
150
- tokenizers==0.21.4
151
- tomli==2.2.1
152
- tomlkit==0.13.3
153
- torch==2.8.0
154
- torch-audiomentations==0.12.0
155
- torch_pitch_shift==1.2.5
156
- torchaudio==2.8.0
157
- torchmetrics==1.8.0
158
- tqdm==4.67.1
159
- transformers==4.55.0
160
- triton==3.4.0
161
- typer==0.16.0
162
- typing==3.7.4.3
163
- typing-inspection==0.4.1
164
- typing_extensions==4.14.1
165
- tzdata==2025.2
166
- urllib3==2.5.0
167
- uvicorn==0.35.0
168
- websockets==15.0.1
169
- whisper-timestamped==1.15.8
170
- yarl==1.20.1
171
- yt-dlp==2025.7.21
settings.py CHANGED
@@ -1,8 +1,13 @@
  DEBUG_MODE = True
  MODEL_PATH_V1 = "projecte-aina/whisper-large-v3-tiny-caesar"
  MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
+ MODEL_PATH_AGE_GENDER = "tiantiaf/wavlm-large-age-sex"
+ MODEL_PATH_METEO = "jayebaku/XLMRoberta-twitter-crexdata-flood-wildfire-detector"
  LEFT_CHANNEL_TEMP_PATH = "temp_mono_speaker2.wav"
  RIGHT_CHANNEL_TEMP_PATH = "temp_mono_speaker1.wav"
  RESAMPLING_FREQ = 16000
+ ORIGINAL_FREQ = 8000
+ MIN_SIL_DURATION = 3.0
+ SIL_THRESHOLD = -35
  BATCH_SIZE = 1
  TASK = "transcribe"
shout_detector.py ADDED
@@ -0,0 +1,148 @@
1
+ import librosa
2
+ from scipy.signal import butter, sosfilt
3
+ import numpy as np
4
+ from settings import DEBUG_MODE, RESAMPLING_FREQ
5
+ from audio_utils import sec_to_hhmmss
6
+
7
+ def bandpass_filter(audio_path, RESAMPLING_FREQ, low=300, high=3400):
8
+ sos = butter(4, [low / (RESAMPLING_FREQ / 2), high / (RESAMPLING_FREQ / 2)], btype="band", output="sos")
9
+ return sosfilt(sos, audio_path)
10
+
11
+
12
+ def extract_features(audio_path, RESAMPLING_FREQ, frame=0.05):
13
+ hop = int(RESAMPLING_FREQ * frame)
14
+ rms = librosa.feature.rms(y=audio_path, hop_length=hop)[0]
15
+ flux = librosa.onset.onset_strength(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop)
16
+ rolloff = librosa.feature.spectral_rolloff(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop)[0]
17
+ harmonic = librosa.effects.harmonic(audio_path)
18
+ percussive = audio_path - harmonic
19
+ hnr = librosa.feature.rms(y=harmonic, hop_length=hop)[0] / (librosa.feature.rms(y=percussive, hop_length=hop)[0] + 1e-6)
20
+
21
+ times = librosa.frames_to_time(np.arange(len(rms)), sr=RESAMPLING_FREQ, hop_length=hop)
22
+ return rms, flux, rolloff, hnr, times
23
+
24
+ def compute_intensity(rms, flux, rolloff, hnr):
25
+ rms_w, flux_w, roll_w, hnr_w = 3.0, 1.3, 1.0, 0.8
26
+
27
+ r = (rms - np.mean(rms[:30])) / (np.std(rms[:30]) + 1e-5)
28
+ f = flux / (np.percentile(flux, 90) + 1e-6)
29
+ ro = rolloff / np.max(rolloff)
30
+ hn = hnr / np.max(hnr)
31
+
32
+ intensity = (
33
+ rms_w * np.clip(r, 0, None)
34
+ + flux_w * f
35
+ + roll_w * ro
36
+ + hnr_w * (1 - hn)
37
+ )
38
+
39
+ intensity = np.maximum(intensity, 0)
40
+ intensity = librosa.util.normalize(intensity)
41
+ return intensity
42
+
43
+
44
+ def segment_intensity(times, intensity, thr=0.25):
45
+ ema_alpha = 0.45
46
+ hangover = int(0.15 / (times[1] - times[0]))
47
+
48
+ smooth = np.copy(intensity)
49
+ for i in range(1, len(intensity)):
50
+ smooth[i] = ema_alpha * intensity[i] + (1 - ema_alpha) * smooth[i - 1]
51
+
52
+ on_thr, off_thr = thr, thr * 0.6
53
+ active = False
54
+ counter = 0
55
+ events = []
56
+ start = None
57
+
58
+ for i, val in enumerate(smooth):
59
+ if not active and val >= on_thr:
60
+ active = True
61
+ start = times[i]
62
+
63
+ if active and val >= off_thr:
64
+ counter = hangover
65
+ elif active:
66
+ counter -= 1
67
+ if counter <= 0:
68
+ active = False
69
+ events.append((start, times[i]))
70
+ start = None
71
+
72
+ if active and start is not None:
73
+ events.append((start, times[-1]))
74
+ return events, smooth
75
+
76
+
77
+ def assign_levels(events, intensity, times):
78
+ results = []
79
+ for st, en in events:
80
+ mask = (times >= st) & (times <= en)
81
+ if np.sum(mask) == 0:
82
+ continue
83
+
84
+ med = np.median(intensity[mask])
85
+ max_val = np.max(intensity[mask])
86
+
87
+ if med > 0.8:
88
+ lvl = "4 gritando"
89
+ elif med > 0.6:
90
+ lvl = "3 elevado"
91
+ elif med > 0.4:
92
+ lvl = "2 intermedio"
93
+ else:
94
+ lvl = "1 bajo"
95
+
96
+ results.append((st, en, lvl, med, max_val))
97
+ return results
98
+
99
+ def merge_adjacent_segments(results, gap_threshold=0.3):
100
+
101
+ if not results:
102
+ return []
103
+
104
+ merged = []
105
+ cur_st, cur_en, cur_lvl, cur_med, cur_max = results[0]
106
+
107
+ for st, en, lvl, med, mx in results[1:]:
108
+ if lvl == cur_lvl and st - cur_en <= gap_threshold:
109
+ cur_en = en
110
+ cur_med = (cur_med + med) / 2
111
+ cur_max = max(cur_max, mx)
112
+ else:
113
+ merged.append((cur_st, cur_en, cur_lvl, cur_med, cur_max))
114
+ cur_st, cur_en, cur_lvl, cur_med, cur_max = st, en, lvl, med, mx
115
+
116
+ merged.append((cur_st, cur_en, cur_lvl, cur_med, cur_max))
117
+ return merged
118
+
119
+
120
+ def shout(audio_path):
121
+
122
+ if DEBUG_MODE:
123
+ print(f"[MODEL LOADING] Loading shout model")
124
+
125
+ y, sr = librosa.load(audio_path, sr=RESAMPLING_FREQ, mono=True)
126
+ y = bandpass_filter(y, sr)
127
+
128
+ rms, flux, rolloff, hnr, times = extract_features(y, sr)
129
+ intensity = compute_intensity(rms, flux, rolloff, hnr)
130
+ events, _ = segment_intensity(times, intensity, thr=0.18)
131
+ results = assign_levels(events, intensity, times)
132
+ results = merge_adjacent_segments(results, gap_threshold=1)
133
+
134
+ results = [
135
+ (st, en, lvl, med, max_val)
136
+ for st, en, lvl, med, max_val in results
137
+ if "elevado" in lvl or "gritando" in lvl
138
+
139
+ ]
140
+ formatted = []
141
+ for st, en, lvl, med, max_val in results:
142
+ formatted.append(f"{sec_to_hhmmss(st)} – {sec_to_hhmmss(en)} | volumen de voz: {lvl}")
143
+
144
+ if not formatted:
145
+ return "No se detectaron gritos o voces elevadas"
146
+
147
+ return "\n".join(formatted)
148
+
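
The shout detector is file-based, so it can be tried directly on an exported mono channel; a minimal sketch with a placeholder path:

```python
from shout_detector import shout

# In the demo this is LEFT_CHANNEL_TEMP_PATH or RIGHT_CHANNEL_TEMP_PATH, i.e. the
# mono export of the caller's channel; here the path is just a placeholder.
print(shout("civil_channel.wav"))
```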
silence_detector.py ADDED
@@ -0,0 +1,44 @@
+ import librosa
+ import numpy as np
+ from settings import DEBUG_MODE, RESAMPLING_FREQ, ORIGINAL_FREQ, MIN_SIL_DURATION, SIL_THRESHOLD
+ from audio_utils import sec_to_hhmmss
+ 
+ def silence(audio_path):
+ 
+     if DEBUG_MODE:
+         print(f"[MODEL LOADING] Loading silence model")
+ 
+     y, sr = librosa.load(audio_path, sr=ORIGINAL_FREQ, mono=True) #merging stereo2mono
+     y = librosa.resample(y, orig_sr=ORIGINAL_FREQ, target_sr=RESAMPLING_FREQ)
+     y = y / np.max(np.abs(y))
+ 
+     frame_length = int(0.1 * RESAMPLING_FREQ)
+     hop_length = frame_length
+     rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
+     rms_db = librosa.amplitude_to_db(rms, ref=np.max)
+ 
+     silence_mask = rms_db < SIL_THRESHOLD
+     frame_duration = hop_length / RESAMPLING_FREQ
+ 
+     silence_segments = []
+     start = None
+     for i, silent in enumerate(silence_mask):
+         if silent and start is None:
+             start = i * frame_duration
+         elif not silent and start is not None:
+             end = i * frame_duration
+             if end - start >= MIN_SIL_DURATION:
+                 silence_segments.append((start, end))
+             start = None
+     if start is not None:
+         end = len(silence_mask) * frame_duration
+         if end - start >= MIN_SIL_DURATION:
+             silence_segments.append((start, end))
+ 
+     if silence_segments:
+         events = [f"{sec_to_hhmmss(s)} – {sec_to_hhmmss(e)}" for s, e in silence_segments]
+         event = "Silencios detectados en: " + ", ".join(events)
+     else:
+         event = "No se detectaron silencios prolongados"
+ 
+     return event
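
And the matching sketch for the silence detector, which reports pauses longer than `MIN_SIL_DURATION` seconds that stay below the `SIL_THRESHOLD` dB floor from `settings.py` (placeholder path again):

```python
from silence_detector import silence
from settings import MIN_SIL_DURATION, SIL_THRESHOLD

print(f"Reporting pauses > {MIN_SIL_DURATION}s below {SIL_THRESHOLD} dB relative to peak")
print(silence("stereo_call.wav"))
```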
whisper_cs.py DELETED
@@ -1,382 +0,0 @@
1
- import spaces
2
- from pydub import AudioSegment
3
- import os
4
- import torchaudio
5
- import torch
6
- import re
7
- import whisper_timestamped as whisper_ts
8
- from typing import Dict
9
- from faster_whisper import WhisperModel
10
-
11
- device = 0 if torch.cuda.is_available() else "cpu"
12
- torch_dtype = torch.float32
13
-
14
- DEBUG_MODE = True
15
- MODEL_PATH_V2 = "langtech-veu/whisper-timestamped-cs"
16
- MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
17
- #DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
- #print("[INFO] CUDA available:", torch.cuda.is_available())
19
-
20
- def clean_text(input_text):
21
- remove_chars = ['.', ',', ';', ':', '¿', '?', '«', '»', '-', '¡', '!', '@',
22
- '*', '{', '}', '[', ']', '=', '/', '\\', '&', '#', '…']
23
- output_text = ''.join(char if char not in remove_chars else ' ' for char in input_text)
24
- return ' '.join(output_text.split()).lower()
25
-
26
-
27
- def split_stereo_channels(audio_path):
28
- ext = os.path.splitext(audio_path)[1].lower()
29
-
30
- if ext == ".wav":
31
- audio = AudioSegment.from_wav(audio_path)
32
- elif ext == ".mp3":
33
- audio = AudioSegment.from_file(audio_path, format="mp3")
34
- else:
35
- raise ValueError(f"Unsupported file format: {audio_path}")
36
-
37
- channels = audio.split_to_mono()
38
- if len(channels) != 2:
39
- raise ValueError(f"Audio {audio_path} does not have 2 channels.")
40
-
41
- channels[0].export(f"temp_mono_speaker1.wav", format="wav") # Right
42
- channels[1].export(f"temp_mono_speaker2.wav", format="wav") # Left
43
-
44
-
45
- def format_audio(audio_path):
46
- input_audio, sample_rate = torchaudio.load(audio_path)
47
- if input_audio.shape[0] == 2:
48
- input_audio = torch.mean(input_audio, dim=0, keepdim=True)
49
- resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
50
- input_audio = resampler(input_audio)
51
- return input_audio.squeeze(), 16000
52
-
53
- def post_process_transcription(transcription, max_repeats=2):
54
- tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
55
-
56
- cleaned_tokens = []
57
- repetition_count = 0
58
- previous_token = None
59
-
60
- for token in tokens:
61
- reduced_token = re.sub(r"(\w{1,3})(\1{2,})", "", token)
62
-
63
- if reduced_token == previous_token:
64
- repetition_count += 1
65
- if repetition_count <= max_repeats:
66
- cleaned_tokens.append(reduced_token)
67
- else:
68
- repetition_count = 1
69
- cleaned_tokens.append(reduced_token)
70
-
71
- previous_token = reduced_token
72
-
73
- cleaned_transcription = " ".join(cleaned_tokens)
74
- cleaned_transcription = re.sub(r'\s+', ' ', cleaned_transcription).strip()
75
-
76
- return cleaned_transcription
77
-
78
-
79
- def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
80
- segments = re.split(r'(\[SPEAKER_\d{2}\])', transcription_text)
81
- merged_transcription = ''
82
- current_speaker = None
83
- current_segment = []
84
-
85
- for i in range(1, len(segments) - 1, 2):
86
- speaker_tag = segments[i]
87
- text = segments[i + 1].strip()
88
-
89
- speaker = re.search(r'\d{2}', speaker_tag).group()
90
-
91
- if speaker == current_speaker:
92
- current_segment.append(text)
93
- else:
94
- if current_speaker is not None:
95
- merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
96
- current_speaker = speaker
97
- current_segment = [text]
98
-
99
- if current_speaker is not None:
100
- merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
101
-
102
- return merged_transcription.strip()
103
-
104
- def cleanup_temp_files(*file_paths):
105
-
106
- if DEBUG_MODE: print(f"Entered cleanup_temp_files function...")
107
-
108
- if DEBUG_MODE: print(f"file_paths: {file_paths}")
109
-
110
- for path in file_paths:
111
- if path and os.path.exists(path):
112
- if DEBUG_MODE: print(f"Removing path: {path}")
113
- os.remove(path)
114
-
115
- if DEBUG_MODE: print(f"Exited cleanup_temp_files function.")
116
-
117
- '''
118
- try:
119
- faster_model = WhisperModel(
120
- MODEL_PATH_V2_FAST,
121
- device="cuda" if torch.cuda.is_available() else "cpu",
122
- compute_type="float16" if torch.cuda.is_available() else "int8"
123
- )
124
- except RuntimeError as e:
125
- print(f"[WARNING] Failed to load model on GPU: {e}")
126
- faster_model = WhisperModel(
127
- MODEL_PATH_V2_FAST,
128
- device="cpu",
129
- compute_type="int8"
130
- )
131
- '''
132
-
133
- #faster_model = WhisperModel(MODEL_PATH_V2_FAST, device=DEVICE, compute_type="int8")
134
-
135
- def load_whisper_model(model_path: str):
136
- device = "cuda" if torch.cuda.is_available() else "cpu"
137
- model = whisper_ts.load_model(model_path, device=device)
138
- return model
139
-
140
- def transcribe_audio(model, audio_path: str) -> Dict:
141
- try:
142
- result = whisper_ts.transcribe(
143
- model,
144
- audio_path,
145
- beam_size=5,
146
- best_of=5,
147
- temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
148
- vad=False,
149
- detect_disfluencies=True,
150
- )
151
-
152
- words = []
153
- for segment in result.get('segments', []):
154
- for word in segment.get('words', []):
155
- word_text = word.get('word', '').strip()
156
- if word_text.startswith(' '):
157
- word_text = word_text[1:]
158
-
159
- words.append({
160
- 'word': word_text,
161
- 'start': word.get('start', 0),
162
- 'end': word.get('end', 0),
163
- 'confidence': word.get('confidence', 0)
164
- })
165
-
166
- return {
167
- 'audio_path': audio_path,
168
- 'text': result['text'].strip(),
169
- 'segments': result.get('segments', []),
170
- 'words': words,
171
- 'duration': result.get('duration', 0),
172
- 'success': True
173
- }
174
-
175
- except Exception as e:
176
- return {
177
- 'audio_path': audio_path,
178
- 'error': str(e),
179
- 'success': False
180
- }
181
-
182
-
183
-
184
- def generate(audio_path, use_v2_fast):
185
- if DEBUG_MODE: print(f"Entering generate function...")
186
- if DEBUG_MODE: print(f"use_v2_fast: {use_v2_fast}")
187
-
188
- faster_model = None
189
-
190
- if use_v2_fast:
191
- if torch.cuda.is_available():
192
- try:
193
- if DEBUG_MODE: print("[INFO] GPU detected. Loading model on GPU with float16...")
194
- faster_model = WhisperModel(
195
- MODEL_PATH_V2_FAST,
196
- device="cuda",
197
- compute_type="float16"
198
- )
199
- except RuntimeError as e:
200
- print(f"[WARNING] Failed to load model on GPU: {e}")
201
- if DEBUG_MODE: print("[INFO] Falling back to CPU with int8...")
202
- faster_model = WhisperModel(
203
- MODEL_PATH_V2_FAST,
204
- device="cpu",
205
- compute_type="int8"
206
- )
207
- else:
208
- if DEBUG_MODE: print("[INFO] No GPU detected. Loading model on CPU with int8...")
209
- faster_model = WhisperModel(
210
- MODEL_PATH_V2_FAST,
211
- device="cpu",
212
- compute_type="int8"
213
- )
214
- split_stereo_channels(audio_path)
215
- left_channel_path = "temp_mono_speaker2.wav"
216
- right_channel_path = "temp_mono_speaker1.wav"
217
-
218
- left_waveform, _ = format_audio(left_channel_path)
219
- right_waveform, _ = format_audio(right_channel_path)
220
-
221
- left_waveform = left_waveform.numpy().astype("float32")
222
- right_waveform = right_waveform.numpy().astype("float32")
223
-
224
- left_result, _ = faster_model.transcribe(left_waveform, beam_size=5, task="transcribe")
225
- right_result, _ = faster_model.transcribe(right_waveform, beam_size=5, task="transcribe")
226
-
227
- left_result = list(left_result)
228
- right_result = list(right_result)
229
-
230
- def get_faster_segments(segments, speaker_label):
231
- return [
232
- (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
233
- for seg in segments if seg.text
234
- ]
235
-
236
- left_segs = get_faster_segments(left_result, "Speaker 1")
237
- right_segs = get_faster_segments(right_result, "Speaker 2")
238
-
239
- merged_transcript = sorted(
240
- left_segs + right_segs,
241
- key=lambda x: float(x[0]) if x[0] is not None else float("inf")
242
- )
243
-
244
- clean_output = ""
245
- for start, end, speaker, text in merged_transcript:
246
- clean_output += f"[{speaker}]: {text}\n"
247
-
248
- if DEBUG_MODE: print(f"clean_output: {clean_output}")
249
-
250
- else:
251
- model = load_whisper_model(MODEL_PATH_V2)
252
- split_stereo_channels(audio_path)
253
- left_channel_path = "temp_mono_speaker2.wav"
254
- right_channel_path = "temp_mono_speaker1.wav"
255
-
256
- left_waveform, _ = format_audio(left_channel_path)
257
- right_waveform, _ = format_audio(right_channel_path)
258
-
259
- left_result = transcribe_audio(model, left_waveform)
260
- right_result = transcribe_audio(model, right_waveform)
261
-
262
- def get_segments(result, speaker_label):
263
- segments = result.get("segments", [])
264
- if not segments:
265
- return []
266
- return [
267
- (seg.get("start", 0.0), seg.get("end", 0.0), speaker_label,
268
- post_process_transcription(seg.get("text", "").strip()))
269
- for seg in segments if seg.get("text")
270
- ]
271
-
272
- left_segs = get_segments(left_result, "Speaker 1")
273
- right_segs = get_segments(right_result, "Speaker 2")
274
-
275
- merged_transcript = sorted(
276
- left_segs + right_segs,
277
- key=lambda x: float(x[0]) if x[0] is not None else float("inf")
278
- )
279
-
280
- clean_output = ""
281
- for start, end, speaker, text in merged_transcript:
282
- clean_output += f"[{speaker}]: {text}\n"
283
-
284
- cleanup_temp_files("temp_mono_speaker1.wav", "temp_mono_speaker2.wav")
285
-
286
- if DEBUG_MODE: print(f"Exiting generate function...")
287
- return clean_output.strip()
288
-
289
-
290
- '''
291
- def generate(audio_path, use_v2_fast):
292
-
293
- if DEBUG_MODE: print(f"Entering generate function...")
294
- if DEBUG_MODE: print(f"use_v2_fast: {use_v2_fast}")
295
-
296
- if use_v2_fast:
297
- split_stereo_channels(audio_path)
298
- left_channel_path = "temp_mono_speaker2.wav"
299
- right_channel_path = "temp_mono_speaker1.wav"
300
-
301
- left_waveform, left_sr = format_audio(left_channel_path)
302
- right_waveform, right_sr = format_audio(right_channel_path)
303
-
304
- left_waveform = left_waveform.numpy().astype("float32")
305
- right_waveform = right_waveform.numpy().astype("float32")
306
-
307
- left_result, info = faster_model.transcribe(left_waveform, beam_size=5, task="transcribe")
308
- right_result, info = faster_model.transcribe(right_waveform, beam_size=5, task="transcribe")
309
-
310
- left_result = list(left_result)
311
- right_result = list(right_result)
312
-
313
- def get_faster_segments(segments, speaker_label):
314
- return [
315
- (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
316
- for seg in segments if seg.text
317
- ]
318
-
319
- left_segs = get_faster_segments(left_result, "Speaker 1")
320
- right_segs = get_faster_segments(right_result, "Speaker 2")
321
-
322
- merged_transcript = sorted(
323
- left_segs + right_segs,
324
- key=lambda x: float(x[0]) if x[0] is not None else float("inf")
325
- )
326
-
327
- clean_output = ""
328
- for start, end, speaker, text in merged_transcript:
329
- clean_output += f"[{speaker}]: {text}\n"
330
-
331
- # FIX Seems that post_merge_consecutive_segments_from_text returns an empty string
332
- #clean_output = post_merge_consecutive_segments_from_text(clean_output)
333
- #print('clean_output',clean_output)
334
-
335
- if DEBUG_MODE: print(f"clean_output: {clean_output}")
336
-
337
- else:
338
- model = load_whisper_model(MODEL_PATH_V2)
339
- split_stereo_channels(audio_path)
340
-
341
- left_channel_path = "temp_mono_speaker2.wav"
342
- right_channel_path = "temp_mono_speaker1.wav"
343
-
344
- left_waveform, left_sr = format_audio(left_channel_path)
345
- right_waveform, right_sr = format_audio(right_channel_path)
346
- left_result = transcribe_audio(model, left_waveform)
347
- right_result = transcribe_audio(model, right_waveform)
348
-
349
- def get_segments(result, speaker_label):
350
- segments = result.get("segments", [])
351
- if not segments:
352
- return []
353
- return [
354
- (seg.get("start", 0.0), seg.get("end", 0.0), speaker_label, post_process_transcription(seg.get("text", "").strip()))
355
- for seg in segments if seg.get("text")
356
- ]
357
-
358
- left_segs = get_segments(left_result, "Speaker 1")
359
- right_segs = get_segments(right_result, "Speaker 2")
360
-
361
- merged_transcript = sorted(
362
- left_segs + right_segs,
363
- key=lambda x: float(x[0]) if x[0] is not None else float("inf")
364
- )
365
-
366
- output = ""
367
- for start, end, speaker, text in merged_transcript:
368
- output += f"[{speaker}]: {text}\n"
369
-
370
- clean_output = output.strip()
371
-
372
- if DEBUG_MODE: print(f"Clean output generated.")
373
-
374
- cleanup_temp_files(
375
- "temp_mono_speaker1.wav",
376
- "temp_mono_speaker2.wav"
377
- )
378
-
379
- if DEBUG_MODE: print(f"Exiting generate function...")
380
-
381
- return clean_output
382
- '''
whisper_cs_fase_1.py ADDED
@@ -0,0 +1,75 @@
1
+ from faster_whisper import WhisperModel
2
+ from transformers import pipeline
3
+ import os
4
+ import time
5
+ from settings import MODEL_PATH_V2_FAST, MODEL_PATH_V1, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH, BATCH_SIZE, TASK
6
+ from audio_utils import debug_print, get_settings, split_input_stereo_channels, format_audio, process_waveforms, post_process_transcripts, post_process_transcription, post_merge_consecutive_segments_from_text, cleanup_temp_files
7
+
8
+ hf_token = os.getenv("HF_TOKEN")
9
+
10
+ ASR_MODEL_V2 = None
11
+ ASR_MODEL_V1 = None
12
+
13
+ def get_asr_model_v2(DEVICE, COMPUTE_TYPE):
14
+ global ASR_MODEL_V2
15
+ if ASR_MODEL_V2 is None:
16
+ debug_print("[MODEL LOADING] Loading ASR v2_fast model...")
17
+ ASR_MODEL_V2 = WhisperModel(
18
+ MODEL_PATH_V2_FAST,
19
+ device=DEVICE,
20
+ compute_type=COMPUTE_TYPE
21
+ )
22
+ debug_print("[MODEL LOADING]v2_fast model loaded")
23
+ return ASR_MODEL_V2
24
+
25
+ def get_asr_model_v1(DEVICE):
26
+ global ASR_MODEL_V1
27
+ if ASR_MODEL_V1 is None:
28
+ debug_print("[MODEL LOADING]Loading ASR v1 pipeline model...")
29
+ ASR_MODEL_V1 = pipeline(
30
+ task="automatic-speech-recognition",
31
+ model=MODEL_PATH_V1,
32
+ chunk_length_s=30,
33
+ device=0 if DEVICE == "cuda" else -1,
34
+ token=hf_token
35
+ )
36
+ debug_print("[MODEL LOADING]ASR v1 model loaded")
37
+ return ASR_MODEL_V1
38
+
39
+ def transcribe_asr(audio, model):
40
+ text = model(audio, batch_size=BATCH_SIZE, generate_kwargs={"task": TASK}, return_timestamps=True)["text"]
41
+ return text
42
+
43
+ def transcribe_faster_asr(left_waveform, right_waveform, model):
44
+ left_result, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
45
+ right_result, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")
46
+ return list(left_result), list(right_result)
47
+
48
+ def generate_fase_1(audio_path, model_version, civil_channel):
49
+ DEVICE, COMPUTE_TYPE = get_settings()
50
+
51
+ debug_print(f"[Fase1] Starting inference with model version: {model_version}")
52
+
53
+ if model_version == "v2_fast":
54
+ asr_model = get_asr_model_v2(DEVICE, COMPUTE_TYPE)
55
+ actual_compute_type = asr_model.model.compute_type
56
+ debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")
57
+
58
+ split_input_stereo_channels(audio_path)
59
+ left_waveform, right_waveform = process_waveforms(DEVICE, actual_compute_type)
60
+
61
+ debug_print(f"[SETTINGS] Civil channel: {civil_channel}")
62
+ left_result, right_result = transcribe_faster_asr(left_waveform, right_waveform, asr_model)
63
+
64
+ text, _ = post_process_transcripts(left_result, right_result, civil_channel)
65
+ cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)
66
+ else:
67
+ actual_compute_type = "float32" # HF pipeline safe default
68
+ debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")
69
+
70
+ asr_model = get_asr_model_v1(DEVICE)
71
+ audio = format_audio(audio_path, actual_compute_type, DEVICE)
72
+ result = transcribe_asr(audio, asr_model)
73
+ text = post_process_transcription(result)
74
+
75
+ return text
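
A minimal sketch of calling the Fase 1 entry point directly; the path is a placeholder, `v2_fast` expects a stereo recording, and the `v1.0` branch needs `HF_TOKEN` set for the gated pipeline model:

```python
from whisper_cs_fase_1 import generate_fase_1

AUDIO = "stereo_call.wav"  # placeholder 2-channel recording

# Channel-separated fast path (faster-whisper, Civil/Operador labelling).
print(generate_fase_1(AUDIO, model_version="v2_fast", civil_channel="Left"))

# Single-pass HF pipeline path (no channel separation).
print(generate_fase_1(AUDIO, model_version="v1.0", civil_channel="Left"))
```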
whisper_cs_fase_2.py ADDED
@@ -0,0 +1,89 @@
1
+ from faster_whisper import WhisperModel
2
+ from transformers import pipeline
3
+ import os
4
+ from settings import MODEL_PATH_AGE_GENDER, MODEL_PATH_METEO, MODEL_PATH_V2_FAST, LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH
5
+ from audio_utils import debug_print, get_settings, split_input_stereo_channels, process_waveforms, post_process_transcripts, post_merge_consecutive_segments_from_text, cleanup_temp_files
6
+ from shout_detector import shout
7
+ from silence_detector import silence
8
+ from meteo_detector import classify_meteo_event
9
+ from age_gender_detector import age_gender, WavLMWrapper
10
+
11
+ hf_token = os.getenv("HF_AUTH_TOKEN")
12
+
13
+ ASR_MODEL = None
14
+ AGE_GENDER_MODEL = None
15
+ METEO_MODEL = None
16
+
17
+ def get_asr_model(DEVICE, COMPUTE_TYPE):
18
+ global ASR_MODEL
19
+ if ASR_MODEL is None:
20
+ debug_print("[MODEL LOADING]Loading ASR model...")
21
+ ASR_MODEL = WhisperModel(
22
+ MODEL_PATH_V2_FAST,
23
+ device=DEVICE,
24
+ compute_type=COMPUTE_TYPE
25
+ )
26
+ debug_print("[MODEL LOADING]ASR model loaded")
27
+ return ASR_MODEL
28
+
29
+ def get_age_gender_model(DEVICE):
30
+ global AGE_GENDER_MODEL
31
+ if AGE_GENDER_MODEL is None:
32
+ debug_print("[MODEL LOADING]Loading Age/Gender model...")
33
+ AGE_GENDER_MODEL = WavLMWrapper.from_pretrained(MODEL_PATH_AGE_GENDER).to(DEVICE)
34
+ AGE_GENDER_MODEL.eval()
35
+ debug_print("[MODEL LOADING]Age/Gender model loaded")
36
+ return AGE_GENDER_MODEL
37
+
38
+ def get_meteo_model(DEVICE):
39
+ global METEO_MODEL
40
+ if METEO_MODEL is None:
41
+ debug_print("[MODEL LOADING]Loading Meteo model...")
42
+ METEO_MODEL = pipeline(
43
+ task="text-classification",
44
+ model=MODEL_PATH_METEO,
45
+ tokenizer=MODEL_PATH_METEO,
46
+ top_k=None,
47
+ device=0 if DEVICE == "cuda" else -1,
48
+ token=hf_token
49
+ )
50
+ debug_print("[MODEL LOADING]Meteo model loaded")
51
+ return METEO_MODEL
52
+
53
+ def transcribe_faster_asr(left_waveform, right_waveform, model):
54
+ left_result, _ = model.transcribe(left_waveform, beam_size=5, task="transcribe")
55
+ right_result, _ = model.transcribe(right_waveform, beam_size=5, task="transcribe")
56
+ return list(left_result), list(right_result)
57
+
58
+ def generate_fase_2(audio_path, model_version, civil_channel):
59
+
60
+ DEVICE, COMPUTE_TYPE = get_settings()
61
+
62
+ asr_model = get_asr_model(DEVICE, COMPUTE_TYPE)
63
+ age_gender_model = get_age_gender_model(DEVICE)
64
+ meteo_model = get_meteo_model(DEVICE)
65
+
66
+ actual_compute_type = asr_model.model.compute_type
67
+ debug_print(f"[SETTINGS] Device: {DEVICE}, Compute type: {actual_compute_type}")
68
+
69
+ split_input_stereo_channels(audio_path)
70
+ left_waveform, right_waveform = process_waveforms(DEVICE, actual_compute_type)
71
+
72
+ debug_print(f"[SETTINGS] Civil channel: {civil_channel}")
73
+ left_result, right_result = transcribe_faster_asr(left_waveform, right_waveform, asr_model)
74
+
75
+ silence_event = silence(audio_path)
76
+ civil_waveform = left_waveform if civil_channel == "Left" else right_waveform
77
+ civil_path = LEFT_CHANNEL_TEMP_PATH if civil_channel == "Left" else RIGHT_CHANNEL_TEMP_PATH
78
+ shout_event = shout(civil_path)
79
+ age, sex, age_group = age_gender(civil_waveform, age_gender_model, DEVICE)
80
+ age = f"{age_group} (aprox. {age} años)"
81
+
82
+ clean_output_asr, clean_output_meteo = post_process_transcripts(left_result, right_result, civil_channel)
83
+ text = '\n' + clean_output_asr
84
+
85
+ meteo_event = classify_meteo_event(clean_output_meteo, meteo_model, threshold=0.0)
86
+
87
+ cleanup_temp_files(LEFT_CHANNEL_TEMP_PATH, RIGHT_CHANNEL_TEMP_PATH)
88
+
89
+ return text, sex, age, silence_event, shout_event, meteo_event
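
Finally, a sketch of the full Fase 2 pipeline invoked outside Gradio (placeholder path; `HF_AUTH_TOKEN` may be needed for the gated meteo model). The six return values are, in order: transcript, gender, age, silence events, shouting events and the meteorological label:

```python
from whisper_cs_fase_2 import generate_fase_2

text, sex, age, silence_event, shout_event, meteo_event = generate_fase_2(
    audio_path="stereo_call.wav",  # placeholder 2-channel recording
    model_version="v2_fast_and_detection_v1",
    civil_channel="Left",
)

print(text)
print(f"{sex} | {age}")
print(silence_event)
print(shout_event)
print(meteo_event)
```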