# asr-inference / shout_detector.py
# Author: Sarah Solito
# Fase_1 and Fase_2 releases, code cleaned (commit d6fb6a2)
import librosa
from scipy.signal import butter, sosfilt
import numpy as np
from settings import DEBUG_MODE, RESAMPLING_FREQ
from audio_utils import sec_to_hhmmss
def bandpass_filter(audio_path, RESAMPLING_FREQ, low=300, high=3400):
    """Apply a 4th-order Butterworth band-pass filter to an audio signal.

    The default 300-3400 Hz band is the classic telephone voice band.
    NOTE(review): despite its name, ``audio_path`` is the sample array (not a
    path), and ``RESAMPLING_FREQ`` is the local sample-rate parameter that
    shadows the imported constant — names kept for caller compatibility.
    """
    nyquist = RESAMPLING_FREQ / 2
    normalized_band = [low / nyquist, high / nyquist]
    sos = butter(4, normalized_band, btype="band", output="sos")
    return sosfilt(sos, audio_path)
def extract_features(audio_path, RESAMPLING_FREQ, frame=0.05):
    """Compute the frame-wise features used for shout-intensity scoring.

    ``audio_path`` is the sample array (name kept for caller compatibility)
    and ``frame`` is the hop length in seconds.

    Returns (rms, flux, rolloff, hnr, times): per-frame RMS energy, onset
    strength (spectral flux), spectral rolloff, a harmonic-to-residual
    energy ratio, and the frame times in seconds.
    """
    hop_length = int(RESAMPLING_FREQ * frame)

    # Harmonic/residual split; the residual (signal minus harmonic part)
    # stands in for the percussive/noisy component.
    harmonic = librosa.effects.harmonic(audio_path)
    residual = audio_path - harmonic

    rms = librosa.feature.rms(y=audio_path, hop_length=hop_length)[0]
    flux = librosa.onset.onset_strength(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop_length)
    rolloff = librosa.feature.spectral_rolloff(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop_length)[0]

    harmonic_energy = librosa.feature.rms(y=harmonic, hop_length=hop_length)[0]
    residual_energy = librosa.feature.rms(y=residual, hop_length=hop_length)[0]
    hnr = harmonic_energy / (residual_energy + 1e-6)  # epsilon avoids /0 on silence

    times = librosa.frames_to_time(np.arange(len(rms)), sr=RESAMPLING_FREQ, hop_length=hop_length)
    return rms, flux, rolloff, hnr, times
def compute_intensity(rms, flux, rolloff, hnr,
                      rms_w=3.0, flux_w=1.3, roll_w=1.0, hnr_w=0.8,
                      baseline_frames=30):
    """Fuse frame-wise features into a single intensity curve in [0, 1].

    The RMS term is z-scored against the first ``baseline_frames`` frames
    (assumed to be the speaker's "normal" level); flux, rolloff and the
    inverted harmonicity are scaled to roughly [0, 1] and mixed with the
    given weights. The sum is clipped at 0 and peak-normalized.

    Parameters
    ----------
    rms, flux, rolloff, hnr : np.ndarray
        Frame-aligned feature vectors of equal length.
    rms_w, flux_w, roll_w, hnr_w : float
        Mixing weights (previously hard-coded; defaults preserve behaviour).
    baseline_frames : int
        Number of initial frames used as the loudness baseline.

    Returns
    -------
    np.ndarray
        Per-frame intensity with peak value 1.0 (or all zeros / empty for
        silent / empty input).
    """
    rms = np.asarray(rms, dtype=float)
    if rms.size == 0:
        return rms  # nothing to score — avoid NaNs from empty statistics

    baseline = rms[:baseline_frames]
    r = (rms - np.mean(baseline)) / (np.std(baseline) + 1e-5)
    f = flux / (np.percentile(flux, 90) + 1e-6)
    # BUGFIX: the original divided by np.max(...) with no guard, producing
    # NaN/inf on all-zero rolloff or hnr (e.g. silent input).
    ro = rolloff / (np.max(rolloff) + 1e-6)
    hn = hnr / (np.max(hnr) + 1e-6)

    intensity = (
        rms_w * np.clip(r, 0, None)
        + flux_w * f
        + roll_w * ro
        + hnr_w * (1 - hn)  # low harmonicity (shouting/strain) raises the score
    )
    intensity = np.maximum(intensity, 0)

    # Peak-normalize to [0, 1]; equivalent to librosa.util.normalize with the
    # default inf-norm, but with an explicit guard for an all-zero curve.
    peak = np.max(np.abs(intensity))
    if peak > 0:
        intensity = intensity / peak
    return intensity
def segment_intensity(times, intensity, thr=0.25):
    """Segment the intensity curve into loud events using hysteresis.

    The curve is smoothed with an exponential moving average, then a
    Schmitt-trigger detector opens a segment when the smoothed value crosses
    ``thr`` and closes it only after the value has stayed below ``0.6 * thr``
    for a ~150 ms hangover (debouncing short dips).

    Parameters
    ----------
    times : np.ndarray
        Frame times in seconds (assumed uniformly spaced).
    intensity : np.ndarray
        Per-frame intensity, same length as ``times``.
    thr : float
        Activation threshold on the smoothed curve.

    Returns
    -------
    (list[tuple[float, float]], np.ndarray)
        (start, end) pairs in seconds, and the smoothed curve.
    """
    smooth = np.copy(intensity)
    # BUGFIX: the original indexed times[1] unconditionally and raised
    # IndexError for signals shorter than two frames.
    if len(times) < 2:
        return [], smooth

    ema_alpha = 0.45
    hangover = int(0.15 / (times[1] - times[0]))  # frames of grace before closing
    for i in range(1, len(intensity)):
        smooth[i] = ema_alpha * intensity[i] + (1 - ema_alpha) * smooth[i - 1]

    on_thr, off_thr = thr, thr * 0.6
    active = False
    counter = 0
    events = []
    start = None
    for i, val in enumerate(smooth):
        if not active and val >= on_thr:
            active = True
            start = times[i]
        if active and val >= off_thr:
            counter = hangover  # refresh the hangover while still loud
        elif active:
            counter -= 1
            if counter <= 0:
                active = False
                events.append((start, times[i]))
                start = None
    # Close an event still open at end-of-signal.
    if active and start is not None:
        events.append((start, times[-1]))
    return events, smooth
def assign_levels(events, intensity, times):
    """Label each (start, end) event with a loudness level.

    The level is chosen from the *median* intensity inside the event so brief
    spikes do not inflate the rating. Events whose span contains no frames
    are dropped.

    Returns a list of (start, end, level, median_intensity, max_intensity).
    """
    labelled = []
    for start, end in events:
        in_window = (times >= start) & (times <= end)
        if not np.any(in_window):
            continue  # no frames fall inside this event
        window = intensity[in_window]
        median_val = np.median(window)
        peak_val = np.max(window)
        if median_val > 0.8:
            level = "4 gritando"
        elif median_val > 0.6:
            level = "3 elevado"
        elif median_val > 0.4:
            level = "2 intermedio"
        else:
            level = "1 bajo"
        labelled.append((start, end, level, median_val, peak_val))
    return labelled
def merge_adjacent_segments(results, gap_threshold=0.3):
    """Merge consecutive same-level segments separated by at most ``gap_threshold`` s.

    Merging extends the end time, keeps a running mean of the median
    intensities, and takes the max of the peak intensities. Input order is
    preserved; a new list is returned.
    """
    if not results:
        return []
    merged = []
    current = list(results[0])  # [start, end, level, median, peak]
    for segment in results[1:]:
        seg_start, seg_end, seg_level, seg_med, seg_max = segment
        same_level = seg_level == current[2]
        close_enough = seg_start - current[1] <= gap_threshold
        if same_level and close_enough:
            current[1] = seg_end
            current[3] = (current[3] + seg_med) / 2
            current[4] = max(current[4], seg_max)
        else:
            merged.append(tuple(current))
            current = list(segment)
    merged.append(tuple(current))
    return merged
def shout(audio_path):
    """Detect loud/shouted speech segments in an audio file.

    Pipeline: load at RESAMPLING_FREQ (mono) -> band-pass to the voice band
    -> per-frame intensity features -> hysteresis segmentation -> level
    labelling -> merge adjacent same-level segments. Only the two loudest
    categories ("elevado", "gritando") are reported.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to analyse.

    Returns
    -------
    str
        One line per detected segment
        ("HH:MM:SS - HH:MM:SS | volumen de voz: <level>"), or a Spanish
        no-detection message.
    """
    if DEBUG_MODE:
        print(f"[MODEL LOADING] Loading shout model")
    y, sr = librosa.load(audio_path, sr=RESAMPLING_FREQ, mono=True)
    y = bandpass_filter(y, sr)
    rms, flux, rolloff, hnr, times = extract_features(y, sr)
    intensity = compute_intensity(rms, flux, rolloff, hnr)
    # Lower activation threshold than the segmenter default (0.25): recall is
    # favoured here and weak events are filtered out by level below.
    events, _ = segment_intensity(times, intensity, thr=0.18)
    results = assign_levels(events, intensity, times)
    results = merge_adjacent_segments(results, gap_threshold=1)
    # Keep only the two loudest categories; lower levels are normal speech.
    results = [
        (st, en, lvl, med, max_val)
        for st, en, lvl, med, max_val in results
        if "elevado" in lvl or "gritando" in lvl
    ]
    formatted = []
    for st, en, lvl, med, max_val in results:
        # BUGFIX: start and end timestamps were concatenated with no
        # separator ("00:00:0100:00:05"); insert " - " between them.
        formatted.append(f"{sec_to_hhmmss(st)} - {sec_to_hhmmss(en)} | volumen de voz: {lvl}")
    if not formatted:
        return "No se detectaron gritos o voces elevadas"
    return "\n".join(formatted)