# asr-inference / shout_detector.py
# Author: Sarah Solito
# Fase_1 and Fase_2 releases, code cleaned (commit d6fb6a2)
import librosa
from scipy.signal import butter, sosfilt
import numpy as np
from settings import DEBUG_MODE, RESAMPLING_FREQ
from audio_utils import sec_to_hhmmss
def bandpass_filter(audio_path, RESAMPLING_FREQ, low=300, high=3400):
    """Apply a 4th-order Butterworth band-pass filter to an audio signal.

    The default 300-3400 Hz band is the classic telephone voice band.
    NOTE(review): despite its name, ``audio_path`` is the sample array (not a
    path), and ``RESAMPLING_FREQ`` is the local sample-rate parameter that
    shadows the imported constant — names kept for caller compatibility.
    """
    nyquist = RESAMPLING_FREQ / 2
    normalized_band = [low / nyquist, high / nyquist]
    sos = butter(4, normalized_band, btype="band", output="sos")
    return sosfilt(sos, audio_path)
def extract_features(audio_path, RESAMPLING_FREQ, frame=0.05):
    """Compute the frame-wise features used for shout-intensity scoring.

    ``audio_path`` is the sample array (name kept for caller compatibility)
    and ``frame`` is the hop length in seconds.

    Returns (rms, flux, rolloff, hnr, times): per-frame RMS energy, onset
    strength (spectral flux), spectral rolloff, a harmonic-to-residual
    energy ratio, and the frame times in seconds.
    """
    hop_length = int(RESAMPLING_FREQ * frame)

    # Harmonic/residual split; the residual (signal minus harmonic part)
    # stands in for the percussive/noisy component.
    harmonic = librosa.effects.harmonic(audio_path)
    residual = audio_path - harmonic

    rms = librosa.feature.rms(y=audio_path, hop_length=hop_length)[0]
    flux = librosa.onset.onset_strength(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop_length)
    rolloff = librosa.feature.spectral_rolloff(y=audio_path, sr=RESAMPLING_FREQ, hop_length=hop_length)[0]

    harmonic_energy = librosa.feature.rms(y=harmonic, hop_length=hop_length)[0]
    residual_energy = librosa.feature.rms(y=residual, hop_length=hop_length)[0]
    hnr = harmonic_energy / (residual_energy + 1e-6)  # epsilon avoids /0 on silence

    times = librosa.frames_to_time(np.arange(len(rms)), sr=RESAMPLING_FREQ, hop_length=hop_length)
    return rms, flux, rolloff, hnr, times
def compute_intensity(rms, flux, rolloff, hnr,
                      rms_w=3.0, flux_w=1.3, roll_w=1.0, hnr_w=0.8,
                      baseline_frames=30):
    """Fuse frame-wise features into a single intensity curve in [0, 1].

    The RMS term is z-scored against the first ``baseline_frames`` frames
    (assumed to be the speaker's "normal" level); flux, rolloff and the
    inverted harmonicity are scaled to roughly [0, 1] and mixed with the
    given weights. The sum is clipped at 0 and peak-normalized.

    Parameters
    ----------
    rms, flux, rolloff, hnr : np.ndarray
        Frame-aligned feature vectors of equal length.
    rms_w, flux_w, roll_w, hnr_w : float
        Mixing weights (previously hard-coded; defaults preserve behaviour).
    baseline_frames : int
        Number of initial frames used as the loudness baseline.

    Returns
    -------
    np.ndarray
        Per-frame intensity with peak value 1.0 (or all zeros / empty for
        silent / empty input).
    """
    rms = np.asarray(rms, dtype=float)
    if rms.size == 0:
        return rms  # nothing to score — avoid NaNs from empty statistics

    baseline = rms[:baseline_frames]
    r = (rms - np.mean(baseline)) / (np.std(baseline) + 1e-5)
    f = flux / (np.percentile(flux, 90) + 1e-6)
    # BUGFIX: the original divided by np.max(...) with no guard, producing
    # NaN/inf on all-zero rolloff or hnr (e.g. silent input).
    ro = rolloff / (np.max(rolloff) + 1e-6)
    hn = hnr / (np.max(hnr) + 1e-6)

    intensity = (
        rms_w * np.clip(r, 0, None)
        + flux_w * f
        + roll_w * ro
        + hnr_w * (1 - hn)  # low harmonicity (shouting/strain) raises the score
    )
    intensity = np.maximum(intensity, 0)

    # Peak-normalize to [0, 1]; equivalent to librosa.util.normalize with the
    # default inf-norm, but with an explicit guard for an all-zero curve.
    peak = np.max(np.abs(intensity))
    if peak > 0:
        intensity = intensity / peak
    return intensity
def segment_intensity(times, intensity, thr=0.25):
    """Segment the intensity curve into loud events using hysteresis.

    The curve is smoothed with an exponential moving average, then a
    Schmitt-trigger detector opens a segment when the smoothed value crosses
    ``thr`` and closes it only after the value has stayed below ``0.6 * thr``
    for a ~150 ms hangover (debouncing short dips).

    Parameters
    ----------
    times : np.ndarray
        Frame times in seconds (assumed uniformly spaced).
    intensity : np.ndarray
        Per-frame intensity, same length as ``times``.
    thr : float
        Activation threshold on the smoothed curve.

    Returns
    -------
    (list[tuple[float, float]], np.ndarray)
        (start, end) pairs in seconds, and the smoothed curve.
    """
    smooth = np.copy(intensity)
    # BUGFIX: the original indexed times[1] unconditionally and raised
    # IndexError for signals shorter than two frames.
    if len(times) < 2:
        return [], smooth

    ema_alpha = 0.45
    hangover = int(0.15 / (times[1] - times[0]))  # frames of grace before closing
    for i in range(1, len(intensity)):
        smooth[i] = ema_alpha * intensity[i] + (1 - ema_alpha) * smooth[i - 1]

    on_thr, off_thr = thr, thr * 0.6
    active = False
    counter = 0
    events = []
    start = None
    for i, val in enumerate(smooth):
        if not active and val >= on_thr:
            active = True
            start = times[i]
        if active and val >= off_thr:
            counter = hangover  # refresh the hangover while still loud
        elif active:
            counter -= 1
            if counter <= 0:
                active = False
                events.append((start, times[i]))
                start = None
    # Close an event still open at end-of-signal.
    if active and start is not None:
        events.append((start, times[-1]))
    return events, smooth
def assign_levels(events, intensity, times):
    """Label each (start, end) event with a loudness level.

    The level is chosen from the *median* intensity inside the event so brief
    spikes do not inflate the rating. Events whose span contains no frames
    are dropped.

    Returns a list of (start, end, level, median_intensity, max_intensity).
    """
    labelled = []
    for start, end in events:
        in_window = (times >= start) & (times <= end)
        if not np.any(in_window):
            continue  # no frames fall inside this event
        window = intensity[in_window]
        median_val = np.median(window)
        peak_val = np.max(window)
        if median_val > 0.8:
            level = "4 gritando"
        elif median_val > 0.6:
            level = "3 elevado"
        elif median_val > 0.4:
            level = "2 intermedio"
        else:
            level = "1 bajo"
        labelled.append((start, end, level, median_val, peak_val))
    return labelled
def merge_adjacent_segments(results, gap_threshold=0.3):
    """Merge consecutive same-level segments separated by at most ``gap_threshold`` s.

    Merging extends the end time, keeps a running mean of the median
    intensities, and takes the max of the peak intensities. Input order is
    preserved; a new list is returned.
    """
    if not results:
        return []
    merged = []
    current = list(results[0])  # [start, end, level, median, peak]
    for segment in results[1:]:
        seg_start, seg_end, seg_level, seg_med, seg_max = segment
        same_level = seg_level == current[2]
        close_enough = seg_start - current[1] <= gap_threshold
        if same_level and close_enough:
            current[1] = seg_end
            current[3] = (current[3] + seg_med) / 2
            current[4] = max(current[4], seg_max)
        else:
            merged.append(tuple(current))
            current = list(segment)
    merged.append(tuple(current))
    return merged
def shout(audio_path):
    """Detect loud/shouted speech segments in an audio file.

    Pipeline: load at RESAMPLING_FREQ (mono) -> band-pass to the voice band
    -> per-frame intensity features -> hysteresis segmentation -> level
    labelling -> merge adjacent same-level segments. Only the two loudest
    categories ("elevado", "gritando") are reported.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to analyse.

    Returns
    -------
    str
        One line per detected segment
        ("HH:MM:SS - HH:MM:SS | volumen de voz: <level>"), or a Spanish
        no-detection message.
    """
    if DEBUG_MODE:
        print(f"[MODEL LOADING] Loading shout model")
    y, sr = librosa.load(audio_path, sr=RESAMPLING_FREQ, mono=True)
    y = bandpass_filter(y, sr)
    rms, flux, rolloff, hnr, times = extract_features(y, sr)
    intensity = compute_intensity(rms, flux, rolloff, hnr)
    # Lower activation threshold than the segmenter default (0.25): recall is
    # favoured here and weak events are filtered out by level below.
    events, _ = segment_intensity(times, intensity, thr=0.18)
    results = assign_levels(events, intensity, times)
    results = merge_adjacent_segments(results, gap_threshold=1)
    # Keep only the two loudest categories; lower levels are normal speech.
    results = [
        (st, en, lvl, med, max_val)
        for st, en, lvl, med, max_val in results
        if "elevado" in lvl or "gritando" in lvl
    ]
    formatted = []
    for st, en, lvl, med, max_val in results:
        # BUGFIX: start and end timestamps were concatenated with no
        # separator ("00:00:0100:00:05"); insert " - " between them.
        formatted.append(f"{sec_to_hhmmss(st)} - {sec_to_hhmmss(en)} | volumen de voz: {lvl}")
    if not formatted:
        return "No se detectaron gritos o voces elevadas"
    return "\n".join(formatted)