# Author: Mohammed Zeeshan Parvez
# feat: initialize ParlerVoice Hugging Face Space (commit 4089011)
import numpy as np
import soundfile as sf
def normalize_audio(audio: np.ndarray, target_level_db: float = -20.0) -> np.ndarray:
    """Scale audio so its RMS matches target_level_db, with peak protection.

    An all-zero signal is returned unchanged. If normalization pushes any
    sample past full scale (|x| > 1.0), the result is rescaled so its peak
    sits at 0.95 to leave a little headroom.
    """
    current_rms = float(np.sqrt(np.mean(audio ** 2)))
    if current_rms == 0.0:
        # Silence: no meaningful gain can be computed.
        return audio
    gain = (10 ** (target_level_db / 20.0)) / current_rms
    scaled = audio * gain
    peak = float(np.abs(scaled).max())
    # Pull back below full scale if the gain caused clipping.
    return scaled / peak * 0.95 if peak > 1.0 else scaled
def save_wav(path: str, audio: np.ndarray, samplerate: int) -> None:
    """Write *audio* to *path* as a WAV file at the given sample rate."""
    sf.write(path, audio, samplerate)
def shorten_long_silences(
    audio: np.ndarray,
    samplerate: int,
    silence_threshold_db: float = -40.0,
    max_silence_ms: int = 800,
    collapse_trigger_ms: int = 2000,
) -> np.ndarray:
    """Collapse continuous silences longer than collapse_trigger_ms down to max_silence_ms.

    The signal is split into 10 ms frames; a frame whose RMS level is below
    silence_threshold_db counts as silent. Silent runs no longer than
    collapse_trigger_ms are kept in full; longer runs are truncated to the
    first max_silence_ms worth of frames. Non-silent audio is never modified.

    Args:
        audio: 1-D waveform (assumed mono float samples — TODO confirm with callers).
        samplerate: samples per second, used to size the analysis frames.
        silence_threshold_db: RMS level (dBFS) below which a frame is silent.
        max_silence_ms: maximum silence duration kept after collapsing.
        collapse_trigger_ms: minimum silence duration that triggers collapsing.

    Returns:
        A new array with long silences shortened; the empty input is returned
        unchanged.
    """
    if audio.size == 0:
        return audio

    # 10 ms analysis windows give a robust frame-level silence detector;
    # force at least 2 samples per window so RMS is meaningful.
    window_ms = 10
    window = max(2, int(samplerate * window_ms / 1000))

    # Zero-pad so the signal reshapes cleanly into whole frames. The pad is
    # trimmed back out below by clamping kept ranges to the original length.
    pad = (window - (audio.shape[0] % window)) % window
    audio_padded = np.pad(audio, (0, pad), mode="constant") if pad else audio

    frames = audio_padded.reshape(-1, window)
    # +1e-12 avoids log10(0) on perfectly silent frames.
    rms = np.sqrt(np.mean(frames ** 2, axis=1) + 1e-12)
    rms_db = 20 * np.log10(np.maximum(rms, 1e-12))
    silence_mask = rms_db < silence_threshold_db

    max_keep_frames = max(1, int(max_silence_ms / window_ms))
    collapse_trigger_frames = max(1, int(collapse_trigger_ms / window_ms))

    total = frames.shape[0]
    orig_len = audio.shape[0]

    # Walk runs of equal mask value and record the sample range to keep from
    # each run. Only silent runs longer than the trigger are truncated.
    segments: list[tuple[int, int]] = []
    i = 0
    while i < total:
        j = i
        while j < total and silence_mask[j] == silence_mask[i]:
            j += 1
        run = j - i
        if silence_mask[i] and run > collapse_trigger_frames:
            keep = max_keep_frames
        else:
            keep = run
        segments.append((i * window, (i + keep) * window))
        i = j

    # Reassemble, clamping each range to the original length so the zero
    # padding added above never appears in the output.
    pieces = []
    for start, end in segments:
        end = min(end, orig_len)
        if start < end:
            pieces.append(audio_padded[start:end])
    return np.concatenate(pieces)