miko-tts / aligner.py
zirobtc's picture
Uploading DART folder into model repo
478eeb0 verified
# aligner.py
import os
import re
import tempfile
from typing import Dict, Any
# Third-party dependency: aeneas (expected to be installed by setup.sh).
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
def setup_aligner():
    """
    No-op initialisation hook for the aeneas aligner.

    aeneas needs no model loaded up front, so there is nothing to prepare
    here; the function only exists so main.py can call a setup step for this
    component the same way it does for the others. It just reports readiness
    on stdout.
    """
    ready_message = "✅ Aeneas aligner is ready (no setup required)."
    print(ready_message)
def align_words(audio_bytes: bytes, transcript: str) -> Dict[str, Any]:
    """
    Align the words of ``transcript`` to ``audio_bytes`` using aeneas.

    aeneas is file-based, so the audio and a one-word-per-line version of the
    transcript are written to temporary files that are removed again before
    this function returns, whether alignment succeeds or raises.

    Args:
        audio_bytes: Raw audio data; written unchanged to a temporary
            ``.wav`` file.
        transcript: Text spoken in the audio. Only runs of ASCII letters and
            apostrophes are kept as words; digits, punctuation and non-ASCII
            characters are dropped before alignment.

    Returns:
        A dict with two parallel lists: ``"word"`` (the aligned, non-empty
        fragment texts) and ``"startTime"`` (``float(fragment.begin)`` for
        each of them; presumably seconds -- confirm against the aeneas docs).
        Both lists are empty when the transcript contains no alignable words
        or aeneas produces no sync map.

    Raises:
        Whatever aeneas raises while building or executing the task, and
        ``TypeError`` if the arguments are not bytes / str respectively.
    """
    config = (
        "task_language=eng|"
        "is_text_type=plain|"
        "os_task_file_format=json|"
        "task_adjust_boundary_algorithm=percent|"
        "task_adjust_boundary_percent_value=30"
    )

    # One word per line: with is_text_type=plain each line is one fragment.
    words_only = re.findall(r"\b[a-zA-Z']+\b", transcript)
    if not words_only:
        # Nothing to align; skip the temp files and the aeneas run entirely
        # instead of handing aeneas an empty text file.
        return {"word": [], "startTime": []}

    audio_path = None
    text_path = None
    try:
        # delete=False because aeneas reopens the files by path after they
        # have been closed here; removal happens in the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio_file:
            # Record the path first so a failed write is still cleaned up.
            audio_path = audio_file.name
            audio_file.write(audio_bytes)

        # The words are ASCII-only (see the regex above), so the explicit
        # encoding merely makes the file independent of the locale default.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as text_file:
            text_path = text_file.name
            text_file.write("\n".join(words_only))

        # Both files are closed (and therefore flushed) before aeneas runs.
        task = Task(config_string=config)
        task.audio_file_path_absolute = audio_path
        task.text_file_path_absolute = text_path
        ExecuteTask(task).execute()

        # Collect the aligned words and their start times.
        words = []
        start_times = []
        if task.sync_map is not None:
            for fragment in task.sync_map.fragments:
                word = fragment.text.strip()
                if word:
                    words.append(word)
                    start_times.append(float(fragment.begin))
        return {"word": words, "startTime": start_times}
    finally:
        # Remove whichever temp files were actually created. Each path is
        # handled separately so one missing file does not prevent the other
        # from being removed.
        for temp_path in (audio_path, text_path):
            if temp_path is None:
                continue
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass