|
|
|
|
|
|
|
|
import os |
|
|
import re |
|
|
import tempfile |
|
|
from typing import Dict, Any |
|
|
|
|
|
|
|
|
from aeneas.executetask import ExecuteTask |
|
|
from aeneas.task import Task |
|
|
|
|
|
def setup_aligner():
    """Announce that the aeneas aligner is ready to use.

    Aeneas does not need a model to be loaded, so there is nothing to
    initialise here. The function is kept only so that the structure of
    main.py stays consistent.
    """
    ready_message = "✅ Aeneas aligner is ready (no setup required)."
    print(ready_message)
|
|
|
|
|
def align_words(audio_bytes: bytes, transcript: str) -> Dict[str, Any]:
    """Align the words of a transcript to an audio clip using aeneas.

    aeneas works on files, so the audio and the word list are staged in a
    private temporary directory that is removed when the function exits,
    whether alignment succeeds or fails.

    Args:
        audio_bytes: Raw contents of the audio file; written to disk
            unchanged with a ``.wav`` extension.
        transcript: Free-form text. Only runs of ASCII letters and
            apostrophes are kept; each one becomes one line of the
            plain-text file handed to aeneas.

    Returns:
        A dict with two parallel lists: ``"word"`` (the non-empty fragment
        texts reported by aeneas) and ``"startTime"`` (each fragment's begin
        time as a float). Both lists are empty when the transcript contains
        no alignable words or when aeneas produces no sync map.

    Raises:
        Any exception raised by aeneas while building or executing the task
        is propagated to the caller.
    """
    words_only = re.findall(r"\b[a-zA-Z']+\b", transcript)
    if not words_only:
        # Nothing to align: an empty text file gives aeneas no fragments to
        # work with, so skip the file I/O and the task entirely.
        # NOTE(review): aeneas is expected to reject such a task with an
        # error; callers now get an empty result instead - confirm this is
        # the desired contract.
        return {"word": [], "startTime": []}

    config = (
        "task_language=eng|"
        "is_text_type=plain|"
        "os_task_file_format=json|"
        "task_adjust_boundary_algorithm=percent|"
        "task_adjust_boundary_percent_value=30"
    )

    # A temporary directory guarantees that both staged files are removed
    # even if writing them, or the alignment itself, raises.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "audio.wav")
        text_path = os.path.join(workdir, "transcript.txt")

        with open(audio_path, "wb") as audio_file:
            audio_file.write(audio_bytes)

        # One word per line, matching the "plain" text type in the config.
        with open(text_path, "w", encoding="utf-8") as text_file:
            text_file.write("\n".join(words_only))

        task = Task(config_string=config)
        task.audio_file_path_absolute = audio_path
        task.text_file_path_absolute = text_path

        ExecuteTask(task).execute()

        words = []
        start_times = []
        if task.sync_map is not None:
            for fragment in task.sync_map.fragments:
                word = fragment.text.strip()
                # Fragments without text carry no word, so they are skipped.
                if word:
                    words.append(word)
                    start_times.append(float(fragment.begin))

        return {"word": words, "startTime": start_times}
|
|
|