# aligner.py

import os
import re
import tempfile
from typing import Dict, Any

# These imports are from your original script and are installed by your setup.sh
from aeneas.executetask import ExecuteTask
from aeneas.task import Task

def setup_aligner():
    """
    Aeneas does not require a model to be loaded, so this function does nothing.
    It exists to keep the structure of main.py consistent.
    """
    print("✅ Aeneas aligner is ready (no setup required).")

def align_words(audio_bytes: bytes, transcript: str) -> Dict[str, Any]:
    """
    Performs word alignment using the file-based aeneas library.
    This is run sequentially to ensure stability.
    """
    config = (
        "task_language=eng|"
        "is_text_type=plain|"
        "os_task_file_format=json|"
        "task_adjust_boundary_algorithm=percent|"
        "task_adjust_boundary_percent_value=30"
    )

    # Use a with statement to ensure temporary files are always cleaned up
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as af, \
         tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".txt") as tf:
        
        # Write audio to a temporary file
        af.write(audio_bytes)
        audio_path = af.name
        
        # Write the formatted transcript to a temporary file
        words_only = re.findall(r"\b[a-zA-Z']+\b", transcript)
        formatted_transcript = "\n".join(words_only)
        tf.write(formatted_transcript)
        text_path = tf.name

    try:
        # Setup and run the aeneas alignment task
        task = Task(config_string=config)
        task.audio_file_path_absolute = audio_path
        task.text_file_path_absolute = text_path
        
        ExecuteTask(task).execute()

        # Extract the aligned words and start times
        words = []
        start_times = []
        if task.sync_map is not None:
            for fragment in task.sync_map.fragments:
                word = fragment.text.strip()
                if word:
                    words.append(word)
                    start_times.append(float(fragment.begin))
        
        return {"word": words, "startTime": start_times}

    finally:
        # Manually clean up the temporary files
        os.unlink(audio_path)
        os.unlink(text_path)
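

# --- Example usage (illustrative sketch, not shipped with the project) ---
# "sample.wav" and the transcript string below are hypothetical placeholders;
# this block only demonstrates how main.py is expected to call align_words().
if __name__ == "__main__":
    setup_aligner()
    with open("sample.wav", "rb") as f:  # hypothetical input file
        audio = f.read()
    result = align_words(audio, "hello world this is a test")
    for word, start in zip(result["word"], result["startTime"]):
        print(f"{start:7.3f}s  {word}")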