zirobtc
/

miko-tts

your-custom-library-name

voice-synthesis

Model card Files Files and versions

miko-tts / aligner.py

zirobtc's picture

Uploading DART folder into model repo

478eeb0 verified 4 months ago

history blame contribute delete

2.29 kB

	# aligner.py

	import os
	import re
	import tempfile
	from typing import Dict, Any

	# These imports are from your original script and are installed by your setup.sh
	from aeneas.executetask import ExecuteTask
	from aeneas.task import Task

	def setup_aligner():
	    """
	    Report that the aeneas aligner is ready.

	    Aeneas does not load a model, so there is nothing to initialise here.
	    The function exists only so callers can treat every component's setup
	    step the same way.
	    """
	    print("✅ Aeneas aligner is ready (no setup required).")

	def align_words(audio_bytes: bytes, transcript: str) -> Dict[str, Any]:
	    """
	    Align each word of a transcript to its start time in the audio using aeneas.

	    aeneas is file-based, so the audio and a one-word-per-line transcript are
	    written to temporary files, which are removed again before returning
	    (whether alignment succeeds or raises).

	    Args:
	        audio_bytes: Raw contents of the audio file; written out unchanged
	            to a temporary file with a ".wav" suffix.
	        transcript: Text spoken in the audio. Only runs of ASCII letters and
	            apostrophes are kept as words; digits and punctuation are dropped.

	    Returns:
	        A dict with two parallel lists: "word" (the aligned words) and
	        "startTime" (each word's start time as a float, as reported by
	        aeneas). Both lists are empty when the transcript contains no
	        alignable words or aeneas produces no sync map.
	    """
	    words_only = re.findall(r"\b[a-zA-Z']+\b", transcript)
	    if not words_only:
	        # Nothing to align: skip the temp files and the aeneas run entirely.
	        return {"word": [], "startTime": []}

	    # aeneas configuration strings are key=value pairs separated by a plain "|".
	    config = (
	        "task_language=eng|"
	        "is_text_type=plain|"
	        "os_task_file_format=json|"
	        "task_adjust_boundary_algorithm=percent|"
	        "task_adjust_boundary_percent_value=30"
	    )

	    audio_path = None
	    text_path = None
	    try:
	        # delete=False keeps the files on disk after the `with` blocks close
	        # them; closing first guarantees the data is flushed before aeneas
	        # opens the files by path. Removal happens in `finally` below.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as af:
	            audio_path = af.name
	            af.write(audio_bytes)

	        with tempfile.NamedTemporaryFile(
	            mode="w", delete=False, suffix=".txt", encoding="utf-8"
	        ) as tf:
	            text_path = tf.name
	            # One word per line so that every word becomes its own fragment.
	            tf.write("\n".join(words_only))

	        # Set up and run the aeneas alignment task.
	        task = Task(config_string=config)
	        task.audio_file_path_absolute = audio_path
	        task.text_file_path_absolute = text_path

	        ExecuteTask(task).execute()

	        # Collect the aligned words and their start times.
	        words = []
	        start_times = []
	        if task.sync_map is not None:
	            for fragment in task.sync_map.fragments:
	                word = fragment.text.strip()
	                # Fragments without text carry no word, so they are skipped.
	                if word:
	                    words.append(word)
	                    start_times.append(float(fragment.begin))

	        return {"word": words, "startTime": start_times}
	    finally:
	        # Best-effort cleanup of whichever files were actually created; a
	        # failed removal must not hide the result or the original error,
	        # nor prevent the other file from being removed.
	        for path in (audio_path, text_path):
	            if path is None:
	                continue
	            try:
	                os.unlink(path)
	            except OSError:
	                pass