Spaces:
Sleeping
Sleeping
| import fileinput | |
| import os | |
| import platform | |
| from subprocess import Popen, PIPE | |
| # Class to align original and translated sentences | |
| # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py | |
class Aligner():
    """Word-align original and translated sentences with fast_align and atools.

    The constructor prepares three command lines: a forward fast_align run, a
    reverse fast_align run, and an atools run that symmetrises the two results
    with the ``grow-diag-final-and`` heuristic. ``align`` executes them on a
    batch of sentence pairs.

    The binaries are looked up as ``./fast_align`` / ``./atools`` (or the
    ``.exe`` names on Windows), i.e. relative to the current working directory.
    """

    def __init__(self, config_folder: str, source_lang: str, target_lang: str, temp_folder: str):
        """Build the fast_align / atools command lines.

        Args:
            config_folder: Folder holding ``{src}-{tgt}.params`` / ``.err`` and
                ``{tgt}-{src}.params`` / ``.err``. Both ``.err`` files are read
                here, so they must exist.
            source_lang: Source language code used in the file names.
            target_lang: Target language code used in the file names.
            temp_folder: Folder where the intermediate files are written.
        """
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
                                forward_params_path]
        # The reverse run needs the "-r" option. It used to be passed as the
        # bare token "r", which is not an option flag, so the reverse model was
        # never run in reverse mode.
        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
                                reverse_params_path, "-r"]

        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]

    def __simplify_alignment_file(self, file):
        """Rewrite ``file`` so each line keeps only its third ``|||`` field.

        The previous content is kept next to it as ``file + '.bak'``
        (replacing any older backup). Files are handled as UTF-8 so the result
        does not depend on the platform's locale encoding.
        """
        backup_path = file + ".bak"
        os.replace(file, backup_path)
        with open(backup_path, encoding="utf-8") as source, open(file, "w", encoding="utf-8") as target:
            for line in source:
                target.write(line.split("|||")[2].strip() + "\n")

    def __read_err(self, err):
        """Extract the tension and length-ratio values from a ``.err`` log.

        Returns:
            ``(T, m)`` as strings: the last token of the line containing
            ``final tension`` and of the line containing
            ``expected target length``. A value that is not found is ``''``.
        """
        T, m = "", ""
        # Context manager so the log file is closed deterministically.
        with open(err) as err_file:
            for line in err_file:
                # expected target length = source length * N
                if "expected target length" in line:
                    m = line.split()[-1]
                # final tension: N
                elif "final tension" in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def _parse_alignments(alignments_str):
        """Turn ``"i-j i-j ..."`` lines into one list of ``(i, j)`` int pairs per line.

        A blank line (a sentence pair without any alignment point) yields an
        empty list, so the result stays index-aligned with the input lines.
        """
        alignments = []
        for line in alignments_str.splitlines():
            pairs = []
            for pair in line.split():
                i, j = pair.split("-")
                pairs.append((int(i), int(j)))
            alignments.append(pairs)
        return alignments

    def align(self, original_sentences, translated_sentences):
        """Compute symmetric word alignments for parallel sentences.

        Args:
            original_sentences: Iterable of source sentences (each is written
                on one line, so it should not contain newlines).
            translated_sentences: Iterable of the corresponding translations.
                Pairs are formed with ``zip``, so extra items on either side
                are ignored.

        Returns:
            One list per line printed by atools, each containing
            ``(source_index, target_index)`` integer tuples.
        """
        # Create the temporary file which fast_align will use. Written as UTF-8
        # to match how the tool output is decoded below.
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.writelines(
                f"{original} ||| {translated}\n"
                for original, translated in zip(original_sentences, translated_sentences)
            )

        # Run the forward and reverse alignments concurrently, then wait for both.
        with open(self.forward_alignment_file_path, "w") as f_out, \
                open(self.reverse_alignment_file_path, "w") as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)
            fw_process.wait()
            r_process.wait()

        # The output lines carry more fields than the alignment itself; keep
        # only the alignment field before handing the files to atools.
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # Generate the symmetrical alignment. communicate() drains the pipes
        # while waiting; calling wait() first can deadlock once the child has
        # filled the stdout/stderr pipe buffers.
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout_data, _ = process.communicate()

        return self._parse_alignments(stdout_data.decode("utf-8"))