Merge pull request #2 from langtech-bsc/multithreading-and-optimizations
translate_docx.py CHANGED (+42 -28)
@@ -3,8 +3,6 @@ import json
 import requests
 import tqdm
 import os
-import string
-from collections import defaultdict
 
 from docx import Document
 from docx.text.hyperlink import Hyperlink
@@ -16,7 +14,6 @@ nltk.download('punkt')
 nltk.download('punkt_tab')
 
 from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.tokenize.treebank import TreebankWordDetokenizer
 
 from subprocess import Popen, PIPE
 
@@ -59,12 +56,15 @@ class Aligner():
         fastalign_bin = "./fast_align"
         atools_bin = "./atools"
 
-        self.forward_command = lambda \
-            x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
-        self.reverse_command = lambda \
-            x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
+        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
 
-        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
+        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", fwd_T, "-m", fwd_m, "-f",
+                                forward_params_path]
+        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d", "-T", rev_T, "-m", rev_m, "-f",
+                                reverse_params_path, "r"]
+
+        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path, "-j",
+                                  self.reverse_alignment_file_path, "-c", "grow-diag-final-and"]
 
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
@@ -82,20 +82,28 @@ class Aligner():
             T = line.split()[-1]
         return T, m
 
-    def align(self, input_file):
+    def align(self, original_sentences, translated_sentences):
+        # create temporary file which fastalign will use
+        with open(self.temp_file_path, "w") as temp_file:
+            for original, translated in zip(original_sentences, translated_sentences):
+                temp_file.write(f"{original} ||| {translated}\n")
+
         # generate forward alignment
-        process = Popen(self.forward_command(input_file), shell=True)
-        process.wait()
-        # generate reverse alignment
-        process = Popen(self.reverse_command(input_file), shell=True)
-        process.wait()
+        with open(self.forward_alignment_file_path, 'w') as f_out, open(self.reverse_alignment_file_path, 'w') as r_out:
+            fw_process = Popen(self.forward_command, stdout=f_out)
+            # generate reverse alignment
+            r_process = Popen(self.reverse_command, stdout=r_out)
+
+            # wait for both to finish
+            fw_process.wait()
+            r_process.wait()
 
         # for some reason the output file contains more information than needed, remove it
         self.__simplify_alignment_file(self.forward_alignment_file_path)
         self.__simplify_alignment_file(self.reverse_alignment_file_path)
 
         # generate symmetrical alignment
-        process = Popen(self.symmetric_command, shell=True, stdout=PIPE)
+        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
         process.wait()
 
         # get final alignments and format them
@@ -180,8 +188,6 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     for f in os.listdir(temp_folder):
         os.remove(os.path.join(temp_folder, f))
 
-    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")
-
     # tokenize the original text by sentence and words while keeping the style
     original_tokenized_sentences_with_style = [tokenize_with_runs(runs, detokenizer) for runs in
                                                original_paragraphs_with_runs]
@@ -194,13 +200,13 @@ def generate_alignments(original_paragraphs_with_runs, translated_paragraphs, al
     translated_tokenized_sentences = [word_tokenize(sentence) for
                                       translated_paragraph in translated_paragraphs for sentence in
                                       sent_tokenize(translated_paragraph)]
+    original_sentences = []
+    translated_sentences = []
+    for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
+        original_sentences.append(' '.join(item['text'] for item in original))
+        translated_sentences.append(' '.join(translated))
 
-
-    with open(temp_file_path, "w") as out_file:
-        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
-            out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")
-
-    alignments = aligner.align(temp_file_path)
+    alignments = aligner.align(original_sentences, translated_sentences)
 
     # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
     translated_sentences_with_style = []
@@ -238,7 +244,7 @@ def group_by_style(values, detokenizer):
                                     x['paragraph_index'])):
         text = detokenizer.detokenize([item['text'] for item in group])
 
-        if groups and not text.startswith((",", ";", ":", ".", ")")):
+        if groups and not text.startswith((",", ";", ":", ".", ")", "!", "?")):
             text = " " + text
 
         groups.append({"text": text,
@@ -309,21 +315,29 @@ def translate_document(input_file,
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
     print("Generating alignments...")
+    start_time = time.time()
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                           translated_paragraphs, aligner,
                                                           temp_folder, detokenizer)
-    print("Finished alignments")
+    print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
-    print("Grouped by style")
 
     # group the runs by original paragraph
-    translated_paragraphs_with_style = defaultdict(list)
+    translated_paragraphs_with_style = dict()
     for item in translated_runs_with_style:
-        translated_paragraphs_with_style[item['paragraph_index']].append(item)
+        if item['paragraph_index'] in translated_paragraphs_with_style:
+            translated_paragraphs_with_style[item['paragraph_index']].append(item)
+        else:
+            # first item in the paragraph, remove starting blank space we introduced in group_by_style(), where we
+            # didn't know where paragraphs started and ended
+            first_item_in_paragraph = item.copy()
+            first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
+            translated_paragraphs_with_style[item['paragraph_index']] = []
+            translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
 
     for paragraph_index, original_paragraph in enumerate(doc.paragraphs):
         # in case there are empty paragraphs
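The core of the speed-up: the forward and reverse fast_align passes, which previously ran back to back through the shell, are now launched as two concurrent subprocesses with stdout redirected to the alignment files, and the commands are argv lists instead of interpolated shell strings. A minimal sketch of the same pattern, with illustrative paths and flags rather than the Space's actual configuration:

import subprocess

# Launch both directional aligners at once; each writes to its own file.
# Argv lists avoid shell quoting pitfalls and the need for shell=True.
with open("forward.align", "w") as fwd_out, open("reverse.align", "w") as rev_out:
    fwd = subprocess.Popen(["./fast_align", "-i", "corpus.txt", "-d", "-o", "-v"],
                           stdout=fwd_out)
    rev = subprocess.Popen(["./fast_align", "-i", "corpus.txt", "-d", "-o", "-v", "-r"],
                           stdout=rev_out)
    # Both processes run in parallel; wait for each to finish.
    fwd.wait()
    rev.wait()

Because fast_align does its work in separate OS processes, the two passes overlap fully; the Python GIL plays no role here.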
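For context on the temporary file that align() now writes itself: fast_align consumes one sentence pair per line, source and target tokens separated by " ||| ", and emits Pharaoh-format alignments, i.e. zero-based "i-j" index pairs. A small illustration with a made-up sentence pair:

# Build the parallel file fast_align reads.
pairs = [("the house is small", "la casa es pequeña")]
with open("corpus.txt", "w") as f:
    for source, target in pairs:
        f.write(f"{source} ||| {target}\n")

# fast_align then prints one line per pair, e.g. "0-0 1-1 2-2 3-3",
# where "i-j" means source token i aligns with target token j.
line = "0-0 1-1 2-2 3-3"
links = [tuple(map(int, pair.split("-"))) for pair in line.split()]
# links == [(0, 0), (1, 1), (2, 2), (3, 3)]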
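The symmetric step feeds both directional alignments to atools, whose grow-diag-final-and heuristic merges forward and reverse links into a single symmetric alignment. A sketch of the same call using subprocess.run, which captures the whole of stdout and so sidesteps the deadlock risk of calling wait() on an unread PIPE:

import subprocess

# atools ships with fast_align; -i/-j take the two directional alignment
# files and -c selects the symmetrization heuristic.
result = subprocess.run(
    ["./atools", "-i", "forward.align", "-j", "reverse.align",
     "-c", "grow-diag-final-and"],
    capture_output=True, text=True, check=True,
)
symmetric = result.stdout.splitlines()  # one "i-j ..." line per sentence pair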
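Finally, the switch from a defaultdict to a plain dict is what lets the loop spot the first run of each paragraph and strip the leading space that group_by_style() added while paragraph boundaries were still unknown. The same pattern in isolation, with hypothetical runs:

# Hypothetical runs: tokens regrouped by style, each tagged with the index
# of the paragraph it came from.
runs = [
    {"paragraph_index": 0, "text": "Hello"},
    {"paragraph_index": 0, "text": " world"},
    {"paragraph_index": 1, "text": " Next paragraph"},  # leading space from detokenization
]

paragraphs = {}
for run in runs:
    index = run["paragraph_index"]
    if index in paragraphs:
        paragraphs[index].append(run)
    else:
        # First run of this paragraph: drop the spurious leading space.
        first = run.copy()
        first["text"] = first["text"].lstrip(" ")
        paragraphs[index] = [first]

assert paragraphs[1][0]["text"] == "Next paragraph"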