Added support for Windows and Linux, removed an unused function, added more logs
translate_docx.py  +27 -47
CHANGED
@@ -10,6 +10,7 @@ from docx import Document
 from docx.text.hyperlink import Hyperlink
 from docx.text.run import Run
 import nltk
+import platform
 
 nltk.download('punkt')
 nltk.download('punkt_tab')
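
The `import platform` added above feeds the OS check in the `Aligner` hunk further down. Separately, the module still calls `nltk.download(...)` unconditionally at import time; a minimal quieter sketch (`quiet=True` is a real NLTK flag, but using it here is an assumption, not part of this commit):

import nltk

# Fetch the punkt tokenizer data without per-run progress chatter.
# Newer NLTK releases need 'punkt_tab' alongside 'punkt'.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
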
@@ -22,21 +23,22 @@ from subprocess import Popen, PIPE
 from itertools import groupby
 import fileinput
 
-ip="192.168.20.216"
-port="8000"
+ip = "192.168.20.216"
+port = "8000"
 
-def translate(text, ip, port):
 
+def translate(text, ip, port):
     myobj = {
-
-
-
+        'id': '1',
+        'src': text,
+    }
     port = str(int(port))
     url = 'http://' + ip + ':' + port + '/translate'
-    x = requests.post(url, json
+    x = requests.post(url, json=myobj)
     json_response = json.loads(x.text)
     return json_response['tgt']
 
+
 # Class to align original and translated sentences
 # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
 class Aligner():
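
The rewritten `translate` helper posts an `{'id', 'src'}` payload to the server's /translate route and returns the `tgt` field of the JSON reply. A minimal usage sketch, assuming an MTUOC-style translation server is already listening on the module defaults above:

# Hypothetical call; requires a reachable server at http://192.168.20.216:8000
translated = translate("Hello, world!", ip="192.168.20.216", port="8000")
print(translated)  # the 'tgt' field of the server's JSON response
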
@@ -50,12 +52,19 @@ class Aligner():
         self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
         self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")
 
+        if platform.system().lower() == "windows":
+            fastalign_bin = "fast_align.exe"
+            atools_bin = "atools.exe"
+        else:
+            fastalign_bin = "./fast_align"
+            atools_bin = "./atools"
+
         self.forward_command = lambda \
-            x: f'
+            x: f'{fastalign_bin} -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
         self.reverse_command = lambda \
-            x: f'
+            x: f'{fastalign_bin} -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'
 
-        self.symmetric_command = f'
+        self.symmetric_command = f'{atools_bin} -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'
 
     def __simplify_alignment_file(self, file):
         with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
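
The new platform check hardcodes the two binary names and assumes both executables sit in the working directory on Linux. A more defensive sketch (an alternative, not what this commit does; `shutil.which` is standard library) that fails fast when fast_align is missing:

import os
import platform
import shutil

def find_binary(name):
    # Prefer a copy in the working directory, then fall back to PATH.
    exe = name + ".exe" if platform.system().lower() == "windows" else name
    local = os.path.join(".", exe)
    if os.path.isfile(local):
        return local
    found = shutil.which(exe)
    if found is None:
        raise FileNotFoundError(f"{exe} not found; build fast_align/atools and put them on PATH")
    return found

fastalign_bin = find_binary("fast_align")
atools_bin = find_binary("atools")
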
@@ -131,39 +140,6 @@ def extract_paragraphs_with_runs(doc):
     return paragraphs_with_runs
 
 
-def tokenize_paragraph_with_runs2(runs_in_paragraph):
-    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
-    sentences = sent_tokenize(text_paragraph)
-    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
-
-    tokenized_sentences_with_style = []
-    for tokenized_sentence in tokenized_sentences:
-        tokenized_sentence_with_style = []
-        token_idx = 0
-        for run in runs_in_paragraph:
-            text_in_run = run["text"].strip()
-
-            if text_in_run == tokenized_sentence[token_idx]:
-                new_run = run.copy()
-                new_run["text"] = text_in_run
-                tokenized_sentence_with_style.append(new_run)
-                token_idx += 1
-                if token_idx >= len(tokenized_sentence):
-                    break
-            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
-                if text_in_run.startswith(tokenized_sentence[token_idx]):
-                    for token in word_tokenize(text_in_run):
-                        if token == tokenized_sentence[token_idx]:
-                            new_run = run.copy()
-                            new_run["text"] = token
-                            tokenized_sentence_with_style.append(new_run)
-                            token_idx += 1
-            else:
-                raise "oops"
-        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
-    return tokenized_sentences_with_style
-
-
 def tokenize_with_runs(runs, detokenizer):
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
     sentences = sent_tokenize(text_paragraph)
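
For context on the deletion above: `tokenize_paragraph_with_runs2` re-attached run formatting to tokens by exact string comparison and fell over (with an invalid `raise "oops"`) whenever a run boundary split a token, which is why the detokenizer-based `tokenize_with_runs` survives instead. Both lean on the same NLTK tokenizers; a small self-contained illustration of their behavior:

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt', quiet=True)  # tokenizer models, as in the module above
nltk.download('punkt_tab', quiet=True)

text = "Dr. Smith arrived. He was late."
print(sent_tokenize(text))  # ['Dr. Smith arrived.', 'He was late.']
print(word_tokenize(text))  # ['Dr.', 'Smith', 'arrived', '.', 'He', 'was', 'late', '.']
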
@@ -194,7 +170,7 @@ def tokenize_with_runs(runs, detokenizer):
                 word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                 token_index += 1
             else:
-                raise "
+                raise "Something unexpected happened I'm afraid"
         tokenized_sentences_with_style.append(sentence_with_style)
     return tokenized_sentences_with_style
 
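
One caveat on the reworded error above: `raise` with a string is not valid Python 3, so this line actually produces `TypeError: exceptions must derive from BaseException` instead of the intended message. A sketch of the conventional fix (not part of this commit), reusing names already in scope in that loop (`tokens_with_style`, `token_index`, `word_left`):

            else:
                # Raise a real exception so the caller sees what failed and where.
                raise RuntimeError(
                    f"Unexpected token while aligning runs: "
                    f"{tokens_with_style[token_index]['text']!r} is not a prefix of {word_left!r}"
                )
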
@@ -311,8 +287,7 @@ def preprocess_runs(runs_in_paragraph):
     return new_runs
 
 
-
-def translate_document(input_file,
+def translate_document(input_file,
                        aligner,
                        detokenizer,
                        ip="192.168.20.216",
@@ -322,7 +297,7 @@ def translate_document(input_file,
     # load original file, extract the paragraphs with their runs (which include style and formatting)
     doc = Document(input_file)
     paragraphs_with_runs = extract_paragraphs_with_runs(doc)
-
+
     # translate each paragraph
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
@@ -333,13 +308,17 @@ def translate_document(input_file,
 
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
+    print("Generating alignments...")
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                           translated_paragraphs, aligner,
                                                           temp_folder, detokenizer)
+    print("Finished alignments")
+
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
+    print("Grouped by style")
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = defaultdict(list)
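
The commit's "more logs" are bare `print` calls. A sketch of the same progress messages routed through the standard `logging` module instead (an alternative, not what the commit does):

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

log.info("Generating alignments...")
# ... generate_alignments(...) ...
log.info("Finished alignments")
log.info("Grouped by style")
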
@@ -365,4 +344,5 @@ def translate_document(input_file,
             run.font.color.rgb = item['font_color']
 
     out_doc.save("translated.docx")
+    print("Saved file")
     return "translated.docx"
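
Note that `translate_document` still writes to a fixed translated.docx in the current directory, so concurrent runs overwrite each other. A sketch of deriving the output name from the input path (an assumption about desired behavior, not part of this commit):

import os

def output_path_for(input_file, suffix="_translated"):
    # "report.docx" -> "report_translated.docx"
    root, ext = os.path.splitext(input_file)
    return root + suffix + ext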