changed nltk tokenizer to multilingual tokenizers
src/translate_any_doc.py  +55 -19  CHANGED

@@ -7,14 +7,17 @@ import re
 
 from src.aligner import Aligner
 
-import nltk
 import glob
-from …
-import …
+from sacremoses import MosesTokenizer, MosesDetokenizer
+import spacy
 
-
-nltk.download('punkt_tab')
+import tqdm
 
+# Load multilingual model to use as sentence tokenizer
+spacy_nlp = spacy.load("xx_ent_wiki_sm")
+# Add the rule-based sentencizer
+if "sentencizer" not in spacy_nlp.pipe_names:
+    spacy_nlp.add_pipe("sentencizer")
 
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:

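The `xx_ent_wiki_sm` model ships without a dependency parser, so the rule-based `sentencizer` added above is what makes `doc.sents` available. A minimal sketch of how the new sentence splitter behaves (illustrative only, not part of the commit; it assumes the model has been installed with `python -m spacy download xx_ent_wiki_sm`):

```python
import spacy

# Multilingual NER model: no parser, so sentence boundaries come from the
# rule-based sentencizer (splits on ., !, ? by default).
nlp = spacy.load("xx_ent_wiki_sm")
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

doc = nlp("Hola món. Això és una prova.")
print([sent.text for sent in doc.sents])
# expected: ['Hola món.', 'Això és una prova.']
```
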
@@ -117,26 +120,53 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
 
     return runs
 
+def tokenize_text(text, tokenizer):
+    # To avoid the tokenizer destroying the url
+    def preserve_urls(text):
+        url_pattern = r'https?://[^\s\)\]\}\>]+|www\.[^\s\)\]\}\>]+'
+        # Find URLs using regex and replace them with a placeholder
+        urls = re.findall(url_pattern, text)
+        for idx, url in enumerate(urls):
+            placeholder = f"URL{idx}"
+            text = text.replace(url, placeholder)
+
+        return text, urls
+
+    # Replace URLs with placeholders
+    text, urls = preserve_urls(text)
+
+    # Tokenize using Sacremoses
+    tokens = tokenizer.tokenize(text)
+
+    # Revert placeholders back to original URLs
+    for idx, url in enumerate(urls):
+        placeholder = f"URL{idx}"
+        tokens = [token.replace(placeholder, url) for token in tokens]
 
-def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dict[str, str]]]:
+    return tokens
+
+def tokenize_with_runs(runs: list[dict[str, str]], tokenizer, detokenizer) -> list[list[dict[str, str]]]:
     """
     Given a list of runs, we need to tokenize them by sentence and token while keeping the style of each token according
     to its original run
 
     Parameters:
         runs: List of runs, where each item is a chunk of text (possibly various tokens) and some style/formatting information
-        …
+        source_lang: Language of the document
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each token contains the style of its original run
     """
+
+    # it's a bit of a mess but first we get the tokenized sentences
     text_paragraph = detokenizer.detokenize([run["text"] for run in runs])
-    sentences = …
-    tokenized_sentences = [ …
+    sentences = spacy_nlp(text_paragraph).sents
+    tokenized_sentences = [tokenize_text(sentence.text, tokenizer) for sentence in sentences]
 
+    # then we assign a run/style to each token
     tokens_with_style = []
     for run in runs:
-        tokens = …
+        tokens = tokenize_text(run["text"], tokenizer)
         if tokens:
             for token in tokens:
                 tokens_with_style.append(run.copy())

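A small usage sketch for the `tokenize_text` helper introduced above (not part of the commit). Sacremoses on its own may break a URL into several tokens; the placeholder round trip keeps it as one token:

```python
from sacremoses import MosesTokenizer

mt = MosesTokenizer(lang="en")
text = "Read the guide at https://example.com/docs?id=1 before translating."

print(mt.tokenize(text))        # the plain tokenizer may split the URL into pieces
print(tokenize_text(text, mt))  # the helper swaps URLs for placeholders, tokenizes,
                                # then restores each URL as a single token
```

Note that the `URL{idx}` placeholder scheme assumes the source text does not already contain strings of that form.
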
@@ -144,6 +174,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
         else:
             tokens_with_style.append(run.copy())
 
+    # and finally we combine both things, where each token of each sentence is assigned a run/style
     token_index = 0
     tokenized_sentences_with_style = []
     for sentence in tokenized_sentences:

@@ -169,7 +200,7 @@ def tokenize_with_runs(runs: list[dict[str, str]], detokenizer) -> list[list[dic
 
 def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]],
                         translated_paragraphs: list[str], aligner, temp_folder: str,
-                        …
+                        source_tokenizer, source_detokenizer, target_tokenizer) -> list[list[dict[str, str]]]:
     """
     Given some original paragraphs with style and formatting and its translation without formatting, try to match
     the translated text formatting with the original. Since we only want to run fastalign once we have to temporarily

@@ -181,7 +212,8 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         translated_paragraphs: Translated text, split into paragraphs
         aligner: Object of the aligner class, uses fastalign
         temp_folder: Path to folder where to put all the intermediate files
-        …
+        source_lang: original language of the document
+        target_lang: target language of the translation
 
     Returns:
         list[list[dict]]: A list of tokenized sentences where each translated token contains the style of the associated

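The docstrings above talk about "paragraphs with runs"; judging from the keys used later in this file (`id`, `paragraph_index`, `text`), a single paragraph looks roughly like the sketch below (field values are made up for illustration):

```python
# One paragraph = a list of runs; each run is a chunk of text that shares
# formatting, plus the metadata needed to map it back into the document.
paragraph_with_runs = [
    {"id": "run1", "paragraph_index": 0, "text": "The quick "},
    {"id": "run2", "paragraph_index": 0, "text": "brown fox"},  # e.g. a bold run
    {"id": "run3", "paragraph_index": 0, "text": " jumps."},
]
```
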
@@ -192,7 +224,7 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
         os.remove(f)
 
     # tokenize the original text by sentence and words while keeping the style
-    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, …
+    original_tokenized_sentences_with_style = [tokenize_with_runs(runs, source_tokenizer, source_detokenizer) for runs in
                                                original_paragraphs_with_runs]
 
     # flatten all the runs so we can align with just one call instead of one per paragraph

@@ -200,9 +232,9 @@ def generate_alignments(original_paragraphs_with_runs: list[list[dict[str, str]]
                                                 sublist]
 
     # tokenize the translated text by sentence and word
-    translated_tokenized_sentences = [ …
+    translated_tokenized_sentences = [tokenize_text(sentence.text, target_tokenizer) for
                                       translated_paragraph in translated_paragraphs for sentence in
-                                      …
+                                      spacy_nlp(translated_paragraph).sents]
 
     assert len(translated_tokenized_sentences) == len(
         original_tokenized_sentences_with_style), "The original and translated texts contain a different number of sentence, likely due to a translation error"

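The assert above relies on the source and target producing the same number of sentences, because the aligner consumes parallel sentence pairs. For reference, fast_align reads one pair per line in `source ||| target` form; a hedged sketch of that serialization (the repo's `Aligner` class may handle this differently):

```python
def write_fastalign_input(src_sentences, tgt_sentences, path):
    """Write token lists as 'source tokens ||| target tokens', one pair per line."""
    with open(path, "w", encoding="utf-8") as f:
        for src_tokens, tgt_tokens in zip(src_sentences, tgt_sentences):
            f.write(f"{' '.join(src_tokens)} ||| {' '.join(tgt_tokens)}\n")
```
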
@@ -329,7 +361,6 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
 def translate_document(input_file: str, source_lang: str, target_lang: str,
                        translator,
                        aligner: Aligner,
-                       detokenizer,
                        temp_folder: str = "tmp",
                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
     input_filename = input_file.split("/")[-1]

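With the `detokenizer` parameter gone, callers only provide a translator and an aligner. A hypothetical call site after this change (the `translator` and `aligner` objects are the repo's own and are only named here for illustration, as is the file path):

```python
output_file = translate_document("docs/report.docx", "en", "ca",
                                 translator=translator,
                                 aligner=aligner,
                                 temp_folder="tmp")
```
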
@@ -340,6 +371,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
     plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
 
+    source_tokenizer = MosesTokenizer(lang=source_lang)
+    source_detokenizer = MosesDetokenizer(lang=source_lang)
+    target_tokenizer = MosesTokenizer(lang=target_lang)
+    target_detokenizer = MosesDetokenizer(lang=target_lang)
+
     # get paragraphs with runs
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]

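A quick round trip with the sacremoses objects created above, to show the division of labour: the tokenizer feeds the aligner, the detokenizer rebuilds readable text (illustrative, not part of the commit):

```python
from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer(lang="en")
md = MosesDetokenizer(lang="en")

tokens = mt.tokenize("Formatting survives the round trip, doesn't it?")
text = md.detokenize(tokens)  # punctuation and spacing are re-attached
```

By default sacremoses escapes characters like `&` and `'` into XML entities (`escape=True`); whether that matters here depends on how the tokens are consumed downstream.
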
@@ -347,21 +383,21 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     # translate using plaintext file
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
-        paragraph_text = …
+        paragraph_text = source_detokenizer.detokenize([run["text"] for run in paragraph])
         translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
 
     # time to align the translation with the original
     print("Generating alignments...")
     start_time = time.time()
     translated_sentences_with_style = generate_alignments(paragraphs_with_runs, translated_paragraphs, aligner,
-                                                           temp_folder, …
+                                                           temp_folder, source_tokenizer, source_detokenizer, target_tokenizer)
     print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
 
     # group the tokens by style/run
-    translated_runs_with_style = group_by_style(translated_tokens_with_style, …
+    translated_runs_with_style = group_by_style(translated_tokens_with_style, target_detokenizer)
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in

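Taken together, the commit swaps nltk's punkt-based splitting for a language-agnostic spaCy sentencizer plus per-language Moses tokenizers. The removed nltk calls are truncated in this view, so the "before" half of the sketch below is an assumption based on the deleted `nltk.download('punkt_tab')` line; the "after" half mirrors the names added above:

```python
# Before (assumed): punkt-based nltk tokenization
import nltk
nltk.download('punkt_tab')
sentences = nltk.sent_tokenize(paragraph_text)
tokenized = [nltk.word_tokenize(s) for s in sentences]

# After: spaCy multilingual sentencizer + sacremoses word tokenizer
sentences = [s.text for s in spacy_nlp(paragraph_text).sents]
tokenized = [tokenize_text(s, source_tokenizer) for s in sentences]
```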