mjuvilla committed
Commit ab66871 · 1 Parent(s): 85b75e6
Files changed (2)
  1. src/aligner.py +1 -0
  2. src/translate_any_doc.py +48 -2
src/aligner.py CHANGED
@@ -7,6 +7,7 @@ from subprocess import Popen, PIPE
 # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
 class Aligner():
     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+        os.makedirs(temp_folder, exist_ok=True)
         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
 
 
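The added line makes the constructor safe to run when the temp folder does not yet exist. A minimal sketch of the behavior (the path and file name here are hypothetical):

import os

temp_folder = "tmp/alignments"           # hypothetical path
os.makedirs(temp_folder, exist_ok=True)  # creates missing parent dirs too
os.makedirs(temp_folder, exist_ok=True)  # idempotent: a second call does not raise

# without the makedirs call, writing a temp file here would raise FileNotFoundError
with open(os.path.join(temp_folder, "fwd.align"), "w") as f:  # hypothetical file name
    f.write("0-0 1-1\n")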
src/translate_any_doc.py CHANGED
@@ -24,6 +24,24 @@ if "sentencizer" not in spacy_nlp.pipe_names:
     spacy_nlp.add_pipe("sentencizer")
 
 
+import unicodedata
+
+def remove_invisible(text):
+    return ''.join(
+        c for c in text
+        if not unicodedata.category(c) in ['Zs', 'Cc', 'Cf']
+    )
+
+def get_leading_invisible(text):
+    i = 0
+    while i < len(text):
+        c = text[i]
+        if unicodedata.category(c) in ['Zs', 'Cc', 'Cf']:
+            i += 1
+        else:
+            break
+    return text[:i]
+
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
     """
@@ -192,6 +210,17 @@ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str,
             flat_tokens_with_style.append(item)
             flat_spaces_with_style.append(flat_spaces[token_idx])
             token_idx += 1
+        elif remove_invisible(run["text"]).startswith(flat_tokens[token_idx]):
+
+            leading_invisible = get_leading_invisible(run["text"])
+            run["text"] = run["text"][len(leading_invisible + flat_tokens[token_idx]):]
+            if flat_spaces[token_idx]:
+                run["text"] = run["text"].lstrip()
+            item = run.copy()
+            item["text"] = leading_invisible + flat_tokens[token_idx]
+            flat_tokens_with_style.append(item)
+            flat_spaces_with_style.append(flat_spaces[token_idx])
+            token_idx += 1
         elif flat_tokens[token_idx].startswith(run["text"]):
             subtoken = flat_tokens[token_idx][:len(run["text"])]
             item = run.copy()
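A hand-trace of the first new branch with made-up data, assuming the helpers above are in scope: a run whose text begins with an invisible character would previously fail the exact startswith check; here the invisible prefix stays attached to the styled token so no characters are dropped.

run = {"text": "\u200bHello world", "style": "bold"}  # made-up run
flat_tokens, flat_spaces, token_idx = ["Hello", "world"], [True, False], 0

# remove_invisible("\u200bHello world") == "Helloworld", which starts with "Hello"
leading_invisible = get_leading_invisible(run["text"])  # "\u200b"
run["text"] = run["text"][len(leading_invisible + flat_tokens[token_idx]):]  # " world"
if flat_spaces[token_idx]:
    run["text"] = run["text"].lstrip()  # "world" is left for the next token
item = run.copy()
item["text"] = leading_invisible + flat_tokens[token_idx]  # "\u200bHello"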
@@ -200,7 +229,15 @@ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str,
             flat_spaces_with_style.append(False)
             flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
             run["text"] = run["text"][len(subtoken):]
-
+        elif flat_tokens[token_idx].startswith(remove_invisible(run["text"])):
+            flat_tokens[token_idx] = flat_tokens[token_idx][len(remove_invisible(run["text"])):]
+            item = run.copy()
+            item["text"] = run["text"]
+            flat_tokens_with_style.append(item)
+            flat_spaces_with_style.append(flat_spaces[token_idx])
+            run["text"] = ""
+        else:
+            raise Exception(f"Something unexpected happened")
     # reconstruct the sentences
     token_idx = 0
     tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
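The second new branch is the mirror case, again traced with made-up data: the run's visible text is only a prefix of the current token (for instance when the run ends in an invisible character), so the run is consumed whole, its invisible characters stay with its style, and the rest of the token waits for the next run.

run = {"text": "Hel\u200b", "style": "italic"}  # made-up run
flat_tokens, token_idx = ["Hello"], 0

# remove_invisible("Hel\u200b") == "Hel", and "Hello".startswith("Hel")
flat_tokens[token_idx] = flat_tokens[token_idx][len("Hel"):]  # "lo" remains
item = run.copy()
item["text"] = run["text"]  # keeps the trailing "\u200b" with this style run
run["text"] = ""            # fully consumed; "lo" is matched against the next run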
@@ -217,6 +254,16 @@ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str,
                 sentence_with_style.append(flat_tokens_with_style[token_idx])
                 sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
                 token_idx += 1
+            elif token == remove_invisible(flat_tokens_with_style[token_idx]["text"]):
+                sentence_with_style.append(flat_tokens_with_style[token_idx])
+                sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                token_idx += 1
+            elif token.startswith(remove_invisible(flat_tokens_with_style[token_idx]["text"])):
+                while token:
+                    token = token[len(remove_invisible(flat_tokens_with_style[token_idx]["text"])):]
+                    sentence_with_style.append(remove_invisible(flat_tokens_with_style[token_idx]["text"]))
+                    sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                    token_idx += 1
             else:
                 print(token)
                 print(sentence)
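In the sentence-reconstruction pass, the two new elif branches accept matches modulo invisible characters. With made-up data, a spaCy token split across two styled fragments is reassembled by the inner while loop:

token = "Hello"
flat_tokens_with_style = [{"text": "Hel\u200b"}, {"text": "lo"}]  # made-up fragments
token_idx = 0

# token != remove_invisible("Hel\u200b") == "Hel", but it starts with "Hel",
# so the while loop peels one fragment per iteration:
# "Hello" -> "lo" -> "", leaving token_idx == 2 once the token is exhausted.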
@@ -407,7 +454,6 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
                 break
             except AppError as e:
                 print(e)
-                sys.exit()
 
             pbar.update(1)
             percent_complete = int(((i + 1) / total) * 100)
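With sys.exit() removed, an AppError on one request is now logged and the loop carries on instead of aborting the whole document. A simplified control-flow sketch (the segments, max_retries, and translate names are hypothetical):

for i, segment in enumerate(segments):       # hypothetical iterable
    for attempt in range(max_retries):       # hypothetical retry loop
        try:
            translated = translate(segment)  # hypothetical call
            break                            # success: stop retrying
        except AppError as e:
            print(e)                         # previously: sys.exit() here
    pbar.update(1)
    percent_complete = int(((i + 1) / total) * 100)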
 