mjuvilla committed
Commit ab66871 · 1 Parent(s): 85b75e6
Files changed (2)
  1. src/aligner.py +1 -0
  2. src/translate_any_doc.py +48 -2
src/aligner.py CHANGED
@@ -7,6 +7,7 @@ from subprocess import Popen, PIPE
 # based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
 class Aligner():
     def __init__(self, config_folder, source_lang, target_lang, temp_folder):
+        os.makedirs(temp_folder, exist_ok=True)
         forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
         reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")
 
 
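The added line makes the constructor safe to run when the temp folder does not yet exist. A minimal sketch of the behavior (the path and file name here are hypothetical):

import os

temp_folder = "tmp/alignments"           # hypothetical path
os.makedirs(temp_folder, exist_ok=True)  # creates missing parent dirs too
os.makedirs(temp_folder, exist_ok=True)  # idempotent: a second call does not raise

# without the makedirs call, writing a temp file here would raise FileNotFoundError
with open(os.path.join(temp_folder, "fwd.align"), "w") as f:  # hypothetical file name
    f.write("0-0 1-1\n")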
src/translate_any_doc.py CHANGED
@@ -24,6 +24,24 @@ if "sentencizer" not in spacy_nlp.pipe_names:
     spacy_nlp.add_pipe("sentencizer")
 
 
+import unicodedata
+
+def remove_invisible(text):
+    return ''.join(
+        c for c in text
+        if not unicodedata.category(c) in ['Zs', 'Cc', 'Cf']
+    )
+
+def get_leading_invisible(text):
+    i = 0
+    while i < len(text):
+        c = text[i]
+        if unicodedata.category(c) in ['Zs', 'Cc', 'Cf']:
+            i += 1
+        else:
+            break
+    return text[:i]
+
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
     """
@@ -192,6 +210,17 @@ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str,
             flat_tokens_with_style.append(item)
             flat_spaces_with_style.append(flat_spaces[token_idx])
             token_idx += 1
+        elif remove_invisible(run["text"]).startswith(flat_tokens[token_idx]):
+
+            leading_invisible = get_leading_invisible(run["text"])
+            run["text"] = run["text"][len(leading_invisible + flat_tokens[token_idx]):]
+            if flat_spaces[token_idx]:
+                run["text"] = run["text"].lstrip()
+            item = run.copy()
+            item["text"] = leading_invisible + flat_tokens[token_idx]
+            flat_tokens_with_style.append(item)
+            flat_spaces_with_style.append(flat_spaces[token_idx])
+            token_idx += 1
         elif flat_tokens[token_idx].startswith(run["text"]):
             subtoken = flat_tokens[token_idx][:len(run["text"])]
             item = run.copy()
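A hand-trace of the first new branch with made-up data, assuming the helpers above are in scope: a run whose text begins with an invisible character would previously fail the exact startswith check; here the invisible prefix stays attached to the styled token so no characters are dropped.

run = {"text": "\u200bHello world", "style": "bold"}  # made-up run
flat_tokens, flat_spaces, token_idx = ["Hello", "world"], [True, False], 0

# remove_invisible("\u200bHello world") == "Helloworld", which starts with "Hello"
leading_invisible = get_leading_invisible(run["text"])  # "\u200b"
run["text"] = run["text"][len(leading_invisible + flat_tokens[token_idx]):]  # " world"
if flat_spaces[token_idx]:
    run["text"] = run["text"].lstrip()  # "world" is left for the next token
item = run.copy()
item["text"] = leading_invisible + flat_tokens[token_idx]  # "\u200bHello"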
@@ -200,7 +229,15 @@ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str,
             flat_spaces_with_style.append(False)
             flat_tokens[token_idx] = flat_tokens[token_idx][len(run["text"]):]
             run["text"] = run["text"][len(subtoken):]
-
+        elif flat_tokens[token_idx].startswith(remove_invisible(run["text"])):
+            flat_tokens[token_idx] = flat_tokens[token_idx][len(remove_invisible(run["text"])):]
+            item = run.copy()
+            item["text"] = run["text"]
+            flat_tokens_with_style.append(item)
+            flat_spaces_with_style.append(flat_spaces[token_idx])
+            run["text"] = ""
+        else:
+            raise Exception(f"Something unexpected happened")
     # reconstruct the sentences
     token_idx = 0
     tokenized_sentences_with_style, tokenized_sentences_spaces_with_style = [], []
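The second new branch is the mirror case, again traced with made-up data: the run's visible text is only a prefix of the current token (for instance when the run ends in an invisible character), so the run is consumed whole, its invisible characters stay with its style, and the rest of the token waits for the next run.

run = {"text": "Hel\u200b", "style": "italic"}  # made-up run
flat_tokens, token_idx = ["Hello"], 0

# remove_invisible("Hel\u200b") == "Hel", and "Hello".startswith("Hel")
flat_tokens[token_idx] = flat_tokens[token_idx][len("Hel"):]  # "lo" remains
item = run.copy()
item["text"] = run["text"]  # keeps the trailing "\u200b" with this style run
run["text"] = ""            # fully consumed; "lo" is matched against the next run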
@@ -217,6 +254,16 @@ def tokenize_with_runs(runs: list[dict[str, str]]) -> tuple[list[list[dict[str,
                 sentence_with_style.append(flat_tokens_with_style[token_idx])
                 sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
                 token_idx += 1
+            elif token == remove_invisible(flat_tokens_with_style[token_idx]["text"]):
+                sentence_with_style.append(flat_tokens_with_style[token_idx])
+                sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                token_idx += 1
+            elif token.startswith(remove_invisible(flat_tokens_with_style[token_idx]["text"])):
+                while token:
+                    token = token[len(remove_invisible(flat_tokens_with_style[token_idx]["text"])):]
+                    sentence_with_style.append(remove_invisible(flat_tokens_with_style[token_idx]["text"]))
+                    sentence_spaces_with_style.append(flat_spaces_with_style[token_idx])
+                    token_idx += 1
             else:
                 print(token)
                 print(sentence)
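In the sentence-reconstruction pass, the two new elif branches accept matches modulo invisible characters. With made-up data, a spaCy token split across two styled fragments is reassembled by the inner while loop:

token = "Hello"
flat_tokens_with_style = [{"text": "Hel\u200b"}, {"text": "lo"}]  # made-up fragments
token_idx = 0

# token != remove_invisible("Hel\u200b") == "Hel", but it starts with "Hel",
# so the while loop peels one fragment per iteration:
# "Hello" -> "lo" -> "", leaving token_idx == 2 once the token is exhausted.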
@@ -407,7 +454,6 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
                 break
             except AppError as e:
                 print(e)
-                sys.exit()
 
             pbar.update(1)
             percent_complete = int(((i + 1) / total) * 100)
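With sys.exit() removed, an AppError on one request is now logged and the loop carries on instead of aborting the whole document. A simplified control-flow sketch (the segments, max_retries, and translate names are hypothetical):

for i, segment in enumerate(segments):       # hypothetical iterable
    for attempt in range(max_retries):       # hypothetical retry loop
        try:
            translated = translate(segment)  # hypothetical call
            break                            # success: stop retrying
        except AppError as e:
            print(e)                         # previously: sys.exit() here
    pbar.update(1)
    percent_complete = int(((i + 1) / total) * 100)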
 