mjuvilla committed
Commit 0142045 · 1 Parent(s): 80b995d

separated the translator that uses the HF endpoint

Dockerfile CHANGED
@@ -1,16 +1,4 @@
 FROM python:3.12-slim
-## Set up a new user named "user" with user ID 1000
-#RUN useradd -m -u 1000 user
-#
-## Switch to the "user" user
-#USER user
-#
-## Set home to the user's home directory
-#ENV HOME=/home/user \
-#    PATH=/home/user/.local/bin:$PATH
-
-## Set the working directory to the user's home directory
-#WORKDIR $HOME/app
 
 COPY src ./src
 COPY scripts ./scripts

gradio_app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
-from src.translate_any_doc import translate_document
-from src.salamandraTA7b_translator import SalamandraTA7bTranslator
+from src.salamandraTA7b_translator_HF import SalamandraTA7bTranslatorHF
 from src.aligner import Aligner
 import os
 
@@ -12,9 +11,9 @@ def upload_file(filepath, source_lang, target_lang, user_token):
     hf_token = os.environ.get('HF_TOKEN')
     if user_token:
         hf_token = user_token
-    translator = SalamandraTA7bTranslator(hf_token)
+    translator = SalamandraTA7bTranslatorHF(hf_token)
     aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)
-    for status, translated_file_name in translate_document(filepath, source_lang, target_lang, translator, aligner):
+    for status, translated_file_name in translator.translate_document(filepath, source_lang, target_lang, aligner):
         if translated_file_name:  # finished
             yield [gr.UploadButton(visible=False),
                    gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name,

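For orientation, a minimal sketch (not part of the commit) of how the refactored handler drives the new class; the file path, language codes, and Aligner arguments are placeholders, and HF_TOKEN is assumed to be set in the environment:

import os

from src.aligner import Aligner
from src.salamandraTA7b_translator_HF import SalamandraTA7bTranslatorHF

# hypothetical standalone run mirroring upload_file() above
translator = SalamandraTA7bTranslatorHF(os.environ.get("HF_TOKEN"))
aligner = Aligner("config", "en", "ca", "tmp")  # placeholder config folder, languages, temp folder

# translate_document() is a generator: it yields (status, None) while working
# and ("", output_path) once the translated document has been written
for status, translated_file in translator.translate_document("sample.docx", "en", "ca", aligner):
    if translated_file:
        print(f"saved: {translated_file}")
    else:
        print(status)
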
requirements.txt CHANGED
@@ -1,8 +1,12 @@
 iso-639~=0.4.5
-protobuf~=6.30.2
 requests~=2.32.3
 tqdm~=4.67.1
 gradio~=5.25.1
+spacy~=3.8.6
 gradio_client~=1.8.0
-setuptools~=80.0.0
-spacy~=3.8.6
+pandas~=2.3.3
+beautifulsoup4~=4.14.2
+transformers~=4.57.1
+torch~=2.8.0
+huggingface-hub~=0.36.0
+vllm~=0.11.0

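Note that the renamed module below imports `from iso639 import languages` (provided by the iso-639 package kept above) to turn language codes into the names the demo Space expects. A quick sketch of that lookup:

from iso639 import languages

# map an ISO 639-1 (alpha-2) code to its English name,
# as SalamandraTA7bTranslatorHF.translate() does below
print(languages.get(alpha2="en").name)  # English
print(languages.get(alpha2="ca").name)  # Catalan
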
src/{translate_any_doc.py → salamandraTA7b_translator_HF.py} RENAMED
@@ -1,6 +1,6 @@
 import shutil
 import string
-import sys
+from iso639 import languages
 import time
 import os
 from itertools import groupby
@@ -23,15 +23,154 @@ spacy_nlp = spacy.load("xx_ent_wiki_sm")
 if "sentencizer" not in spacy_nlp.pipe_names:
     spacy_nlp.add_pipe("sentencizer")
 
-
 import unicodedata
 
+
+class SalamandraTA7bTranslatorHF:
+    def __init__(self, hf_token):
+        from gradio_client import Client
+        self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=hf_token)
+
+    def translate(self, text, source_lang, target_lang):
+        if not text:
+            return ""
+
+        # we assume the languages are given as codes, so convert them to names
+        lang1 = languages.get(alpha2=source_lang).name
+        lang2 = languages.get(alpha2=target_lang).name
+        result = self.client.predict(
+            task="Translation",
+            source=lang1,
+            target=lang2,
+            input_text=text,
+            mt_text=None,
+            api_name="/generate_output"
+        )
+        return result[0]
+
+    def translate_document(self, input_file: str, source_lang: str, target_lang: str,
+                           aligner: Aligner,
+                           temp_folder: str = "tmp",
+                           tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> (str, str):
+        input_filename = input_file.split("/")[-1]
+        os.makedirs(temp_folder, exist_ok=True)
+
+        # copy the original file to the temporary folder to avoid common issues with tikal
+        temp_input_file = os.path.join(temp_folder, input_filename)
+        shutil.copy(input_file, temp_input_file)
+
+        original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
+        plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder,
+                                            original_xliff_file)
+
+        # get paragraphs with runs
+        paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
+                                enumerate(open(plain_text_file).readlines())]
+
+        # translate using the plaintext file
+        original_tokenized_sentences_with_style = []
+        original_spacing = []
+        for run in paragraphs_with_runs:
+            tokens, spaces = tokenize_with_runs(run)
+            original_tokenized_sentences_with_style += tokens
+            original_spacing += spaces
+
+        translated_sentences = []
+        yield "Translating 0%...", None
+        total = len(original_tokenized_sentences_with_style)
+        pbar = tqdm.tqdm(desc="Translating paragraphs...", total=total)
+
+        for i, (sentence, spacing) in enumerate(zip(original_tokenized_sentences_with_style, original_spacing)):
+            text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
+
+            while True:
+                try:
+                    translated_sentences.append(self.translate(text, source_lang, target_lang))
+                    break
+                except AppError as e:
+                    print(e)
+
+            pbar.update(1)
+            percent_complete = int(((i + 1) / total) * 100)
+            yield f"Translating {percent_complete}%...", None
+
+        # time to align the translation with the original
+        print("Generating alignments...")
+        yield "Aligning...", None
+        start_time = time.time()
+        translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
+            original_tokenized_sentences_with_style,
+            translated_sentences, aligner,
+            temp_folder)
+        print(f"Finished alignments in {time.time() - start_time} seconds")
+
+        # since we tokenized these sentences independently, the spacing information does not contain spaces after
+        # punctuation at the end of a sentence (there is no space after a sentence-final "." unless another
+        # sentence follows right after)
+        for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
+            if sentence[-1]["text"] in string.punctuation:
+                sentence_spaces[-1] = True
+
+        # flatten the sentences into a list of tokens
+        translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
+        tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
+
+        # group the tokens by style/run
+        translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
+
+        # group the runs by original paragraph
+        translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
+                                            range(len(paragraphs_with_runs))}
+
+        for item in translated_runs_with_style:
+            # first item in the paragraph: remove the leading blank space introduced in group_by_style(), where we
+            # didn't yet know where paragraphs started and ended
+            if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
+                first_item_in_paragraph = item.copy()
+                first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
+                translated_paragraphs_with_style[item['paragraph_index']] = []
+                translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
+            else:
+                translated_paragraphs_with_style[item['paragraph_index']].append(item)
+
+        # save to a new plain text file
+        translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
+        runs_to_plain_text(translated_paragraphs_with_style, translated_moses_file)
+
+        # put the translations into the xlf
+        tikal_moses_to_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl",
+                                        source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
+                                        "-noalttrans", "-to", original_xliff_file]
+        Popen(tikal_moses_to_xliff_command).wait()
+
+        # any tags that are still <g> have not been paired between the original and translated texts by tikal, so
+        # we remove them. This may happen when a word in the original language is split into more than one word
+        # with other words in between, or because of an error in fastalign
+        text = open(original_xliff_file).read()
+        result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
+        open(original_xliff_file, "w").write(result)
+
+        # merge into a docx again
+        tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
+        final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = final_process.communicate()
+        final_process.wait()
+
+        # get the path to the output file
+        output = stdout.decode('utf-8')
+        translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
+
+        print(f"Saved file in {translated_file_path}")
+        yield "", translated_file_path
+
+
 def remove_invisible(text):
     return ''.join(
         c for c in text
         if not unicodedata.category(c) in ['Zs', 'Cc', 'Cf']
     )
 
+
 def get_leading_invisible(text):
     i = 0
     while i < len(text):
@@ -42,6 +181,7 @@ def get_leading_invisible(text):
             break
     return text[:i]
 
+
 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
     """
@@ -118,7 +258,7 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
         # Self-closing tag like <x id="1"/>
         if tag_id is None:
             tag_id = -1
-            #raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
+            # raise ValueError(f"Self-closing tag <{tag_name}/> missing id")
         runs.append({
             "text": "",
             "id": [f"{tag_name}_{tag_id}"],
@@ -128,7 +268,7 @@ def get_runs_from_paragraph(paragraph: str, paragraph_index: int) -> list[dict[s
         # Opening tag <tag id="...">
         if tag_id is None:
             tag_id = -1
-            #raise ValueError(f"Opening tag <{tag_name}> missing id")
+            # raise ValueError(f"Opening tag <{tag_name}> missing id")
         tag_stack.append(f"{tag_name}_{tag_id}")
 
         pos = end
@@ -419,120 +559,3 @@ def runs_to_plain_text(paragraphs_with_style: dict[int, list[dict[str, str, str]
         out_file.write("".join(run["text"]))
 
     out_file.write("\n")
-
-
-def translate_document(input_file: str, source_lang: str, target_lang: str,
-                       translator,
-                       aligner: Aligner,
-                       temp_folder: str = "tmp",
-                       tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0", with_format: bool = True) -> (str, str):
-    input_filename = input_file.split("/")[-1]
-    os.makedirs(temp_folder, exist_ok=True)
-
-    # copy the original file to the temporary folder to avoid common issues with tikal
-    temp_input_file = os.path.join(temp_folder, input_filename)
-    shutil.copy(input_file, temp_input_file)
-
-    original_xliff_file = os.path.join(temp_folder, input_filename + ".xlf")
-    plain_text_file = doc_to_plain_text(temp_input_file, source_lang, target_lang, tikal_folder, original_xliff_file)
-
-    # get paragraphs with runs
-    paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
-                            enumerate(open(plain_text_file).readlines())]
-
-    # translate using the plaintext file
-    original_tokenized_sentences_with_style = []
-    original_spacing = []
-    for run in paragraphs_with_runs:
-        tokens, spaces = tokenize_with_runs(run)
-        original_tokenized_sentences_with_style += tokens
-        original_spacing += spaces
-
-    translated_sentences = []
-    yield "Translating 0%...", None
-    total = len(original_tokenized_sentences_with_style)
-    pbar = tqdm.tqdm(desc="Translating paragraphs...", total=total)
-
-    for i, (sentence, spacing) in enumerate(zip(original_tokenized_sentences_with_style, original_spacing)):
-        text = Doc(spacy_nlp.vocab, words=[token["text"] for token in sentence], spaces=spacing).text
-
-        while True:
-            try:
-                translated_sentences.append(translator.translate(text, source_lang, target_lang))
-                break
-            except AppError as e:
-                print(e)
-
-        pbar.update(1)
-        percent_complete = int(((i + 1) / total) * 100)
-        yield f"Translating {percent_complete}%...", None
-
-    # time to align the translation with the original
-    print("Generating alignments...")
-    yield "Aligning...", None
-    start_time = time.time()
-    translated_sentences_with_style, translated_sentences_spacing = generate_alignments(
-        original_tokenized_sentences_with_style,
-        translated_sentences, aligner,
-        temp_folder)
-    print(f"Finished alignments in {time.time() - start_time} seconds")
-
-    # since we tokenized these sentences independently, the spacing information does not contain spaces after
-    # punctuation at the end of a sentence (there is no space after a sentence-final "." unless another
-    # sentence follows right after)
-    for sentence, sentence_spaces in zip(translated_sentences_with_style, translated_sentences_spacing):
-        if sentence[-1]["text"] in string.punctuation:
-            sentence_spaces[-1] = True
-
-    # flatten the sentences into a list of tokens
-    translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
-    tokens_spaces = [item for sublist in translated_sentences_spacing for item in sublist]
-
-    # group the tokens by style/run
-    translated_runs_with_style = group_by_style(translated_tokens_with_style, tokens_spaces)
-
-    # group the runs by original paragraph
-    translated_paragraphs_with_style = {key: [{'id': None, 'paragraph_index': key, 'text': ""}] for key in
-                                        range(len(paragraphs_with_runs))}
-
-    for item in translated_runs_with_style:
-        # first item in the paragraph: remove the leading blank space introduced in group_by_style(), where we
-        # didn't yet know where paragraphs started and ended
-        if not translated_paragraphs_with_style[item['paragraph_index']][0]["text"]:
-            first_item_in_paragraph = item.copy()
-            first_item_in_paragraph["text"] = first_item_in_paragraph["text"].lstrip(" ")
-            translated_paragraphs_with_style[item['paragraph_index']] = []
-            translated_paragraphs_with_style[item['paragraph_index']].append(first_item_in_paragraph)
-        else:
-            translated_paragraphs_with_style[item['paragraph_index']].append(item)
-
-    # save to a new plain text file
-    translated_moses_file = os.path.join(original_xliff_file + f".{target_lang}")
-    runs_to_plain_text(translated_paragraphs_with_style, translated_moses_file)
-
-    # put the translations into the xlf
-    tikal_moses_to_xliff_command = [os.path.join(tikal_folder, "tikal.sh"), "-lm", original_xliff_file, "-sl",
-                                    source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
-                                    "-noalttrans", "-to", original_xliff_file]
-    Popen(tikal_moses_to_xliff_command).wait()
-
-    # any tags that are still <g> have not been paired between the original and translated texts by tikal, so
-    # we remove them. This may happen when a word in the original language is split into more than one word
-    # with other words in between, or because of an error in fastalign
-    text = open(original_xliff_file).read()
-    result = re.sub(r'<g id="\d+">(.*?)</g>', r'\1', text)
-    open(original_xliff_file, "w").write(result)
-
-    # merge into a docx again
-    tikal_merge_doc_command = [os.path.join(tikal_folder, "tikal.sh"), "-m", original_xliff_file]
-    final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
-    stdout, stderr = final_process.communicate()
-    final_process.wait()
-
-    # get the path to the output file
-    output = stdout.decode('utf-8')
-    translated_file_path = re.search(r'(?<=Output:\s)(.*)', output)[0]
-
-    print(f"Saved file in {translated_file_path}")
-    yield "", translated_file_path
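
As a small illustration of the cleanup step in translate_document(), this sketch (with an invented XLIFF fragment) shows what the re.sub call does to <g> tags that tikal could not pair:

import re

sample = 'Hello <g id="1">bold</g> world'  # invented fragment with an unresolved <g> run

# strip the <g> wrappers but keep the enclosed text, as in translate_document()
print(re.sub(r'<g id="\d+">(.*?)</g>', r'\1', sample))  # Hello bold world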