Spaces:
Sleeping
Sleeping
Integrated any-doc into the Gradio app and separated the translation side to make it easier to implement other translation models.
Browse files- gradio_app.py +3 -5
- src/mtuoc_aina_translator.py +20 -0
- src/translate_any_doc.py +2 -21
gradio_app.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from
|
| 3 |
-
import
|
| 4 |
-
import json
|
| 5 |
-
from translate_docx import translate_document, translate, Aligner
|
| 6 |
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
| 7 |
|
| 8 |
|
|
@@ -16,7 +14,7 @@ detokenizer = TreebankWordDetokenizer()
|
|
| 16 |
|
| 17 |
|
| 18 |
def upload_file(filepath):
|
| 19 |
-
translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
|
| 20 |
return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
|
| 21 |
|
| 22 |
def download_file():
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from src.translate_any_doc import translate_document, translate
|
| 3 |
+
from src.aligner import Aligner
|
|
|
|
|
|
|
| 4 |
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
| 5 |
|
| 6 |
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def upload_file(filepath):
|
| 17 |
+
translated_file_name = translate_document(filepath, source_lang, target_lang, aligner, detokenizer, ip)
|
| 18 |
return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
|
| 19 |
|
| 20 |
def download_file():
|
src/mtuoc_aina_translator.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import json
|
| 3 |
+
from nltk.tokenize import sent_tokenize
|
| 4 |
+
|
| 5 |
+
class MTUOCAinaTranslator:
|
| 6 |
+
def __init__(self, ip: str, port: str):
|
| 7 |
+
self.ip = ip
|
| 8 |
+
self.port = port
|
| 9 |
+
|
| 10 |
+
def translate(self, text):
|
| 11 |
+
stuff = sent_tokenize(text)
|
| 12 |
+
|
| 13 |
+
myobj = {
|
| 14 |
+
'id': '1',
|
| 15 |
+
'src': text,
|
| 16 |
+
}
|
| 17 |
+
url = 'http://' + self.ip + ':' + self.port + '/translate'
|
| 18 |
+
x = requests.post(url, json=myobj)
|
| 19 |
+
json_response = json.loads(x.text)
|
| 20 |
+
return json_response['tgt']
|
src/translate_any_doc.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
import shutil
|
| 2 |
import time
|
| 3 |
-
import json
|
| 4 |
-
|
| 5 |
-
import requests
|
| 6 |
import os
|
| 7 |
from itertools import groupby
|
| 8 |
from subprocess import Popen, PIPE
|
|
@@ -18,21 +15,6 @@ import tqdm
|
|
| 18 |
nltk.download('punkt')
|
| 19 |
nltk.download('punkt_tab')
|
| 20 |
|
| 21 |
-
ip = "192.168.20.216"
|
| 22 |
-
port = "8000"
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def translate(text, ip, port):
|
| 26 |
-
myobj = {
|
| 27 |
-
'id': '1',
|
| 28 |
-
'src': text,
|
| 29 |
-
}
|
| 30 |
-
port = str(int(port))
|
| 31 |
-
url = 'http://' + ip + ':' + port + '/translate'
|
| 32 |
-
x = requests.post(url, json=myobj)
|
| 33 |
-
json_response = json.loads(x.text)
|
| 34 |
-
return json_response['tgt']
|
| 35 |
-
|
| 36 |
|
| 37 |
def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
|
| 38 |
original_xliff_file_path: str) -> str:
|
|
@@ -268,11 +250,10 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
|
|
| 268 |
|
| 269 |
|
| 270 |
def translate_document(input_file: str, source_lang: str, target_lang: str,
|
|
|
|
| 271 |
aligner: Aligner,
|
| 272 |
detokenizer,
|
| 273 |
-
ip: str = "192.168.20.216",
|
| 274 |
temp_folder: str = "tmp",
|
| 275 |
-
port: str = "8000",
|
| 276 |
tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
|
| 277 |
input_filename = input_file.split("/")[-1]
|
| 278 |
# copy the original file to the temporal folder to avoid common issues with tikal
|
|
@@ -290,7 +271,7 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
|
|
| 290 |
translated_paragraphs = []
|
| 291 |
for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
|
| 292 |
paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
|
| 293 |
-
translated_paragraphs.append(translate(paragraph_text
|
| 294 |
|
| 295 |
# time to align the translation with the original
|
| 296 |
print("Generating alignments...")
|
|
|
|
| 1 |
import shutil
|
| 2 |
import time
|
|
|
|
|
|
|
|
|
|
| 3 |
import os
|
| 4 |
from itertools import groupby
|
| 5 |
from subprocess import Popen, PIPE
|
|
|
|
| 15 |
nltk.download('punkt')
|
| 16 |
nltk.download('punkt_tab')
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
|
| 20 |
original_xliff_file_path: str) -> str:
|
|
|
|
| 250 |
|
| 251 |
|
| 252 |
def translate_document(input_file: str, source_lang: str, target_lang: str,
|
| 253 |
+
translator,
|
| 254 |
aligner: Aligner,
|
| 255 |
detokenizer,
|
|
|
|
| 256 |
temp_folder: str = "tmp",
|
|
|
|
| 257 |
tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
|
| 258 |
input_filename = input_file.split("/")[-1]
|
| 259 |
# copy the original file to the temporal folder to avoid common issues with tikal
|
|
|
|
| 271 |
translated_paragraphs = []
|
| 272 |
for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
|
| 273 |
paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
|
| 274 |
+
translated_paragraphs.append(translator.translate(paragraph_text))
|
| 275 |
|
| 276 |
# time to align the translation with the original
|
| 277 |
print("Generating alignments...")
|