Updated readme and added salamandraTA7b translator class
Files changed:
- readme.md +10 -1
- src/mtuoc_aina_translator.py +4 -5
- src/salamandraTA7b_translator.py +22 -0
- src/translate_any_doc.py +1 -1
readme.md CHANGED

@@ -1,6 +1,6 @@
 # document_translator
 
-Project to translate files
+Project to translate files using BSC's models while keeping the formatting and style of the original file.
 
 ## Requirements
 ### python 3.12
@@ -16,3 +16,12 @@ I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from ht
 ### python requirements
 
 pip install -r requirements.txt
+
+### mtuoc_aina_translator
+
+To use this class you also need to be running MTUOC's translation server with the proper translation models. There's also no
+need to use fastalign on that side since the current project already runs it.
+
+### salamandrata7b_translator
+
+Class that uses huggingface's demo.
src/mtuoc_aina_translator.py CHANGED

@@ -1,20 +1,19 @@
 import requests
 import json
-
+
 
 class MTUOCAinaTranslator:
     def __init__(self, ip: str, port: str):
         self.ip = ip
         self.port = port
 
-    def translate(self, text):
-        stuff = sent_tokenize(text)
-
+    def translate(self, text, source_lang=None, target_lang=None):
         myobj = {
             'id': '1',
             'src': text,
         }
-        url = 'http://' + self.ip + ':' + self.port + '/translate'
+        url = f'http://{self.ip}:{self.port}/translate'
+        #url = 'http://' + self.ip + ':' + self.port + '/translate'
         x = requests.post(url, json=myobj)
         json_response = json.loads(x.text)
         return json_response['tgt']
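A minimal usage sketch of the updated class (illustration only, not part of the commit; the host, port, and sample text are placeholders). The new source_lang and target_lang parameters exist only for interface compatibility and are never sent to the MTUOC server:

# Illustrative sketch; assumes an MTUOC translation server is reachable at localhost:8000
from mtuoc_aina_translator import MTUOCAinaTranslator

translator = MTUOCAinaTranslator("localhost", "8000")
# source_lang / target_lang are ignored by this backend; the language pair is fixed by the server
print(translator.translate("Bon dia a tothom", "ca", "en"))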
src/salamandraTA7b_translator.py ADDED

@@ -0,0 +1,22 @@
+from gradio_client import Client
+from iso639 import languages
+
+HF_TOKEN = "YOUR-HF-TOKEN-HERE"
+
+class SalamandraTA7bTranslator:
+    def __init__(self):
+        self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=HF_TOKEN)
+
+    def translate(self, text, source_lang, target_lang):
+        # we assume that they are specifying the language by code so we need to convert it to name
+        lang1 = languages.get(alpha2=source_lang).name
+        lang2 = languages.get(alpha2=target_lang).name
+        result = self.client.predict(
+            task="Translation",
+            source=lang1,
+            target=lang2,
+            input_text=text,
+            mt_text=None,
+            api_name="/generate_output"
+        )
+        return result[0]
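A minimal usage sketch of the new class (illustration only, not part of the commit). It assumes HF_TOKEN has been replaced with a valid Hugging Face token and that languages are given as ISO 639-1 codes, which translate converts to full names before calling the demo Space:

# Illustrative sketch; requires a real token in HF_TOKEN
from salamandraTA7b_translator import SalamandraTA7bTranslator

translator = SalamandraTA7bTranslator()
# "ca" and "en" are ISO 639-1 codes, converted internally to "Catalan" and "English"
print(translator.translate("Bon dia a tothom", "ca", "en"))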
src/translate_any_doc.py CHANGED

@@ -271,7 +271,7 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
-        translated_paragraphs.append(translator.translate(paragraph_text))
+        translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
 
     # time to align the translation with the original
     print("Generating alignments...")
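With this change both translator classes expose the same call shape, which is all translate_document depends on; any backend with a matching translate method could be dropped in. A hedged sketch of that implied interface (the Protocol name is illustrative and not part of the repository):

# Illustrative sketch of the translator interface implied by this commit; not part of the repo
from typing import Protocol

class Translator(Protocol):
    def translate(self, text: str, source_lang: str, target_lang: str) -> str: ...

# translate_document only calls translator.translate(paragraph_text, source_lang, target_lang),
# so MTUOCAinaTranslator, SalamandraTA7bTranslator, or any conforming object works.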