Spaces:
Runtime error
Runtime error
Upload app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
-
File:
|
| 3 |
|
| 4 |
-
Description:
|
| 5 |
|
| 6 |
Author: Didier Guillevic
|
| 7 |
Date: 2024-09-07
|
|
@@ -16,8 +16,8 @@ logger = logging.getLogger(__name__)
|
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
| 17 |
|
| 18 |
import model_translation as translation
|
| 19 |
-
from model_translation import tokenizer_multilingual
|
| 20 |
-
from model_translation import
|
| 21 |
|
| 22 |
from deep_translator import GoogleTranslator
|
| 23 |
|
|
@@ -116,7 +116,46 @@ def detect_language(text):
|
|
| 116 |
lang = langdetect.detect(text)
|
| 117 |
return lang
|
| 118 |
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
def translate_with_multilingual_model(
|
| 121 |
text: str,
|
| 122 |
tgt_lang: str,
|
|
@@ -124,7 +163,7 @@ def translate_with_multilingual_model(
|
|
| 124 |
input_max_length: int=512,
|
| 125 |
output_max_length: int=512):
|
| 126 |
"""
|
| 127 |
-
Translate the
|
| 128 |
"""
|
| 129 |
chunks = build_text_chunks(text, None, sents_per_chunk)
|
| 130 |
translated_chunks = []
|
|
@@ -139,7 +178,8 @@ def translate_with_multilingual_model(
|
|
| 139 |
model_multilingual.device)
|
| 140 |
outputs = model_multilingual.generate(
|
| 141 |
input_ids=input_ids, max_length=output_max_length)
|
| 142 |
-
translated_chunk = tokenizer_multilingual.decode(
|
|
|
|
| 143 |
translated_chunks.append(translated_chunk)
|
| 144 |
|
| 145 |
return '\n'.join(translated_chunks)
|
|
@@ -153,25 +193,27 @@ def translate_text(
|
|
| 153 |
"""
|
| 154 |
Translate the given text into English (default "easy" language)
|
| 155 |
"""
|
|
|
|
|
|
|
|
|
|
| 156 |
#
|
| 157 |
# Bilingual (Helsinki model)
|
| 158 |
#
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
translated_text_bilingual_model = translate_with_model(
|
| 169 |
-
text, tokenizer, model, src_lang, sents_per_chunk)
|
| 170 |
-
|
| 171 |
#
|
| 172 |
# Multilingual model (Google MADLAD)
|
| 173 |
#
|
| 174 |
-
|
| 175 |
translated_text_multilingual_model = translate_with_multilingual_model(
|
| 176 |
text, tgt_lang, sents_per_chunk, input_max_length, output_max_length)
|
| 177 |
|
|
@@ -183,6 +225,7 @@ def translate_text(
|
|
| 183 |
|
| 184 |
return (
|
| 185 |
translated_text_bilingual_model,
|
|
|
|
| 186 |
translated_text_multilingual_model,
|
| 187 |
translated_text_google_translate
|
| 188 |
)
|
|
@@ -207,6 +250,11 @@ with gr.Blocks() as demo:
|
|
| 207 |
label="Bilingual translation model (Helsinki NLP)",
|
| 208 |
render=False
|
| 209 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
output_text_multilingual_model = gr.Textbox(
|
| 211 |
lines=6,
|
| 212 |
label="Multilingual translation model (**small** Google MADLAD)",
|
|
@@ -250,6 +298,7 @@ with gr.Blocks() as demo:
|
|
| 250 |
outputs=[
|
| 251 |
output_text_bilingual_model,
|
| 252 |
output_text_multilingual_model,
|
|
|
|
| 253 |
output_text_google_translate,
|
| 254 |
],
|
| 255 |
additional_inputs=[sentences_per_chunk,],
|
|
|
|
| 1 |
"""
|
| 2 |
+
File: app.py
|
| 3 |
|
| 4 |
+
Description: Translate text...
|
| 5 |
|
| 6 |
Author: Didier Guillevic
|
| 7 |
Date: 2024-09-07
|
|
|
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
| 17 |
|
| 18 |
import model_translation as translation
|
| 19 |
+
from model_translation import tokenizer_multilingual, model_multilingual
|
| 20 |
+
from model_translation import tokenizer_m2m100, model_m2m100
|
| 21 |
|
| 22 |
from deep_translator import GoogleTranslator
|
| 23 |
|
|
|
|
| 116 |
lang = langdetect.detect(text)
|
| 117 |
return lang
|
| 118 |
|
| 119 |
+
def translate_with_bilingual_model(
        text, src_lang, tgt_lang, sents_per_chunk
):
    """
    Translate text with the Helsinki bilingual model for the source language.

    When src_lang is not listed in translation.src_langs, an "ISSUE: ..."
    message is returned in place of a translation. tgt_lang is accepted
    but not used by this function.
    """
    if src_lang in translation.src_langs:
        logger.info(f"LANG: {src_lang}, TEXT: {text[:50]}...")
        # Fetch the tokenizer / model pair registered for this source language
        tokenizer, model = translation.get_tokenizer_model_for_src_lang(src_lang)
        return translate_with_model(
            text, tokenizer, model, src_lang, sents_per_chunk)

    # No model available for the requested source language
    return (
        f"ISSUE: currently no model for language '{src_lang}'. "
        "If wrong language, please specify language."
    )
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
#@spaces.GPU
|
| 138 |
+
def translate_with_m2m100_model(
        text: str,
        src_lang: str,
        tgt_lang: str,
        sents_per_chunk: int=5):
    """
    Translate with the m2m100 model

    Args:
        text: the text to translate
        src_lang: source language code, set on the m2m100 tokenizer
        tgt_lang: target language code, forced as the first generated token
        sents_per_chunk: currently unused; the text is translated in one pass

    Returns:
        The translated text as a single string
    """
    tokenizer_m2m100.src_lang = src_lang
    input_ids = tokenizer_m2m100(text, return_tensors="pt").input_ids.to(
        model_m2m100.device)
    outputs = model_m2m100.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer_m2m100.get_lang_id(tgt_lang)
    )
    # Decode the single generated sequence into one string.
    # batch_decode(outputs[0]) would iterate over the individual token ids
    # and return a list of per-token strings instead of the translation.
    translated_text = tokenizer_m2m100.decode(
        outputs[0], skip_special_tokens=True)
    return translated_text
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
#@spaces.GPU
|
| 159 |
def translate_with_multilingual_model(
|
| 160 |
text: str,
|
| 161 |
tgt_lang: str,
|
|
|
|
| 163 |
input_max_length: int=512,
|
| 164 |
output_max_length: int=512):
|
| 165 |
"""
|
| 166 |
+
Translate the given text into English (default "easy" language)
|
| 167 |
"""
|
| 168 |
chunks = build_text_chunks(text, None, sents_per_chunk)
|
| 169 |
translated_chunks = []
|
|
|
|
| 178 |
model_multilingual.device)
|
| 179 |
outputs = model_multilingual.generate(
|
| 180 |
input_ids=input_ids, max_length=output_max_length)
|
| 181 |
+
translated_chunk = tokenizer_multilingual.decode(
|
| 182 |
+
outputs[0], skip_special_tokens=True)
|
| 183 |
translated_chunks.append(translated_chunk)
|
| 184 |
|
| 185 |
return '\n'.join(translated_chunks)
|
|
|
|
| 193 |
"""
|
| 194 |
Translate the given text into English (default "easy" language)
|
| 195 |
"""
|
| 196 |
+
src_lang = src_lang if (src_lang and src_lang != "auto") else detect_language(text)
|
| 197 |
+
tgt_lang = 'en' # Default "easy" language
|
| 198 |
+
|
| 199 |
#
|
| 200 |
# Bilingual (Helsinki model)
|
| 201 |
#
|
| 202 |
+
translated_text_bilingual_model = translate_with_bilingual_model(
|
| 203 |
+
text, src_lang, tgt_lang, sents_per_chunk
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
#
|
| 207 |
+
# m2m100 model
|
| 208 |
+
#
|
| 209 |
+
translated_text_m2m100_model = translate_with_m2m100_model(
|
| 210 |
+
text, src_lang, tgt_lang, sents_per_chunk
|
| 211 |
+
)
|
| 212 |
|
|
|
|
|
|
|
|
|
|
| 213 |
#
|
| 214 |
# Multilingual model (Google MADLAD)
|
| 215 |
#
|
| 216 |
+
|
| 217 |
translated_text_multilingual_model = translate_with_multilingual_model(
|
| 218 |
text, tgt_lang, sents_per_chunk, input_max_length, output_max_length)
|
| 219 |
|
|
|
|
| 225 |
|
| 226 |
return (
|
| 227 |
translated_text_bilingual_model,
|
| 228 |
+
|
| 229 |
translated_text_multilingual_model,
|
| 230 |
translated_text_google_translate
|
| 231 |
)
|
|
|
|
| 250 |
label="Bilingual translation model (Helsinki NLP)",
|
| 251 |
render=False
|
| 252 |
)
|
| 253 |
+
output_text_m2m100_model = gr.Textbox(
|
| 254 |
+
lines=6,
|
| 255 |
+
label="Facebook m2m100 translation model (**small**)",
|
| 256 |
+
render=False
|
| 257 |
+
)
|
| 258 |
output_text_multilingual_model = gr.Textbox(
|
| 259 |
lines=6,
|
| 260 |
label="Multilingual translation model (**small** Google MADLAD)",
|
|
|
|
| 298 |
outputs=[
|
| 299 |
output_text_bilingual_model,
|
| 300 |
output_text_multilingual_model,
|
| 301 |
+
output_text_m2m100_model,
|
| 302 |
output_text_google_translate,
|
| 303 |
],
|
| 304 |
additional_inputs=[sentences_per_chunk,],
|