import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline
import torch
import src.exception.Exception as ExceptionCustom
import polars as pl
METHOD = "TRANSLATE"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pl.read_parquet("isolanguages.parquet")
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
# all_langs = languagecodes.iso_languages_byname
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
# iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
class Translators:
def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
self.model_name = model_name
self.sl, self.tl = sl, tl
self.input_text = input_text
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
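    # Each translation method below returns a (translated_text, message) tuple; the message
    # records the language pair and checkpoint used, so callers can show it alongside the result.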
def HelsinkiNLP_mulroa(self):
try:
pipe = pipeline("translation", model=self.model_name, device=self.device)
iso1to3 = {iso[1]: iso[3] for iso in non_empty_isos} # {'ro': 'ron'}
iso3tl = iso1to3.get(self.tl) # 'deu', 'ron', 'eng', 'fra'
translation = pipe(f'>>{iso3tl}<< {self.input_text}')
            return translation[0]['translation_text'], self.message
        except Exception as error:
            return f"Error translating with model: {self.model_name}! Try another available language combination.", str(error)
    def text2textgenerationpipe(self):
        # Generic text2text-generation pipeline wrapper (e.g. T5/FLAN-style seq2seq checkpoints).
        translation = pipeline('text2text-generation', model=self.model_name)
        return translation(self.input_text)[0]['generated_text'], self.message
    def translationpipe(self):
        # Generic translation pipeline wrapper for single language-pair checkpoints.
        translation = pipeline('translation', model=self.model_name)
        return translation(self.input_text)[0]['translation_text'], self.message
    def mbartlarge25(self):
        # mBART language codes are not uniform: most look like "ro_RO"/"de_DE", but several use other
        # suffixes (en_XX, es_XX, fr_XX, ja_XX, my_MM, nl_XX, vi_VN, zh_CN), so derive the code from
        # the full list instead of guessing from the ISO 639-1 code alone.
        mbart_codes = {code.split("_")[0]: code for code in (
            "ar_AR cs_CZ de_DE en_XX es_XX et_EE fi_FI fr_XX gu_IN hi_IN it_IT ja_XX kk_KZ ko_KR "
            "lt_LT lv_LV my_MM ne_NP nl_XX ro_RO ru_RU si_LK tr_TR vi_VN zh_CN").split()}
        src_lang = mbart_codes.get(self.sl, f"{self.sl}_XX")
        tgt_lang = mbart_codes.get(self.tl, f"{self.tl}_{self.tl.upper()}")
        # Load model and tokenizer (the MBart-specific classes, MBartForConditionalGeneration and
        # MBartTokenizer, would work here as well)
        tokenizer = AutoTokenizer.from_pretrained(self.model_name, src_lang=src_lang)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        # Tokenize and translate
inputs = tokenizer(self.input_text, return_tensors="pt")
translated_tokens = model.generate(
**inputs,
forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
max_length=512, # Add max_length to avoid truncation
num_beams=4) # Use beam search for better results
print(src_lang, tgt_lang, tokenizer.lang_code_to_id[tgt_lang])
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
return translation, self.message
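    # A minimal call sketch for the mBART path, assuming the facebook/mbart-large-cc25 weights can be
    # downloaded (any mBART-25/50 checkpoint with the same language-code scheme should behave alike):
    #   text, msg = Translators("facebook/mbart-large-cc25", "en", "ro", "Good morning").mbartlarge25()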
def mbartlarge(self):
        # Drives mbart-large-cc25 through the generic "translation" pipeline, hardcoded to
        # English -> Romanian (with the plain "translation" task the pipeline would otherwise
        # default to translation_en_to_ro).
        model_name = "facebook/mbart-large-cc25"
        # load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # tell the tokenizer the source and target languages
        tokenizer.src_lang = "en_XX"
        tokenizer.tgt_lang = "ro_RO"
        # set the target language as the model's forced BOS token so the pipeline uses it implicitly
        model.config.forced_bos_token_id = tokenizer.lang_code_to_id["ro_RO"]
        # available codes: ar_AR, cs_CZ, de_DE, en_XX, es_XX, et_EE, fi_FI, fr_XX, gu_IN, hi_IN, it_IT,
        # ja_XX, kk_KZ, ko_KR, lt_LT, lv_LV, my_MM, ne_NP, nl_XX, ro_RO, ru_RU, si_LK, tr_TR, vi_VN, zh_CN
        # create the pipeline (pass tokenizer and model explicitly); src_lang/tgt_lang select the
        # language pair, which multilingual models need and single-pair models ignore
        pipe = pipeline("translation", model=model, tokenizer=tokenizer, src_lang="en_XX", tgt_lang="ro_RO")
        # call the pipeline; generation kwargs are forwarded to model.generate
        result = pipe(
            self.input_text,
            num_beams=4,
            max_length=256
        )
return result[0]["translation_text"], self.message
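# Usage sketch for the single-pair pipeline wrappers above; the checkpoint name is an example,
# not the only supported one:
#   text, msg = Translators("Helsinki-NLP/opus-mt-en-ro", "en", "ro", "Good morning").translationpipe()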
def paraphraseTranslateMethod(requestValue: str, model: str):
nltk.download('punkt')
nltk.download('punkt_tab')
exception = ExceptionCustom.checkForException(requestValue, METHOD)
if exception:
return "", exception
tokenized_sent_list = sent_tokenize(requestValue)
result_value = []
    # Pick the Marian checkpoint once, outside the per-sentence loop, instead of re-loading the
    # tokenizer and model for every sentence.
    checkpoint = "BlackKakapo/opus-mt-ro-en" if model == 'roen' else "BlackKakapo/opus-mt-en-ro"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    translation_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    translation_model.to(device)
    for SENTENCE in tokenized_sent_list:
        input_ids = tokenizer(SENTENCE, return_tensors='pt').to(device)
        output = translation_model.generate(
            input_ids=input_ids.input_ids,
            do_sample=True,
            max_length=512,
            top_k=90,
            top_p=0.97,
            early_stopping=False
        )
        result = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        result_value.append(result)
return " ".join(result_value).strip(), model
def gemma(requestValue: str, model: str = 'Gargaz/gemma-2b-romanian-better'):
requestValue = requestValue.replace('\n', ' ')
prompt = f"Translate this to Romanian using a formal tone, responding only with the translated text: {requestValue}"
messages = [{"role": "user", "content": f"Translate this text to Romanian: {requestValue}"}]
if '/' not in model:
model = 'Gargaz/gemma-2b-romanian-better'
# limit max_new_tokens to 150% of the requestValue
max_new_tokens = int(len(requestValue) + len(requestValue) * 0.5)
try:
pipe = pipeline(
"text-generation",
model=model,
device=-1,
max_new_tokens=max_new_tokens, # Keep short to reduce verbosity
do_sample=False # Use greedy decoding for determinism
)
output = pipe(messages, num_return_sequences=1, return_full_text=False)
generated_text = output[0]["generated_text"]
result = generated_text.split('\n', 1)[0] if '\n' in generated_text else generated_text
return result.strip()
    except Exception as error:
        return str(error)
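# Minimal call sketch for the chat-style Gemma wrapper (CPU pipeline, greedy decoding); the default
# checkpoint comes from the function signature:
#   print(gemma("Check general exterior conditions"))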
def gemma_direct(requestValue: str, model: str = 'Gargaz/gemma-2b-romanian-better'):
# Load model directly
model_name = model if '/' in model else 'Gargaz/gemma-2b-romanian-better'
# limit max_new_tokens to 150% of the requestValue
prompt = f"Translate this text to Romanian: {requestValue}"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
input_ids = tokenizer.encode(requestValue, add_special_tokens=True)
num_tokens = len(input_ids)
# Estimate output length (e.g., 50% longer)
max_new_tokens = int(num_tokens * 1.5)
max_new_tokens += max_new_tokens % 2 # ensure it's even
messages = [{"role": "user", "content": prompt}]
try:
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
).to(device)
outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
result = response.split('\n', 1)[0] if '\n' in response else response
return result.strip()
    except Exception as error:
        return str(error)
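# Optional smoke test; the checkpoint names and the sample sentence are illustrative assumptions,
# and running it will download the models.
if __name__ == "__main__":
    sample = "Check general exterior conditions"
    translated, note = Translators("Helsinki-NLP/opus-mt-en-ro", "en", "ro", sample).translationpipe()
    print(note)
    print(translated)
    print(paraphraseTranslateMethod(sample, "enro")[0])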