import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline
import torch
import src.exception.Exception as ExceptionCustom
import polars as pl

METHOD = "TRANSLATE"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pl.read_parquet("isolanguages.parquet")
# Language lookup tables: skip the first row and keep only entries that have an ISO639-1 code.
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos}  # {'Romanian': ('ro', 'rum', 'ron')}
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos}  # {'ro': 'Romanian', 'de': 'German'}

class Translators:
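    """Wrappers for several Hugging Face translation approaches (multilingual Opus-MT, mBART, and generic pipelines)."""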
    def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
        self.model_name = model_name
        self.sl, self.tl = sl, tl
        self.input_text = input_text
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
    def HelsinkiNLP_mulroa(self):
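        """Translate with a multilingual Opus-MT model by prefixing the input with the target language token (e.g. '>>ron<<')."""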
        try:
            pipe = pipeline("translation", model=self.model_name, device=self.device)                
            iso1to3 = {iso[1]: iso[3] for iso in non_empty_isos} # {'ro': 'ron'}
            iso3tl = iso1to3.get(self.tl) # 'deu', 'ron', 'eng', 'fra'
            translation = pipe(f'>>{iso3tl}<< {self.input_text}')
            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
        except Exception as error:
            return f"Error translating with model: {self.model_name}! Try other available language combination.", error
    def text2textgenerationpipe(self):
        translation = pipeline('text2text-generation', model=self.model_name)
        return translation(self.input_text)[0]['generated_text'], self.message
    def translationpipe(self):
        translation = pipeline('translation', model=self.model_name)
        return translation(self.input_text)[0]['translation_text'], self.message
    def mbartlarge25(self):
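        """Translate with an mBART-25 checkpoint, mapping ISO639-1 codes onto mBART locale-style codes."""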
        # mBART-25 uses locale-style language codes (the list published with facebook/mbart-large-cc25),
        # so map the ISO639-1 source/target codes onto them instead of guessing the suffix.
        mbart25_codes = [
            "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN",
            "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP",
            "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN",
        ]
        iso1_to_mbart = {code.split("_")[0]: code for code in mbart25_codes}
        src_lang = iso1_to_mbart.get(self.sl, f"{self.sl}_{self.sl.upper()}")
        tgt_lang = iso1_to_mbart.get(self.tl, f"{self.tl}_{self.tl.upper()}")
        # Load model and tokenizer; src_lang tells the tokenizer how to tag the input.
        tokenizer = AutoTokenizer.from_pretrained(self.model_name, src_lang=src_lang)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        # Tokenize and translate, forcing the target language as the first generated token.
        inputs = tokenizer(self.input_text, return_tensors="pt")
        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
            max_length=512,  # cap output length
            num_beams=4      # beam search for better results
        )
        translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        return translation, self.message
    def mbartlarge(self):
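        """Fixed English -> Romanian translation with facebook/mbart-large-cc25 via the translation pipeline."""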
        model_name = "facebook/mbart-large-cc25"

        # Load tokenizer and model.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Tell the tokenizer the source/target languages and set the target language
        # as the model's forced BOS token so the pipeline will use it implicitly.
        tokenizer.src_lang = "en_XX"
        tokenizer.tgt_lang = "ro_RO"
        model.config.forced_bos_token_id = tokenizer.lang_code_to_id["ro_RO"]

        # Create the pipeline (pass tokenizer and model explicitly); generation kwargs
        # such as num_beams and max_length are forwarded to model.generate.
        pipe = pipeline("translation", model=model, tokenizer=tokenizer, src_lang="en_XX", tgt_lang="ro_RO")
        result = pipe(
            self.input_text,
            num_beams=4,
            max_length=256
        )
        return result[0]["translation_text"], self.message


def paraphraseTranslateMethod(requestValue: str, model: str):
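    """Translate text sentence by sentence between Romanian and English using BlackKakapo Opus-MT models."""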
    nltk.download('punkt')
    nltk.download('punkt_tab')
    exception = ExceptionCustom.checkForException(requestValue, METHOD)
    if exception:
        return "", exception

    # Load the tokenizer/model once, outside the sentence loop, depending on the direction.
    if model == 'roen':
        model_name = "BlackKakapo/opus-mt-ro-en"
    else:
        model_name = "BlackKakapo/opus-mt-en-ro"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    tokenized_sent_list = sent_tokenize(requestValue)
    result_value = []

    for sentence in tokenized_sent_list:
        input_ids = tokenizer(sentence, return_tensors='pt').to(device)
        output = seq2seq_model.generate(
            input_ids=input_ids.input_ids,
            do_sample=True,
            max_length=512,
            top_k=90,
            top_p=0.97,
            early_stopping=False
        )
        result_value.append(tokenizer.batch_decode(output, skip_special_tokens=True)[0])

    return " ".join(result_value).strip(), model

def gemma(requestValue: str, model: str = 'Gargaz/gemma-2b-romanian-better'):
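    """Translate text into Romanian with a Gemma-style text-generation model served through the chat pipeline."""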
    requestValue = requestValue.replace('\n', ' ')
    messages = [{"role": "user", "content": f"Translate this text to Romanian: {requestValue}"}]
    if '/' not in model:
        model = 'Gargaz/gemma-2b-romanian-better'
    # Limit max_new_tokens to roughly 150% of the request length (measured in characters).
    max_new_tokens = int(len(requestValue) + len(requestValue) * 0.5)
    try:
        pipe = pipeline(
            "text-generation",
            model=model,
            device=-1,
            max_new_tokens=max_new_tokens,  # keep short to reduce verbosity
            do_sample=False                 # greedy decoding for determinism
        )
        output = pipe(messages, num_return_sequences=1, return_full_text=False)
        generated_text = output[0]["generated_text"]
        # Keep only the first line of the reply.
        result = generated_text.split('\n', 1)[0] if '\n' in generated_text else generated_text
        return result.strip()
    except Exception as error:
        return str(error)

def gemma_direct(requestValue: str, model: str = 'Gargaz/gemma-2b-romanian-better'):
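    """Translate text into Romanian with a Gemma-style causal LM, calling generate directly rather than a pipeline."""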
    # Load the tokenizer and model directly instead of going through a pipeline.
    model_name = model if '/' in model else 'Gargaz/gemma-2b-romanian-better'
    prompt = f"Translate this text to Romanian: {requestValue}"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    causal_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Estimate the output budget: roughly 150% of the input token count, rounded up to an even number.
    input_ids = tokenizer.encode(requestValue, add_special_tokens=True)
    max_new_tokens = int(len(input_ids) * 1.5)
    max_new_tokens += max_new_tokens % 2

    messages = [{"role": "user", "content": prompt}]

    try:
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device)

        outputs = causal_model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Decode only the newly generated tokens and keep the first line of the reply.
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
        result = response.split('\n', 1)[0] if '\n' in response else response
        return result.strip()
    except Exception as error:
        return str(error)
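

# Minimal usage sketch (illustrative only): the sample text, language pair, and the
# Helsinki-NLP/opus-mt-en-roa model name are assumptions for demonstration; running this
# requires the referenced checkpoints and the isolanguages.parquet file to be available.
if __name__ == "__main__":
    sample = "Check general exterior conditions."

    # Multilingual Opus-MT, English -> Romanian.
    translator = Translators("Helsinki-NLP/opus-mt-en-roa", sl="en", tl="ro", input_text=sample)
    text, info = translator.HelsinkiNLP_mulroa()
    print(info, text)

    # Sentence-by-sentence English -> Romanian with the BlackKakapo Opus-MT checkpoint.
    text, used_model = paraphraseTranslateMethod(sample, model="enro")
    print(used_model, text)

    # Gemma-based translation via the text-generation pipeline (default model).
    print(gemma(sample))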