File size: 3,443 Bytes
e179439 7291080 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 4c34e0b e179439 7566c59 080e470 7566c59 4b17963 7566c59 166238d a1e5cbc 5226c8a e73a1af 7566c59 e179439 7291080 c5de144 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer
# Run inference on the GPU when CUDA is available; otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Labels offered in the "Metodoa" dropdown. polish() dispatches on the label
# prefix ("Helsinki" / "FLAN"), so those prefixes must stay stable.
MODEL_OPTIONS = [
    "Helsinki-NLP (Tira ondo)",  # Round-trip OPUS-MT eu→es→eu
    "FLAN-T5-base (Google gaizki xamar)"
]

# Loaded models and tokenizers, keyed by "flan" / "eus". The loaders only
# populate a key when it is missing, so each checkpoint is loaded once.
CACHE = {}
# --- FLAN loader (Google-style Euskera correction) ---
def load_flan():
    """Return the FLAN-T5 ``(model, tokenizer)`` pair, loading it on first use.

    The pair is stored in ``CACHE`` under the key ``"flan"``; later calls
    return the stored tuple without touching the checkpoint again.

    Returns:
        A ``(model, tokenizer)`` tuple with the model moved to ``DEVICE``.
    """
    try:
        return CACHE["flan"]
    except KeyError:
        pass  # Not loaded yet; fall through and build the entry.

    checkpoint = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        low_cpu_mem_usage=True,
        torch_dtype="auto",
    )
    entry = CACHE["flan"] = (model.to(DEVICE), tokenizer)
    return entry
def run_flan(sentence: str) -> str:
    """Ask FLAN-T5 to rewrite *sentence* and return the generated text.

    The sentence is appended to a fixed instruction prompt, generated with
    4-beam search (at most 96 new tokens), and the first returned sequence
    is decoded without special tokens.

    Args:
        sentence: Text to send to the model.

    Returns:
        The decoded model output with surrounding whitespace removed.
    """
    flan_model, flan_tokenizer = load_flan()
    instruction = f"Euskara zuzen gramatikalki eta idatzi modu naturalean: {sentence}"
    encoded = flan_tokenizer(instruction, return_tensors="pt").to(DEVICE)
    generation_kwargs = {"max_new_tokens": 96, "num_beams": 4}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = flan_model.generate(**encoded, **generation_kwargs)

    first_sequence = generated[0]
    decoded = flan_tokenizer.decode(first_sequence, skip_special_tokens=True)
    return decoded.strip()
# --- Euskera round-trip loader ---
def load_euskera():
    """Return the round-trip translation models, loading them on first use.

    Two OPUS-MT checkpoints are loaded (eu→es, then es→eu), each model is
    moved to ``DEVICE``, and the result is stored in ``CACHE`` under ``"eus"``.

    Returns:
        A 4-tuple ``(eu_es_model, eu_es_tokenizer, es_eu_model,
        es_eu_tokenizer)``.
    """
    try:
        return CACHE["eus"]
    except KeyError:
        pass  # Not loaded yet; fall through and build the entry.

    checkpoints = ("Helsinki-NLP/opus-mt-eu-es", "Helsinki-NLP/opus-mt-es-eu")
    loaded = []
    for checkpoint in checkpoints:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(DEVICE)
        # Stored as (model, tokenizer) per direction, in checkpoint order.
        loaded.extend((model, tokenizer))

    CACHE["eus"] = tuple(loaded)
    return CACHE["eus"]
def _translate(model, tokenizer, text: str) -> str:
    """Run one translation step and return the decoded first sequence.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching *model*.
        text: Source-language text.

    Returns:
        The decoded output with special tokens removed (not stripped).
    """
    # truncation=True clips over-long input to the tokenizer's configured
    # maximum instead of passing arbitrarily long text to the model.
    # NOTE(review): the Marian checkpoints are expected to reject inputs
    # beyond their position limit — confirm against the model config.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
    # Inference only; mirrors run_flan, which also disables gradients.
    with torch.no_grad():
        tokens = model.generate(**inputs, max_length=128, num_beams=4)
    return tokenizer.decode(tokens[0], skip_special_tokens=True)


def run_roundtrip(sentence: str) -> str:
    """Translate *sentence* Basque → Spanish → Basque and return the result.

    Both steps use 4-beam search with ``max_length=128``; the intermediate
    Spanish text is fed unchanged into the second model.

    Args:
        sentence: Basque text to round-trip.

    Returns:
        The back-translated Basque text with surrounding whitespace removed.
    """
    eu_es_model, eu_es_tok, es_eu_model, es_eu_tok = load_euskera()
    spanish = _translate(eu_es_model, eu_es_tok, sentence)
    euskera = _translate(es_eu_model, es_eu_tok, spanish)
    return euskera.strip()
# --- Dispatcher ---
def polish(sentence: str, choice: str) -> str:
    """Route *sentence* to the correction backend named by *choice*.

    Args:
        sentence: Text typed by the user. ``None``, empty, or
            whitespace-only input yields an empty result without loading
            any model.
        choice: Dropdown label. Labels starting with ``"FLAN"`` use
            ``run_flan``; labels starting with ``"Helsinki"`` use
            ``run_roundtrip``.

    Returns:
        The corrected text, ``""`` for blank input, or ``"Unknown option."``
        when *choice* is missing or matches no backend.
    """
    # Guard against None as well as blank text so a missing value never
    # reaches .strip() and raises AttributeError.
    if not sentence or not sentence.strip():
        return ""
    # A missing selection is reported the same way as an unrecognised label.
    if not isinstance(choice, str):
        return "Unknown option."
    if choice.startswith("FLAN"):
        return run_flan(sentence)
    if choice.startswith("Helsinki"):
        return run_roundtrip(sentence)
    return "Unknown option."
# --- Gradio UI ---
# Build the single-page UI: a text input, a method selector, a button, and
# an output box wired to polish().
with gr.Blocks(title="HizkuntzLagun: AI Euskera Zuzendu (CPU enabled)") as demo:
    gr.Markdown("### HizkuntzLagun: AI Euskera Zuzendu\n")
    gr.Markdown(
        """
> ⚡ **Oharra:**
> Tresna honek doako, CPU‑lagunko AI ereduak erabiltzen ditu.
> Azkarra eta eskuragarria izateko diseinatuta dago — ez beti perfektua.
> Zuzenketa azkarrak bai, ez analisi gramatikal sakonak.
> Edozein unetan erabil dezakezu — eguneroko zuzenketa txiki batek saihesten du esaldi traketsen lotsa.
""")
    inp = gr.Textbox(lines=3, label="Idatzi Euskeraz esaldi bat, adibidez Gaur Kondo ikusi nuen.", placeholder="Idatzi esaldi bat...")
    # The default must be one of the listed choices; the bare string
    # "Helsinki" is not in MODEL_OPTIONS, so use the first option itself.
    choice = gr.Dropdown(choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Metodoa")
    btn = gr.Button("Euskera zuzendu")
    out = gr.Textbox(label="Zuzenketa")
    # Clicking the button sends (text, selected method) to polish() and
    # shows its return value in the output box.
    btn.click(polish, inputs=[inp, choice], outputs=out)

# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
|