import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_OPTIONS = [
    "Helsinki-NLP (Tira ondo)",          # round-trip OPUS-MT eu→es→eu
    "FLAN-T5-base (Google gaizki xamar)"
]
# Cache
CACHE = {}
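# Models and tokenizers are loaded lazily on first use and kept here so that
# later requests reuse the same weights instead of reloading them from disk.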
# --- FLAN loader (Google-style Euskera correction) ---
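# The FLAN path wraps the sentence in a Basque instruction prompt (roughly:
# "correct the Basque grammatically and write it naturally") and lets an
# instruction-tuned model rewrite it; as the dropdown label ("gaizki xamar",
# i.e. rather poorly) hints, its Basque output quality is limited.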
def load_flan():
    if "flan" not in CACHE:
        tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
        mdl = AutoModelForSeq2SeqLM.from_pretrained(
            "google/flan-t5-base",
            low_cpu_mem_usage=True,
            torch_dtype="auto"
        ).to(DEVICE)
        CACHE["flan"] = (mdl, tok)
    return CACHE["flan"]
def run_flan(sentence: str) -> str:
    model, tok = load_flan()
    prompt = f"Euskara zuzen gramatikalki eta idatzi modu naturalean: {sentence}"
    inputs = tok(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=96, num_beams=4)
    return tok.decode(out[0], skip_special_tokens=True).strip()
# --- Euskera round-trip loader ---
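# The round-trip approach translates Euskera -> Spanish and back with two
# OPUS-MT models; the double translation tends to smooth spelling and word
# order, which serves here as a lightweight stand-in for grammar correction.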
def load_euskera():
    if "eus" not in CACHE:
        tok1 = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-eu-es")
        mdl1 = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-eu-es").to(DEVICE)
        tok2 = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-eu")
        mdl2 = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-es-eu").to(DEVICE)
        CACHE["eus"] = (mdl1, tok1, mdl2, tok2)
    return CACHE["eus"]
def run_roundtrip(sentence: str) -> str:
    mdl1, tok1, mdl2, tok2 = load_euskera()
    with torch.no_grad():
        # Euskera → Spanish
        inputs = tok1(sentence, return_tensors="pt").to(DEVICE)
        es_tokens = mdl1.generate(**inputs, max_length=128, num_beams=4)
        spanish = tok1.decode(es_tokens[0], skip_special_tokens=True)
        # Spanish → Euskera
        inputs2 = tok2(spanish, return_tensors="pt").to(DEVICE)
        eu_tokens = mdl2.generate(**inputs2, max_length=128, num_beams=4)
        euskera = tok2.decode(eu_tokens[0], skip_special_tokens=True)
    return euskera.strip()
# --- Dispatcher ---
def polish(sentence: str, choice: str) -> str:
    if not sentence.strip():
        return ""
    if choice.startswith("FLAN"):
        return run_flan(sentence)
    elif choice.startswith("Helsinki"):
        return run_roundtrip(sentence)
    else:
        return "Unknown option."
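# Illustrative direct use without the UI (the sample sentence below is only a
# placeholder, not part of the app itself):
#   print(polish("gaur eguraldi ona dago", MODEL_OPTIONS[0]))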
# --- Gradio UI ---
with gr.Blocks(title="HizkuntzLagun: AI Euskera Zuzendu (CPU enabled)") as demo:
    gr.Markdown("### HizkuntzLagun: AI Euskera Zuzendu\n")
    gr.Markdown(
        """
> ⚡ **Oharra:**
> Tresna honek doako, CPU‑lagunko AI ereduak erabiltzen ditu.
> Azkarra eta eskuragarria izateko diseinatuta dago — ez perfektua.
> Zuzenketa azkarrak bai, ez analisi gramatikal sakonak.
> Edozein unetan erabil dezakezu — eguneroko zuzenketa txiki batek saihesten du esaldi traketsen lotsa.
""")
    inp = gr.Textbox(lines=3, label="Idatzi Euskeraz esaldi bat", placeholder="Idatzi zuzentzeko esaldi bat...")
    choice = gr.Dropdown(choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Metodoa")
    btn = gr.Button("Euskara zuzendu")
    out = gr.Textbox(label="Zuzenketa")
    btn.click(polish, inputs=[inp, choice], outputs=out)
if __name__ == "__main__":
    demo.launch()
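# Note: on a busy CPU host, demo.queue().launch() (available in recent Gradio
# releases) can be used instead, so requests are handled one at a time.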