|
|
""" |
|
|
language.py — robust loader + adapter for language.bin |
|
|
|
|
|
This loader attempts multiple safe options to load a local language model file `language.bin` |
|
|
and adapt it into a small, predictable translation API: |
|
|
- translate(text, src, tgt) |
|
|
- translate_to_en(text, src) |
|
|
- translate_from_en(text, tgt) |
|
|
- detect(text) / detect_language(text) (if provided by model) |
|
|
- model_info() for debugging |
|
|
|
|
|
Loading strategy (in order): |
|
|
1. If a language.py module is present (importable) we prefer it (the app already tries this). |
|
|
2. If language.bin exists: |
|
|
- Try to detect if it's a safetensors file and (if safetensors is installed) attempt to load. |
|
|
- Try torch.load with weights_only=True (safe for "weights-only" files). |
|
|
- If that fails and you explicitly allow insecure loading, try torch.load(..., weights_only=False). |
|
|
To allow this, set the environment variable LANGUAGE_LOAD_ALLOW_INSECURE=1. |
|
|
NOTE: loading with weights_only=False may execute arbitrary code from the file. Only do this |
|
|
when you trust the source of language.bin. |
|
|
- Try pickle.load as a last attempt (may fail for many binary formats). |
|
|
3. Fallback: no model loaded (the app will fall back to heuristics). |
|
|
|
|
|
Security note: |
|
|
- Re-running torch.load with weights_only=False can run arbitrary code embedded in the file. |
|
|
Only enable LANGUAGE_LOAD_ALLOW_INSECURE if you trust the file origin. |
|
|
""" |
|
|
|
|
|
from pathlib import Path |
|
|
import logging |
|
|
import importlib |
|
|
import io |
|
|
import sys |
|
|
|
|
|
logger = logging.getLogger("local_language") |
|
|
logger.setLevel(logging.INFO) |
|
|
|
|
|
_model = None |
|
|
_load_errors = [] |
|
|
|
|
|
def _try_import_language_module(): |
|
|
|
|
|
try: |
|
|
mod = importlib.import_module("language") |
|
|
logger.info("Found importable language.py module; using it.") |
|
|
return mod |
|
|
except Exception as e: |
|
|
_load_errors.append(("import_language_py", repr(e))) |
|
|
return None |
|
|
|
|
|
def _is_likely_safetensors(path: Path) -> bool: |
|
|
|
|
|
return path.suffix == ".safetensors" or path.name.endswith(".safetensors") |
|
|
|
|
|
def _try_safetensors_load(path: Path): |
|
|
try: |
|
|
from safetensors.torch import load_file as st_load |
|
|
except Exception as e: |
|
|
_load_errors.append(("safetensors_not_installed", repr(e))) |
|
|
return None |
|
|
try: |
|
|
tensors = st_load(str(path)) |
|
|
logger.info("Loaded safetensors file into tensor dict (language.bin treated as safetensors).") |
|
|
|
|
|
return tensors |
|
|
except Exception as e: |
|
|
_load_errors.append(("safetensors_load_failed", repr(e))) |
|
|
return None |
|
|
|
|
|
def _try_torch_load(path: Path, weights_only: bool): |
|
|
try: |
|
|
import torch |
|
|
except Exception as e: |
|
|
_load_errors.append(("torch_not_installed", repr(e))) |
|
|
return None |
|
|
try: |
|
|
|
|
|
obj = torch.load(str(path), map_location="cpu", weights_only=weights_only) |
|
|
logger.info(f"torch.load succeeded (weights_only={weights_only}).") |
|
|
return obj |
|
|
except TypeError as e: |
|
|
|
|
|
try: |
|
|
obj = torch.load(str(path), map_location="cpu") |
|
|
logger.info("torch.load succeeded (no weights_only argument supported by local torch).") |
|
|
return obj |
|
|
except Exception as e2: |
|
|
_load_errors.append(("torch_load_typeerror_then_failed", repr(e2))) |
|
|
return None |
|
|
except Exception as e: |
|
|
_load_errors.append((f"torch_load_failed_weights_only={weights_only}", repr(e))) |
|
|
return None |
|
|
|
|
|
def _try_pickle_load(path: Path): |
|
|
try: |
|
|
import pickle |
|
|
with open(path, "rb") as f: |
|
|
obj = pickle.load(f) |
|
|
logger.info("Loaded language.bin via pickle.") |
|
|
return obj |
|
|
except Exception as e: |
|
|
_load_errors.append(("pickle_load_failed", repr(e))) |
|
|
return None |
|
|
|
|
|
def _attempt_load(path: Path): |
|
|
|
|
|
if _is_likely_safetensors(path): |
|
|
logger.info("language.bin looks like safetensors (by filename). Attempting safetensors load.") |
|
|
obj = _try_safetensors_load(path) |
|
|
if obj is not None: |
|
|
return obj |
|
|
|
|
|
|
|
|
obj = _try_torch_load(path, weights_only=True) |
|
|
if obj is not None: |
|
|
return obj |
|
|
|
|
|
|
|
|
allow_insecure = str(os.environ.get("LANGUAGE_LOAD_ALLOW_INSECURE", "")).lower() in ("1", "true", "yes") |
|
|
if allow_insecure: |
|
|
logger.warning("LANGUAGE_LOAD_ALLOW_INSECURE is set -> attempting torch.load with weights_only=False (INSECURE).") |
|
|
obj = _try_torch_load(path, weights_only=False) |
|
|
if obj is not None: |
|
|
return obj |
|
|
else: |
|
|
logger.warning("torch.load(weights_only=False) failed or returned None.") |
|
|
|
|
|
|
|
|
obj = _try_pickle_load(path) |
|
|
if obj is not None: |
|
|
return obj |
|
|
|
|
|
return None |
|
|
|
|
|
def _load_language_bin_if_present(): |
|
|
global _model |
|
|
p = Path("language.bin") |
|
|
if not p.exists(): |
|
|
return None |
|
|
logger.info("language.bin found; attempting to load with safe fallbacks...") |
|
|
|
|
|
obj = _attempt_load(p) |
|
|
if obj is None: |
|
|
logger.warning("All attempts to load language.bin failed. See _load_errors for details.") |
|
|
else: |
|
|
_model = obj |
|
|
return obj |
|
|
|
|
|
def load(): |
|
|
""" |
|
|
Public loader. Returns the loaded model/object or None. |
|
|
""" |
|
|
global _model |
|
|
|
|
|
mod = _try_import_language_module() |
|
|
if mod is not None: |
|
|
_model = mod |
|
|
return _model |
|
|
|
|
|
obj = _load_language_bin_if_present() |
|
|
return obj |
|
|
|
|
|
|
|
|
try: |
|
|
load() |
|
|
except Exception as e: |
|
|
logger.warning(f"language.py loader encountered error during import: {e}") |
|
|
|
|
|
|
|
|
|
|
|
def model_info() -> dict: |
|
|
""" |
|
|
Return a small summary about the loaded model/object to help debugging. |
|
|
""" |
|
|
info = {"loaded": False, "type": None, "repr": None, "load_errors": list(_load_errors)[:20], "has_translate": False, "has_detect": False, "callable": False} |
|
|
if _model is None: |
|
|
return info |
|
|
info["loaded"] = True |
|
|
try: |
|
|
info["type"] = type(_model).__name__ |
|
|
except Exception: |
|
|
info["type"] = "<unknown>" |
|
|
try: |
|
|
info["repr"] = repr(_model)[:1000] |
|
|
except Exception: |
|
|
info["repr"] = "<unreprable>" |
|
|
try: |
|
|
info["has_translate"] = hasattr(_model, "translate") |
|
|
info["has_translate_to_en"] = hasattr(_model, "translate_to_en") |
|
|
info["has_translate_from_en"] = hasattr(_model, "translate_from_en") |
|
|
info["has_detect"] = hasattr(_model, "detect") or hasattr(_model, "detect_language") |
|
|
info["callable"] = callable(_model) |
|
|
if hasattr(_model, "__dir__"): |
|
|
try: |
|
|
info["dir"] = [n for n in dir(_model) if not n.startswith("_")] |
|
|
except Exception: |
|
|
info["dir"] = [] |
|
|
except Exception: |
|
|
pass |
|
|
return info |
|
|
|
|
|
def _safe_call_translate(text: str, src: str, tgt: str) -> str: |
|
|
""" |
|
|
Try multiple call patterns to invoke translation functions on the loaded object. |
|
|
Fall back to returning original text if nothing works. |
|
|
""" |
|
|
if _model is None: |
|
|
return text |
|
|
|
|
|
try: |
|
|
if hasattr(_model, "translate"): |
|
|
try: |
|
|
return _model.translate(text, src, tgt) |
|
|
except TypeError: |
|
|
try: |
|
|
|
|
|
return _model.translate(text, f"{src}->{tgt}") |
|
|
except Exception: |
|
|
pass |
|
|
except Exception as e: |
|
|
logger.debug(f"_model.translate attempt failed: {e}") |
|
|
|
|
|
|
|
|
try: |
|
|
if tgt.lower() in ("en", "eng") and hasattr(_model, "translate_to_en"): |
|
|
return _model.translate_to_en(text, src) |
|
|
except Exception as e: |
|
|
logger.debug(f"_model.translate_to_en attempt failed: {e}") |
|
|
try: |
|
|
if src.lower() in ("en", "eng") and hasattr(_model, "translate_from_en"): |
|
|
return _model.translate_from_en(text, tgt) |
|
|
except Exception as e: |
|
|
logger.debug(f"_model.translate_from_en attempt failed: {e}") |
|
|
|
|
|
|
|
|
try: |
|
|
if callable(_model): |
|
|
try: |
|
|
return _model(text, src, tgt) |
|
|
except TypeError: |
|
|
try: |
|
|
return _model(text, src) |
|
|
except TypeError: |
|
|
try: |
|
|
return _model(text) |
|
|
except Exception: |
|
|
pass |
|
|
except Exception as e: |
|
|
logger.debug(f"_model callable attempts failed: {e}") |
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
tokenizer = getattr(_model, "tokenizer", None) |
|
|
generate = getattr(_model, "generate", None) |
|
|
if tokenizer and generate: |
|
|
inputs = tokenizer([text], return_tensors="pt", truncation=True) |
|
|
outputs = _model.generate(**inputs, max_length=1024) |
|
|
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] |
|
|
return decoded |
|
|
except Exception as e: |
|
|
logger.debug(f"_model HF-style generate attempt failed: {e}") |
|
|
|
|
|
|
|
|
try: |
|
|
if isinstance(_model, dict): |
|
|
key = (src, tgt) |
|
|
if key in _model: |
|
|
val = _model[key] |
|
|
if callable(val): |
|
|
return val(text) |
|
|
if isinstance(val, str): |
|
|
return val |
|
|
key2 = f"{src}->{tgt}" |
|
|
if key2 in _model: |
|
|
val = _model[key2] |
|
|
if callable(val): |
|
|
return val(text) |
|
|
if isinstance(val, str): |
|
|
return val |
|
|
except Exception as e: |
|
|
logger.debug(f"_model dict-like attempt failed: {e}") |
|
|
|
|
|
|
|
|
return text |
|
|
|
|
|
def translate(text: str, src: str, tgt: str) -> str: |
|
|
if not text: |
|
|
return text |
|
|
return _safe_call_translate(text, src or "und", tgt or "und") |
|
|
|
|
|
def translate_to_en(text: str, src: str) -> str: |
|
|
if not text: |
|
|
return text |
|
|
|
|
|
try: |
|
|
if _model is not None and hasattr(_model, "translate_to_en"): |
|
|
return _model.translate_to_en(text, src) |
|
|
except Exception: |
|
|
pass |
|
|
return translate(text, src, "en") |
|
|
|
|
|
def translate_from_en(text: str, tgt: str) -> str: |
|
|
if not text: |
|
|
return text |
|
|
try: |
|
|
if _model is not None and hasattr(_model, "translate_from_en"): |
|
|
return _model.translate_from_en(text, tgt) |
|
|
except Exception: |
|
|
pass |
|
|
return translate(text, "en", tgt) |
|
|
|
|
|
def detect(text: str) -> str: |
|
|
""" |
|
|
Call detection if the model exposes it. Returns None if not available. |
|
|
""" |
|
|
if not text: |
|
|
return None |
|
|
if _model is None: |
|
|
return None |
|
|
try: |
|
|
if hasattr(_model, "detect_language"): |
|
|
return _model.detect_language(text) |
|
|
if hasattr(_model, "detect"): |
|
|
return _model.detect(text) |
|
|
except Exception as e: |
|
|
logger.debug(f"model detect attempt failed: {e}") |
|
|
return None |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
import sys |
|
|
print("model_info:", model_info()) |
|
|
if len(sys.argv) >= 4: |
|
|
src = sys.argv[1] |
|
|
tgt = sys.argv[2] |
|
|
txt = " ".join(sys.argv[3:]) |
|
|
print("translate:", translate(txt, src, tgt)) |
|
|
else: |
|
|
print("Usage: python language.py <src> <tgt> <text...>") |
|
|
print("Example: python language.py es en 'hola mundo'") |