Spaces:

Princeaka
/

justiceai

Running

App Files Files Community

Princeaka committed 9 days ago

Commit

55ce9fc

verified ·

1 Parent(s): 769f383

Update language.py

Browse files

Files changed (1) hide show

language.py +260 -102

language.py CHANGED Viewed

@@ -1,117 +1,276 @@
-# language.py — wrapper to expose a stable translation API for JusticeAI.
-# Tries to load language.bin (torch.load then pickle). Adapts common shapes and exposes:
-# - translate(text, src, tgt)
-# - translate_to_en(text, src)
-# - translate_from_en(text, tgt)
-# Also exposes model_info() for debugging/inspection.
 from pathlib import Path
 import logging
-import pickle
-import types
 logger = logging.getLogger("local_language")
 logger.setLevel(logging.INFO)
 _model = None
-_model_meta = {}
-def _load_bin(path: Path):
-    global _model
     try:
         import torch
-        _model = torch.load(str(path), map_location="cpu")
-        logger.info("Loaded language.bin via torch.load")
-        return
     except Exception as e:
-        logger.info(f"torch.load failed for language.bin: {e}")
     try:
         with open(path, "rb") as f:
-            _model = pickle.load(f)
-        logger.info("Loaded language.bin via pickle")
-        return
     except Exception as e:
-        logger.warning(f"pickle load failed for language.bin: {e}")
-    _model = None
-def _ensure_loaded():
     global _model
-    if _model is not None:
-        return
     p = Path("language.bin")
-    if p.exists():
-        _load_bin(p)
     else:
-        logger.info("language.bin not found in cwd")
-def model_info():
-    _ensure_loaded()
     if _model is None:
-        return {"loaded": False}
-    info = {"loaded": True, "type": type(_model).__name__}
     try:
-        info["repr"] = repr(_model)[:800]
     except Exception:
         info["repr"] = "<unreprable>"
-    info["has_translate"] = hasattr(_model, "translate")
-    info["has_translate_to_en"] = hasattr(_model, "translate_to_en")
-    info["has_translate_from_en"] = hasattr(_model, "translate_from_en")
-    info["callable"] = callable(_model)
-    info["dir"] = [n for n in dir(_model) if not n.startswith("_")]
     return info
-def translate(text: str, src: str, tgt: str) -> str:
-    _ensure_loaded()
-    if not text:
-        return text
     if _model is None:
         return text
-    # 1) object has translate(text, src, tgt) or translate_to_en
     try:
         if hasattr(_model, "translate"):
             try:
                 return _model.translate(text, src, tgt)
             except TypeError:
                 try:
                     return _model.translate(text, f"{src}->{tgt}")
                 except Exception:
                     pass
-        if hasattr(_model, "translate_to_en") and tgt.lower() in ("en", "eng"):
-            try:
-                return _model.translate_to_en(text, src)
-            except Exception:
-                pass
-        if hasattr(_model, "translate_from_en") and src.lower() in ("en", "eng"):
-            try:
-                return _model.translate_from_en(text, tgt)
-            except Exception:
-                pass
     except Exception as e:
-        logger.debug(f"model.translate() attempt failed: {e}")
-    # 2) callable model (e.g., simple function)
     try:
         if callable(_model):
             try:
                 return _model(text, src, tgt)
             except TypeError:
                 try:
-                    return _model(text)
-                except Exception:
-                    pass
     except Exception as e:
-        logger.debug(f"callable model attempt failed: {e}")
-    # 3) dict-like mapping (('src','tgt') -> function or string)
     try:
         if isinstance(_model, dict):
             key = (src, tgt)
             if key in _model:
-                fn = _model[key]
-                if callable(fn):
-                    return fn(text)
-                if isinstance(fn, str):
-                    return fn
             key2 = f"{src}->{tgt}"
             if key2 in _model:
                 val = _model[key2]
@@ -120,64 +279,63 @@ def translate(text: str, src: str, tgt: str) -> str:
                 if isinstance(val, str):
                     return val
     except Exception as e:
-        logger.debug(f"dict-like model attempt failed: {e}")
-    # 4) HF-like object: has .generate and maybe a tokenizer at _model.tokenizer
-    try:
-        m = _model
-        tokenizer = getattr(m, "tokenizer", None)
-        if tokenizer and hasattr(m, "generate"):
-            inputs = tokenizer([text], return_tensors="pt", truncation=True)
-            outputs = m.generate(**inputs, max_length=1024)
-            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-            return decoded
-    except Exception as e:
-        logger.debug(f"HF-like model attempt failed: {e}")
-    # 5) nothing matched — return original text
     return text
 def translate_to_en(text: str, src: str) -> str:
     if not text:
         return text
-    _ensure_loaded()
-    if _model is not None and hasattr(_model, "translate_to_en"):
-        try:
             return _model.translate_to_en(text, src)
-        except Exception:
-            pass
     return translate(text, src, "en")
 def translate_from_en(text: str, tgt: str) -> str:
     if not text:
         return text
-    _ensure_loaded()
-    if _model is not None and hasattr(_model, "translate_from_en"):
-        try:
             return _model.translate_from_en(text, tgt)
-        except Exception:
-            pass
     return translate(text, "en", tgt)
-# Optional: expose a detect function if the model has one, else None
-def detect_language(text: str) -> str:
-    _ensure_loaded()
     if _model is None:
         return None
-    for candidate in ("detect", "detect_language", "lang", "language"):
-        if hasattr(_model, candidate):
-            try:
-                return getattr(_model, candidate)(text)
-            except Exception:
-                pass
     return None
 if __name__ == "__main__":
-    # simple CLI debug
     import sys
-    _ensure_loaded()
     print("model_info:", model_info())
     if len(sys.argv) >= 4:
-        _, src, tgt, *txt = sys.argv
-        txt = " ".join(txt)
-        print("translate:", translate(txt, src, tgt))

+"""
+language.py — robust loader + adapter for language.bin
+This loader attempts multiple safe options to load a local language model file `language.bin`
+and adapt it into a small, predictable translation API:
+  - translate(text, src, tgt)
+  - translate_to_en(text, src)
+  - translate_from_en(text, tgt)
+  - detect(text) / detect_language(text) (if provided by model)
+  - model_info() for debugging
+Loading strategy (in order):
+  1. If a language.py module is present (importable) we prefer it (the app already tries this).
+  2. If language.bin exists:
+     - Try to detect if it's a safetensors file and (if safetensors is installed) attempt to load.
+     - Try torch.load with weights_only=True (safe for "weights-only" files).
+     - If that fails and you explicitly allow insecure loading, try torch.load(..., weights_only=False).
+       To allow this, set the environment variable LANGUAGE_LOAD_ALLOW_INSECURE=1.
+       NOTE: loading with weights_only=False may execute arbitrary code from the file. Only do this
+       when you trust the source of language.bin.
+     - Try pickle.load as a last attempt (may fail for many binary formats).
+  3. Fallback: no model loaded (the app will fall back to heuristics).
+Security note:
+  - Re-running torch.load with weights_only=False can run arbitrary code embedded in the file.
+    Only enable LANGUAGE_LOAD_ALLOW_INSECURE if you trust the file origin.
+"""
 from pathlib import Path
 import logging
+import importlib
+import io
+import sys
 logger = logging.getLogger("local_language")
 logger.setLevel(logging.INFO)
 _model = None
+_load_errors = []
+def _try_import_language_module():
+    """
+    Try to import a module named ``language`` and return it, or None.
+
+    Import failures are recorded in ``_load_errors``. A module that resolves to this
+    loader itself (same module object, or same source file) is rejected: adopting it
+    as ``_model`` would make translate() delegate back to itself, and load() would
+    return before language.bin is ever tried.
+    """
+    try:
+        mod = importlib.import_module("language")
+    except Exception as e:
+        _load_errors.append(("import_language_py", repr(e)))
+        return None
+    # Self-import guard: compare module identity first, then the resolved file path
+    # (covers this file having been loaded under a different module name).
+    is_self = mod is sys.modules.get(__name__)
+    own_file = globals().get("__file__")
+    mod_file = getattr(mod, "__file__", None)
+    if not is_self and own_file and mod_file:
+        try:
+            is_self = Path(own_file).resolve() == Path(mod_file).resolve()
+        except (OSError, RuntimeError):
+            is_self = False
+    if is_self:
+        _load_errors.append(("import_language_py", "resolved to this loader module itself; skipped"))
+        return None
+    logger.info("Found importable language.py module; using it.")
+    return mod
+def _is_likely_safetensors(path: Path) -> bool:
+    """
+    Return True when `path` looks like a safetensors file.
+
+    A `.safetensors` filename is accepted as-is (no file access). Otherwise the first
+    bytes are sniffed: a safetensors file starts with an 8-byte little-endian unsigned
+    header length followed by a JSON header, which begins with "{". Missing or
+    unreadable files yield False.
+    """
+    if path.name.endswith(".safetensors"):
+        return True
+    try:
+        with open(path, "rb") as f:
+            prefix = f.read(9)
+        if len(prefix) < 9:
+            return False
+        header_len = int.from_bytes(prefix[:8], "little")
+        # The declared JSON header must fit inside the file and open with "{".
+        return prefix[8:9] == b"{" and 0 < header_len <= path.stat().st_size - 8
+    except OSError:
+        return False
+def _try_safetensors_load(path: Path):
+    """
+    Load `path` with safetensors' torch loader and return the resulting tensor dict.
+
+    Returns None (recording the reason in `_load_errors`) when safetensors cannot be
+    imported or when loading the file raises.
+    """
+    try:
+        from safetensors.torch import load_file as st_load  # type: ignore
+    except Exception as import_exc:
+        _load_errors.append(("safetensors_not_installed", repr(import_exc)))
+        return None
+    try:
+        tensor_dict = st_load(str(path))
+        logger.info("Loaded safetensors file into tensor dict (language.bin treated as safetensors).")
+    except Exception as load_exc:
+        _load_errors.append(("safetensors_load_failed", repr(load_exc)))
+        return None
+    # The raw dict is handed back unchanged; adapting it is left to the caller.
+    return tensor_dict
+def _try_torch_load(path: Path, weights_only: bool):
+    """
+    Load `path` with torch.load on CPU and return the object, or None on failure.
+
+    Args:
+        path: file to load.
+        weights_only: forwarded to torch.load. When True, this function never falls
+            back to a load without that argument, because such a load is a full
+            (code-executing) unpickle.
+
+    Every failure is recorded in `_load_errors`.
+    """
     try:
         import torch
     except Exception as e:
+        _load_errors.append(("torch_not_installed", repr(e)))
+        return None
+    try:
+        # In PyTorch 2.6+, torch.load defaults weights_only=True. Passing explicitly for clarity.
+        obj = torch.load(str(path), map_location="cpu", weights_only=weights_only)
+        logger.info(f"torch.load succeeded (weights_only={weights_only}).")
+        return obj
+    except TypeError as e:
+        if weights_only:
+            # A torch that rejects the weights_only kwarg can only do an unrestricted load.
+            # The caller asked for the safe mode, so refuse instead of silently downgrading.
+            _load_errors.append(("torch_load_weights_only_unsupported", repr(e)))
+            return None
+        # Older torch versions don't accept weights_only kwarg; try without it (older API)
+        try:
+            obj = torch.load(str(path), map_location="cpu")
+            logger.info("torch.load succeeded (no weights_only argument supported by local torch).")
+            return obj
+        except Exception as e2:
+            _load_errors.append(("torch_load_typeerror_then_failed", repr(e2)))
+            return None
+    except Exception as e:
+        _load_errors.append((f"torch_load_failed_weights_only={weights_only}", repr(e)))
+        return None
+def _try_pickle_load(path: Path):
+    """
+    Unpickle `path` with the standard `pickle` module and return the object.
+
+    Returns None and records the error in `_load_errors` if opening or unpickling fails.
+    SECURITY: pickle.load can execute arbitrary code contained in the file; only use
+    this on a file whose origin is trusted.
+    """
     try:
+        import pickle
         with open(path, "rb") as f:
+            obj = pickle.load(f)
+        logger.info("Loaded language.bin via pickle.")
+        return obj
     except Exception as e:
+        _load_errors.append(("pickle_load_failed", repr(e)))
+        return None
+def _attempt_load(path: Path):
+    """
+    Run the loading strategies for `path` in order and return the first object obtained.
+
+    Order: safetensors (only if the file looks like one), torch.load with
+    weights_only=True, torch.load with weights_only=False (only when the environment
+    variable LANGUAGE_LOAD_ALLOW_INSECURE is "1", "true" or "yes"), then pickle.
+    Returns None when every strategy fails.
+    """
+    # Imported locally: the file's import block does not bring `os` into scope, and
+    # without it the environment lookup below raises NameError before pickle is tried.
+    import os
+    # 1) Safetensors heuristics
+    if _is_likely_safetensors(path):
+        logger.info("language.bin looks like safetensors (by filename). Attempting safetensors load.")
+        obj = _try_safetensors_load(path)
+        if obj is not None:
+            return obj
+    # 2) Try torch.load in safe (weights-only) mode first (PyTorch 2.6+ default is weights_only=True)
+    obj = _try_torch_load(path, weights_only=True)
+    if obj is not None:
+        return obj
+    # 3) If env var allows insecure loading, try weights_only=False (dangerous)
+    allow_insecure = str(os.environ.get("LANGUAGE_LOAD_ALLOW_INSECURE", "")).lower() in ("1", "true", "yes")
+    if allow_insecure:
+        logger.warning("LANGUAGE_LOAD_ALLOW_INSECURE is set -> attempting torch.load with weights_only=False (INSECURE).")
+        obj = _try_torch_load(path, weights_only=False)
+        if obj is not None:
+            return obj
+        logger.warning("torch.load(weights_only=False) failed or returned None.")
+    # 4) Try pickle as last resort
+    # NOTE(security): pickle.load can execute arbitrary code from the file, and this step
+    # is NOT gated by LANGUAGE_LOAD_ALLOW_INSECURE. Behavior kept as-is; consider gating it.
+    obj = _try_pickle_load(path)
+    if obj is not None:
+        return obj
+    return None
+def _load_language_bin_if_present():
+    """
+    Load `language.bin` (relative path, so resolved against the current working
+    directory) and store the result in the module-level `_model`.
+
+    Returns the loaded object, or None when the file does not exist or every loading
+    strategy failed; in both of those cases `_model` is left unchanged.
+    """
     global _model
     p = Path("language.bin")
+    if not p.exists():
+        return None
+    logger.info("language.bin found; attempting to load with safe fallbacks...")
+    # Try multiple strategies
+    obj = _attempt_load(p)
+    if obj is None:
+        logger.warning("All attempts to load language.bin failed. See _load_errors for details.")
     else:
+        _model = obj
+    return obj
+def load():
+    """
+    Public entry point for loading: return the loaded model/object, or None.
+
+    An importable `language` module takes priority and becomes `_model`; otherwise
+    language.bin is tried.
+    """
+    global _model
+    module_candidate = _try_import_language_module()
+    if module_candidate is None:
+        # No module available: fall back to the binary file (it sets _model on success).
+        return _load_language_bin_if_present()
+    _model = module_candidate
+    return _model
+# Run load on import (app calls load_local_language_module separately too)
+# Any exception raised by load() is logged as a warning instead of propagating,
+# so a loading problem does not make importing this module fail.
+try:
+    load()
+except Exception as e:
+    logger.warning(f"language.py loader encountered error during import: {e}")
+# --- Adapter / API functions the app expects --- #
+def model_info() -> dict:
+    """
+    Return a small summary about the loaded model/object to help debugging.
+
+    The dict always contains "loaded", "type", "repr", "load_errors", "has_translate",
+    "has_detect" and "callable". When a model is loaded it may additionally contain
+    "has_translate_to_en", "has_translate_from_en" and "dir".
+    """
+    # load_errors is a copy capped at the first 20 recorded entries.
+    info = {"loaded": False, "type": None, "repr": None, "load_errors": list(_load_errors)[:20], "has_translate": False, "has_detect": False, "callable": False}
     if _model is None:
+        return info
+    info["loaded"] = True
     try:
+        info["type"] = type(_model).__name__
+    except Exception:
+        info["type"] = "<unknown>"
+    try:
+        # repr is truncated to 1000 characters.
+        info["repr"] = repr(_model)[:1000]
     except Exception:
         info["repr"] = "<unreprable>"
+    try:
+        info["has_translate"] = hasattr(_model, "translate")
+        info["has_translate_to_en"] = hasattr(_model, "translate_to_en")
+        info["has_translate_from_en"] = hasattr(_model, "translate_from_en")
+        info["has_detect"] = hasattr(_model, "detect") or hasattr(_model, "detect_language")
+        info["callable"] = callable(_model)
+        if hasattr(_model, "__dir__"):
+            try:
+                # Public attribute names only (no leading underscore).
+                info["dir"] = [n for n in dir(_model) if not n.startswith("_")]
+            except Exception:
+                info["dir"] = []
+    except Exception:
+        # Best-effort introspection: whatever was collected so far is still returned.
+        pass
     return info
+def _safe_call_translate(text: str, src: str, tgt: str) -> str:
+    """
+    Try multiple call patterns to invoke translation functions on the loaded object.
+    Fall back to returning original text if nothing works.
+
+    Order of attempts:
+      1. _model.translate(text, src, tgt), then translate(text, "src->tgt") on TypeError
+      2. _model.translate_to_en / translate_from_en when tgt / src is "en" or "eng"
+      3. calling _model itself with (text, src, tgt), then (text, src), then (text)
+      4. _model.tokenizer + _model.generate
+      5. dict lookup by (src, tgt) tuple key or "src->tgt" string key
+    A failed attempt is logged at debug level or ignored, and the next one is tried.
+    """
     if _model is None:
         return text
+    # 1) Preferred explicit API
     try:
         if hasattr(_model, "translate"):
             try:
                 return _model.translate(text, src, tgt)
             except TypeError:
                 try:
+                    # some translate implementations take (text, "src->tgt")
                     return _model.translate(text, f"{src}->{tgt}")
                 except Exception:
                     pass
     except Exception as e:
+        logger.debug(f"_model.translate attempt failed: {e}")
+    # 2) Dedicated helpers
+    try:
+        if tgt.lower() in ("en", "eng") and hasattr(_model, "translate_to_en"):
+            return _model.translate_to_en(text, src)
+    except Exception as e:
+        logger.debug(f"_model.translate_to_en attempt failed: {e}")
+    try:
+        if src.lower() in ("en", "eng") and hasattr(_model, "translate_from_en"):
+            return _model.translate_from_en(text, tgt)
+    except Exception as e:
+        logger.debug(f"_model.translate_from_en attempt failed: {e}")
+    # 3) Callable model (call signature may vary)
     try:
         if callable(_model):
             try:
                 return _model(text, src, tgt)
             except TypeError:
                 try:
+                    return _model(text, src)  # maybe (text, src)
+                except TypeError:
+                    try:
+                        return _model(text)  # maybe (text)
+                    except Exception:
+                        pass
     except Exception as e:
+        logger.debug(f"_model callable attempts failed: {e}")
+    # 4) HF-style model object with attached tokenizer (best-effort)
+    try:
+        # model could be a dict of tensors (weights-only) - not directly usable for translation
+        tokenizer = getattr(_model, "tokenizer", None)
+        generate = getattr(_model, "generate", None)
+        if tokenizer and generate:
+            inputs = tokenizer([text], return_tensors="pt", truncation=True)
+            outputs = _model.generate(**inputs, max_length=1024)
+            # Only the first decoded sequence is returned (a single text was tokenized).
+            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+            return decoded
+    except Exception as e:
+        logger.debug(f"_model HF-style generate attempt failed: {e}")
+    # 5) dict-like mapping (('src','tgt') -> fn or str)
     try:
         if isinstance(_model, dict):
             key = (src, tgt)
             if key in _model:
+                val = _model[key]
+                if callable(val):
+                    return val(text)
+                if isinstance(val, str):
+                    return val
             key2 = f"{src}->{tgt}"
             if key2 in _model:
                 val = _model[key2]
                 if isinstance(val, str):
                     return val
     except Exception as e:
+        logger.debug(f"_model dict-like attempt failed: {e}")
+    # Nothing worked: return input (no hallucination)
     return text
+def translate(text: str, src: str, tgt: str) -> str:
+    """
+    Translate `text` from `src` to `tgt` via the loaded model.
+
+    Falsy text is returned unchanged; a falsy language code is passed on as "und".
+    """
+    if text:
+        source_code = src if src else "und"
+        target_code = tgt if tgt else "und"
+        return _safe_call_translate(text, source_code, target_code)
+    return text
 def translate_to_en(text: str, src: str) -> str:
+    """
+    Translate `text` from `src` into English.
+
+    Falsy text is returned unchanged. The model's own translate_to_en is used when it
+    exists; if it is missing or raises, this falls back to translate(text, src, "en").
+    """
     if not text:
         return text
+    # prefer dedicated helper if present
+    try:
+        if _model is not None and hasattr(_model, "translate_to_en"):
             return _model.translate_to_en(text, src)
+    except Exception:
+        # Helper failure is ignored on purpose; the generic path below is the fallback.
+        pass
     return translate(text, src, "en")
 def translate_from_en(text: str, tgt: str) -> str:
+    """
+    Translate English `text` into `tgt`.
+
+    Falsy text is returned unchanged. The model's own translate_from_en is used when it
+    exists; if it is missing or raises, this falls back to translate(text, "en", tgt).
+    """
     if not text:
         return text
+    try:
+        if _model is not None and hasattr(_model, "translate_from_en"):
             return _model.translate_from_en(text, tgt)
+    except Exception:
+        # Helper failure is ignored on purpose; the generic path below is the fallback.
+        pass
     return translate(text, "en", tgt)
+def detect(text: str) -> str:
+    """
+    Call detection if the model exposes it. Returns None if not available.
+
+    The model's detect_language is preferred over detect. None is also returned for
+    falsy text, when no model is loaded, or when the model's detector raises
+    (note: the `-> str` annotation does not reflect these None returns).
+    """
+    if not text:
+        return None
     if _model is None:
         return None
+    try:
+        if hasattr(_model, "detect_language"):
+            return _model.detect_language(text)
+        if hasattr(_model, "detect"):
+            return _model.detect(text)
+    except Exception as e:
+        logger.debug(f"model detect attempt failed: {e}")
     return None
+# Small helper for CLI testing
+# Prints model_info(), then either translates the given text or prints usage.
 if __name__ == "__main__":
     import sys
     print("model_info:", model_info())
     if len(sys.argv) >= 4:
+        src = sys.argv[1]
+        tgt = sys.argv[2]
+        # Everything after <src> <tgt> is joined with single spaces into the text.
+        txt = " ".join(sys.argv[3:])
+        print("translate:", translate(txt, src, tgt))
+    else:
+        print("Usage: python language.py <src> <tgt> <text...>")
+        print("Example: python language.py es en 'hola mundo'")