Princeaka committed on
Commit
55ce9fc
·
verified ·
1 Parent(s): 769f383

Update language.py

Browse files
Files changed (1) hide show
  1. language.py +260 -102
language.py CHANGED
@@ -1,117 +1,276 @@
1
- # language.py — wrapper to expose a stable translation API for JusticeAI.
2
- # Tries to load language.bin (torch.load then pickle). Adapts common shapes and exposes:
3
- # - translate(text, src, tgt)
4
- # - translate_to_en(text, src)
5
- # - translate_from_en(text, tgt)
6
- # Also exposes model_info() for debugging/inspection.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from pathlib import Path
8
  import logging
9
- import pickle
10
- import types
 
11
 
12
  logger = logging.getLogger("local_language")
13
  logger.setLevel(logging.INFO)
14
 
15
  _model = None
16
- _model_meta = {}
17
 
18
- def _load_bin(path: Path):
19
- global _model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
  import torch
22
- _model = torch.load(str(path), map_location="cpu")
23
- logger.info("Loaded language.bin via torch.load")
24
- return
25
  except Exception as e:
26
- logger.info(f"torch.load failed for language.bin: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  try:
 
28
  with open(path, "rb") as f:
29
- _model = pickle.load(f)
30
- logger.info("Loaded language.bin via pickle")
31
- return
32
  except Exception as e:
33
- logger.warning(f"pickle load failed for language.bin: {e}")
34
- _model = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- def _ensure_loaded():
37
  global _model
38
- if _model is not None:
39
- return
40
  p = Path("language.bin")
41
- if p.exists():
42
- _load_bin(p)
 
 
 
 
 
43
  else:
44
- logger.info("language.bin not found in cwd")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- def model_info():
47
- _ensure_loaded()
 
 
 
48
  if _model is None:
49
- return {"loaded": False}
50
- info = {"loaded": True, "type": type(_model).__name__}
51
  try:
52
- info["repr"] = repr(_model)[:800]
 
 
 
 
53
  except Exception:
54
  info["repr"] = "<unreprable>"
55
- info["has_translate"] = hasattr(_model, "translate")
56
- info["has_translate_to_en"] = hasattr(_model, "translate_to_en")
57
- info["has_translate_from_en"] = hasattr(_model, "translate_from_en")
58
- info["callable"] = callable(_model)
59
- info["dir"] = [n for n in dir(_model) if not n.startswith("_")]
 
 
 
 
 
 
 
 
60
  return info
61
 
62
- def translate(text: str, src: str, tgt: str) -> str:
63
- _ensure_loaded()
64
- if not text:
65
- return text
 
66
  if _model is None:
67
  return text
68
-
69
- # 1) object has translate(text, src, tgt) or translate_to_en
70
  try:
71
  if hasattr(_model, "translate"):
72
  try:
73
  return _model.translate(text, src, tgt)
74
  except TypeError:
75
  try:
 
76
  return _model.translate(text, f"{src}->{tgt}")
77
  except Exception:
78
  pass
79
- if hasattr(_model, "translate_to_en") and tgt.lower() in ("en", "eng"):
80
- try:
81
- return _model.translate_to_en(text, src)
82
- except Exception:
83
- pass
84
- if hasattr(_model, "translate_from_en") and src.lower() in ("en", "eng"):
85
- try:
86
- return _model.translate_from_en(text, tgt)
87
- except Exception:
88
- pass
89
  except Exception as e:
90
- logger.debug(f"model.translate() attempt failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- # 2) callable model (e.g., simple function)
93
  try:
94
  if callable(_model):
95
  try:
96
  return _model(text, src, tgt)
97
  except TypeError:
98
  try:
99
- return _model(text)
100
- except Exception:
101
- pass
 
 
 
102
  except Exception as e:
103
- logger.debug(f"callable model attempt failed: {e}")
104
 
105
- # 3) dict-like mapping (('src','tgt') -> function or string)
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  try:
107
  if isinstance(_model, dict):
108
  key = (src, tgt)
109
  if key in _model:
110
- fn = _model[key]
111
- if callable(fn):
112
- return fn(text)
113
- if isinstance(fn, str):
114
- return fn
115
  key2 = f"{src}->{tgt}"
116
  if key2 in _model:
117
  val = _model[key2]
@@ -120,64 +279,63 @@ def translate(text: str, src: str, tgt: str) -> str:
120
  if isinstance(val, str):
121
  return val
122
  except Exception as e:
123
- logger.debug(f"dict-like model attempt failed: {e}")
124
 
125
- # 4) HF-like object: has .generate and maybe a tokenizer at _model.tokenizer
126
- try:
127
- m = _model
128
- tokenizer = getattr(m, "tokenizer", None)
129
- if tokenizer and hasattr(m, "generate"):
130
- inputs = tokenizer([text], return_tensors="pt", truncation=True)
131
- outputs = m.generate(**inputs, max_length=1024)
132
- decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
133
- return decoded
134
- except Exception as e:
135
- logger.debug(f"HF-like model attempt failed: {e}")
136
-
137
- # 5) nothing matched — return original text
138
  return text
139
 
 
 
 
 
 
140
  def translate_to_en(text: str, src: str) -> str:
141
  if not text:
142
  return text
143
- _ensure_loaded()
144
- if _model is not None and hasattr(_model, "translate_to_en"):
145
- try:
146
  return _model.translate_to_en(text, src)
147
- except Exception:
148
- pass
149
  return translate(text, src, "en")
150
 
151
  def translate_from_en(text: str, tgt: str) -> str:
152
  if not text:
153
  return text
154
- _ensure_loaded()
155
- if _model is not None and hasattr(_model, "translate_from_en"):
156
- try:
157
  return _model.translate_from_en(text, tgt)
158
- except Exception:
159
- pass
160
  return translate(text, "en", tgt)
161
 
162
- # Optional: expose a detect function if the model has one, else None
163
- def detect_language(text: str) -> str:
164
- _ensure_loaded()
 
 
 
165
  if _model is None:
166
  return None
167
- for candidate in ("detect", "detect_language", "lang", "language"):
168
- if hasattr(_model, candidate):
169
- try:
170
- return getattr(_model, candidate)(text)
171
- except Exception:
172
- pass
 
173
  return None
174
 
 
175
  if __name__ == "__main__":
176
- # simple CLI debug
177
  import sys
178
- _ensure_loaded()
179
  print("model_info:", model_info())
180
  if len(sys.argv) >= 4:
181
- _, src, tgt, *txt = sys.argv
182
- txt = " ".join(txt)
183
- print("translate:", translate(txt, src, tgt))
 
 
 
 
 
1
+ """
2
+ language.py robust loader + adapter for language.bin
3
+
4
+ This loader attempts multiple safe options to load a local language model file `language.bin`
5
+ and adapt it into a small, predictable translation API:
6
+ - translate(text, src, tgt)
7
+ - translate_to_en(text, src)
8
+ - translate_from_en(text, tgt)
9
+ - detect(text) / detect_language(text) (if provided by model)
10
+ - model_info() for debugging
11
+
12
+ Loading strategy (in order):
13
+ 1. If a language.py module is present (importable) we prefer it (the app already tries this).
14
+ 2. If language.bin exists:
15
+ - Try to detect if it's a safetensors file and (if safetensors is installed) attempt to load.
16
+ - Try torch.load with weights_only=True (safe for "weights-only" files).
17
+ - If that fails and you explicitly allow insecure loading, try torch.load(..., weights_only=False).
18
+ To allow this, set the environment variable LANGUAGE_LOAD_ALLOW_INSECURE=1.
19
+ NOTE: loading with weights_only=False may execute arbitrary code from the file. Only do this
20
+ when you trust the source of language.bin.
21
+ - Try pickle.load as a last attempt (may fail for many binary formats).
22
+ 3. Fallback: no model loaded (the app will fall back to heuristics).
23
+
24
+ Security note:
25
+ - Re-running torch.load with weights_only=False can run arbitrary code embedded in the file.
26
+ Only enable LANGUAGE_LOAD_ALLOW_INSECURE if you trust the file origin.
27
+ """
28
+
29
  from pathlib import Path
30
  import logging
31
+ import importlib
32
+ import io
33
+ import sys
34
 
35
  logger = logging.getLogger("local_language")
36
  logger.setLevel(logging.INFO)
37
 
38
  _model = None
39
+ _load_errors = []
40
 
41
def _try_import_language_module():
    """Import a separate, importable ``language`` module if one exists.

    This loader is itself named ``language.py``. Importing ``"language"``
    from inside it would either hand back this half-initialised module (when
    it was imported under that name) or re-execute this file (when it is run
    as a script). In both cases ``_model`` would become this loader itself:
    ``language.bin`` would never be tried and the translate helpers would
    call back into themselves. Those cases are detected and skipped.

    Returns:
        The imported module, or ``None`` when no distinct module is
        available. The reason is appended to ``_load_errors``.
    """
    # Local import: the file only imports the top-level ``importlib``
    # package, which does not guarantee the ``util`` submodule is loaded.
    import importlib.util

    if __name__ == "language":
        _load_errors.append(
            ("import_language_py", "skipped: this loader is the 'language' module itself")
        )
        return None
    try:
        spec = importlib.util.find_spec("language")
        origin = getattr(spec, "origin", None)
        own_file = globals().get("__file__")
        # Running as a script (__name__ == "__main__"): "language" on
        # sys.path may still resolve to this very file.
        if origin and own_file and Path(origin).resolve() == Path(own_file).resolve():
            _load_errors.append(
                ("import_language_py", "skipped: 'language' resolves to this loader file")
            )
            return None
        mod = importlib.import_module("language")
        logger.info("Found importable language.py module; using it.")
        return mod
    except Exception as e:
        # Best effort: a missing or broken module just means "no module".
        _load_errors.append(("import_language_py", repr(e)))
        return None
50
+
51
def _is_likely_safetensors(path: Path) -> bool:
    """Return True when *path* looks like a safetensors file.

    Two checks are made:

    1. The file name ends with ``.safetensors``.
    2. The content starts like a safetensors file: an 8-byte little-endian
       unsigned header length that fits inside the file, immediately followed
       by a JSON header beginning with ``{``. This lets a safetensors payload
       stored under another name (such as ``language.bin``) be recognised.

    A file that is missing, unreadable or too short is reported as not
    safetensors; this function never raises for I/O problems.
    """
    import struct  # local import: not among the file's top-level imports

    if path.suffix == ".safetensors" or path.name.endswith(".safetensors"):
        return True
    try:
        with open(path, "rb") as fh:
            prefix = fh.read(9)
            if len(prefix) < 9:
                return False
            fh.seek(0, 2)  # seek to end to learn the total size
            total_size = fh.tell()
    except OSError:
        return False
    (header_len,) = struct.unpack("<Q", prefix[:8])
    # The header must be non-empty, fit in the file, and start with "{".
    return prefix[8:9] == b"{" and 0 < header_len <= total_size - 8
54
+
55
def _try_safetensors_load(path: Path):
    """Load *path* through ``safetensors.torch.load_file``.

    Returns whatever ``load_file`` produces, or ``None`` when the
    ``safetensors`` package cannot be imported or the load itself fails.
    Each failure is recorded in ``_load_errors``.
    """
    try:
        from safetensors.torch import load_file  # type: ignore
    except Exception as import_err:
        _load_errors.append(("safetensors_not_installed", repr(import_err)))
        return None

    try:
        loaded = load_file(str(path))
        logger.info("Loaded safetensors file into tensor dict (language.bin treated as safetensors).")
        # The raw result is handed back; adapting it is left to the caller.
        return loaded
    except Exception as load_err:
        _load_errors.append(("safetensors_load_failed", repr(load_err)))
        return None
69
+
70
def _try_torch_load(path: Path, weights_only: bool):
    """Load *path* with ``torch.load`` on the CPU.

    Args:
        path: File to load.
        weights_only: Passed straight to ``torch.load``. ``True`` requests
            the restricted, weights-only unpickler; ``False`` allows full
            unpickling, which can execute code embedded in the file.

    Returns:
        The loaded object, or ``None`` when torch is not importable or the
        load fails. Each failure is recorded in ``_load_errors``.
    """
    try:
        import torch
    except Exception as e:
        _load_errors.append(("torch_not_installed", repr(e)))
        return None
    try:
        obj = torch.load(str(path), map_location="cpu", weights_only=weights_only)
        logger.info(f"torch.load succeeded (weights_only={weights_only}).")
        return obj
    except TypeError as e:
        # A TypeError usually means this torch predates the weights_only
        # keyword. Retrying without it performs a full, unrestricted
        # unpickle, so only do that when the caller already opted out of
        # the safe mode. A "safe" request must never be silently downgraded.
        if weights_only:
            _load_errors.append((f"torch_load_failed_weights_only={weights_only}", repr(e)))
            return None
        try:
            obj = torch.load(str(path), map_location="cpu")
            logger.info("torch.load succeeded (no weights_only argument supported by local torch).")
            return obj
        except Exception as e2:
            _load_errors.append(("torch_load_typeerror_then_failed", repr(e2)))
            return None
    except Exception as e:
        _load_errors.append((f"torch_load_failed_weights_only={weights_only}", repr(e)))
        return None
93
+
94
def _try_pickle_load(path: Path):
    """Unpickle *path* with the standard ``pickle`` module.

    Returns the unpickled object, or ``None`` on any failure (recorded in
    ``_load_errors``).
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in
    # the file; only call this on files from a trusted source.
    try:
        import pickle

        with open(path, "rb") as fh:
            unpickled = pickle.load(fh)
            logger.info("Loaded language.bin via pickle.")
            return unpickled
    except Exception as err:
        _load_errors.append(("pickle_load_failed", repr(err)))
        return None
104
+
105
def _attempt_load(path: Path):
    """Try each loading strategy in turn and return the first object obtained.

    Order: safetensors (when the file looks like one), ``torch.load`` in
    weights-only mode, ``torch.load`` with full unpickling (only when the
    environment variable ``LANGUAGE_LOAD_ALLOW_INSECURE`` is ``1``, ``true``
    or ``yes``), then plain ``pickle``.

    Returns:
        The loaded object, or ``None`` when every strategy failed.
    """
    # Local import: ``os`` is not among the file's top-level imports, and
    # without it the environment lookup below raises NameError.
    import os

    # 1) Safetensors heuristics
    if _is_likely_safetensors(path):
        logger.info("language.bin looks like safetensors (by filename). Attempting safetensors load.")
        obj = _try_safetensors_load(path)
        if obj is not None:
            return obj

    # 2) Safe (weights-only) torch.load first
    obj = _try_torch_load(path, weights_only=True)
    if obj is not None:
        return obj

    # 3) Full unpickling through torch only on explicit opt-in (dangerous)
    flag = os.environ.get("LANGUAGE_LOAD_ALLOW_INSECURE", "").strip().lower()
    allow_insecure = flag in ("1", "true", "yes")
    if allow_insecure:
        logger.warning("LANGUAGE_LOAD_ALLOW_INSECURE is set -> attempting torch.load with weights_only=False (INSECURE).")
        obj = _try_torch_load(path, weights_only=False)
        if obj is not None:
            return obj
        logger.warning("torch.load(weights_only=False) failed or returned None.")

    # 4) Pickle as last resort.
    # SECURITY: pickle.load is as unsafe as torch.load(weights_only=False),
    # yet it runs here regardless of LANGUAGE_LOAD_ALLOW_INSECURE. Kept for
    # compatibility; consider gating it behind the same flag.
    return _try_pickle_load(path)
134
 
135
def _load_language_bin_if_present():
    """Load ``language.bin`` from the current working directory, if it exists.

    On success the loaded object is stored in the module-level ``_model`` and
    returned. Returns ``None`` when the file is absent or every load attempt
    failed; in the latter case a warning is logged.
    """
    global _model
    bin_path = Path("language.bin")
    if not bin_path.exists():
        return None

    logger.info("language.bin found; attempting to load with safe fallbacks...")
    loaded = _attempt_load(bin_path)
    if loaded is not None:
        _model = loaded
        return loaded

    logger.warning("All attempts to load language.bin failed. See _load_errors for details.")
    return None
148
+
149
def load():
    """Populate ``_model`` and return it (``None`` when nothing could be loaded).

    An importable ``language`` module takes precedence; otherwise
    ``language.bin`` is tried.
    """
    global _model
    imported = _try_import_language_module()
    if imported is None:
        # No module available: fall back to the binary file.
        return _load_language_bin_if_present()
    _model = imported
    return _model
162
+
163
+ # Run load on import (app calls load_local_language_module separately too)
164
# Load eagerly at import time. A loader failure is only logged so that
# importing this module never raises because of it.
try:
    load()
except Exception as import_time_err:
    logger.warning(f"language.py loader encountered error during import: {import_time_err}")
168
+
169
+ # --- Adapter / API functions the app expects --- #
170
 
171
def model_info() -> dict:
    """Return a small summary of the loaded model/object for debugging.

    The returned dict always has the same keys, whether or not a model is
    loaded: ``loaded``, ``type``, ``repr`` (truncated to 1000 characters),
    ``load_errors`` (at most the first 20 recorded loader errors),
    ``has_translate``, ``has_translate_to_en``, ``has_translate_from_en``,
    ``has_detect``, ``callable`` and ``dir`` (public attribute names).
    """
    info = {
        "loaded": False,
        "type": None,
        "repr": None,
        "load_errors": _load_errors[:20],
        "has_translate": False,
        "has_translate_to_en": False,
        "has_translate_from_en": False,
        "has_detect": False,
        "callable": False,
        "dir": [],
    }
    if _model is None:
        return info
    info["loaded"] = True
    try:
        info["type"] = type(_model).__name__
    except Exception:
        info["type"] = "<unknown>"
    try:
        info["repr"] = repr(_model)[:1000]
    except Exception:
        info["repr"] = "<unreprable>"
    try:
        info["has_translate"] = hasattr(_model, "translate")
        info["has_translate_to_en"] = hasattr(_model, "translate_to_en")
        info["has_translate_from_en"] = hasattr(_model, "translate_from_en")
        info["has_detect"] = hasattr(_model, "detect") or hasattr(_model, "detect_language")
        info["callable"] = callable(_model)
        # Last on purpose: if dir() fails, the flags above are already set
        # and "dir" keeps its empty default.
        info["dir"] = [n for n in dir(_model) if not n.startswith("_")]
    except Exception as e:
        # Introspection is best effort; keep the defaults for what is missing.
        logger.debug(f"model_info introspection failed: {e}")
    return info
201
 
202
+ def _safe_call_translate(text: str, src: str, tgt: str) -> str:
203
+ """
204
+ Try multiple call patterns to invoke translation functions on the loaded object.
205
+ Fall back to returning original text if nothing works.
206
+ """
207
  if _model is None:
208
  return text
209
+ # 1) Preferred explicit API
 
210
  try:
211
  if hasattr(_model, "translate"):
212
  try:
213
  return _model.translate(text, src, tgt)
214
  except TypeError:
215
  try:
216
+ # some translate implementations take (text, "src->tgt")
217
  return _model.translate(text, f"{src}->{tgt}")
218
  except Exception:
219
  pass
 
 
 
 
 
 
 
 
 
 
220
  except Exception as e:
221
+ logger.debug(f"_model.translate attempt failed: {e}")
222
+
223
+ # 2) Dedicated helpers
224
+ try:
225
+ if tgt.lower() in ("en", "eng") and hasattr(_model, "translate_to_en"):
226
+ return _model.translate_to_en(text, src)
227
+ except Exception as e:
228
+ logger.debug(f"_model.translate_to_en attempt failed: {e}")
229
+ try:
230
+ if src.lower() in ("en", "eng") and hasattr(_model, "translate_from_en"):
231
+ return _model.translate_from_en(text, tgt)
232
+ except Exception as e:
233
+ logger.debug(f"_model.translate_from_en attempt failed: {e}")
234
 
235
+ # 3) Callable model (call signature may vary)
236
  try:
237
  if callable(_model):
238
  try:
239
  return _model(text, src, tgt)
240
  except TypeError:
241
  try:
242
+ return _model(text, src) # maybe (text, src)
243
+ except TypeError:
244
+ try:
245
+ return _model(text) # maybe (text)
246
+ except Exception:
247
+ pass
248
  except Exception as e:
249
+ logger.debug(f"_model callable attempts failed: {e}")
250
 
251
+ # 4) HF-style model object with attached tokenizer (best-effort)
252
+ try:
253
+ # model could be a dict of tensors (weights-only) - not directly usable for translation
254
+ tokenizer = getattr(_model, "tokenizer", None)
255
+ generate = getattr(_model, "generate", None)
256
+ if tokenizer and generate:
257
+ inputs = tokenizer([text], return_tensors="pt", truncation=True)
258
+ outputs = _model.generate(**inputs, max_length=1024)
259
+ decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
260
+ return decoded
261
+ except Exception as e:
262
+ logger.debug(f"_model HF-style generate attempt failed: {e}")
263
+
264
+ # 5) dict-like mapping (('src','tgt') -> fn or str)
265
  try:
266
  if isinstance(_model, dict):
267
  key = (src, tgt)
268
  if key in _model:
269
+ val = _model[key]
270
+ if callable(val):
271
+ return val(text)
272
+ if isinstance(val, str):
273
+ return val
274
  key2 = f"{src}->{tgt}"
275
  if key2 in _model:
276
  val = _model[key2]
 
279
  if isinstance(val, str):
280
  return val
281
  except Exception as e:
282
+ logger.debug(f"_model dict-like attempt failed: {e}")
283
 
284
+ # Nothing worked: return input (no hallucination)
 
 
 
 
 
 
 
 
 
 
 
 
285
  return text
286
 
287
def translate(text: str, src: str, tgt: str) -> str:
    """Translate *text* from *src* to *tgt* using the loaded model.

    Empty or falsy *text* is returned unchanged. A falsy *src* or *tgt* is
    replaced by ``"und"`` before the call is delegated.
    """
    if text:
        return _safe_call_translate(text, src or "und", tgt or "und")
    return text
291
+
292
def translate_to_en(text: str, src: str) -> str:
    """Translate *text* from *src* into English.

    Empty or falsy *text* is returned unchanged. The model's own
    ``translate_to_en`` helper is preferred when it exists; if it is missing
    or raises, the call falls back to ``translate(text, src, "en")``.
    """
    if not text:
        return text
    try:
        if _model is not None and hasattr(_model, "translate_to_en"):
            return _model.translate_to_en(text, src)
    except Exception as e:
        # Still best effort, but leave a trace instead of failing silently.
        logger.debug(f"_model.translate_to_en attempt failed: {e}")
    return translate(text, src, "en")
302
 
303
def translate_from_en(text: str, tgt: str) -> str:
    """Translate English *text* into *tgt*.

    Empty or falsy *text* is returned unchanged. The model's own
    ``translate_from_en`` helper is preferred when it exists; if it is
    missing or raises, the call falls back to ``translate(text, "en", tgt)``.
    """
    if not text:
        return text
    try:
        if _model is not None and hasattr(_model, "translate_from_en"):
            return _model.translate_from_en(text, tgt)
    except Exception as e:
        # Still best effort, but leave a trace instead of failing silently.
        logger.debug(f"_model.translate_from_en attempt failed: {e}")
    return translate(text, "en", tgt)
312
 
313
def detect(text: str) -> "str | None":
    """Detect the language of *text* using the loaded model, if it can.

    The model's ``detect_language`` method is preferred, then ``detect``.

    Returns:
        Whatever the model's detector returns, or ``None`` when *text* is
        empty, no model is loaded, the model has no detector, or the
        detector raises.
    """
    if not text:
        return None
    if _model is None:
        return None
    try:
        if hasattr(_model, "detect_language"):
            return _model.detect_language(text)
        if hasattr(_model, "detect"):
            return _model.detect(text)
    except Exception as e:
        logger.debug(f"model detect attempt failed: {e}")
    return None


def detect_language(text: str) -> "str | None":
    """Alias of :func:`detect`, kept for callers that use this name."""
    return detect(text)
329
 
330
+ # Small helper for CLI testing
331
if __name__ == "__main__":
    # CLI smoke test: print the model summary, then translate the text
    # given on the command line.
    import sys

    print("model_info:", model_info())
    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        print("Usage: python language.py <src> <tgt> <text...>")
        print("Example: python language.py es en 'hola mundo'")
    else:
        src, tgt, *words = cli_args
        print("translate:", translate(" ".join(words), src, tgt))