neuralworm committed · Commit 0d9095f · 1 Parent(s): 0a1cc8d

fix for gemma

bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/llm_iface.py CHANGED
@@ -5,7 +5,7 @@ import torch, random, numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 from typing import List, Optional
 
-DEBUG = 1
+DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
 
 def dbg(*args):
     if DEBUG:
@@ -16,51 +16,60 @@ class LLM:
         self.model_id = model_id
         self.seed = seed
 
+        # Set all seeds for reproducibility
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        try:
+            torch.use_deterministic_algorithms(True, warn_only=True)
+        except Exception as e:
+            dbg(f"Could not set deterministic algorithms: {e}")
         set_seed(seed)
+
         token = os.environ.get("HF_TOKEN")
+        if not token and ("gemma-3" in model_id or "llama" in model_id):
+            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
         kwargs = {}
-        if torch.cuda.is_available():
-            kwargs["torch_dtype"] = torch.bfloat16
+        if dtype == "float16": kwargs["torch_dtype"] = torch.float16
+        elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
 
         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()
+        self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
 
-        dbg(f"Loaded model: {model_id}")
+        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
 
-    def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
+    def generate_json(self, system_prompt: str, user_prompt: str,
+                      max_new_tokens: int = 256, temperature: float = 0.7,
+                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
         set_seed(self.seed)
 
-        messages = [
-            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
-        ]
-
-        # Using a simpler user-only template that is robust for Gemma
-        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        if self.is_instruction_tuned:
+            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         input_token_length = inputs.input_ids.shape[1]
 
         with torch.no_grad():
-            terminators = [
-                self.tokenizer.eos_token_id,
-                self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in self.tokenizer.additional_special_tokens else self.tokenizer.eos_token_id
-            ]
-
             out = self.model.generate(
                 **inputs,
-                do_sample=(temperature > 0 and temperature < 1.0),
-                temperature=max(temperature, 0.01),
-                max_new_tokens=200,
-                eos_token_id=terminators,
+                do_sample=(temperature > 0),
+                temperature=temperature,
+                top_p=top_p,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=num_return_sequences,
                 pad_token_id=self.tokenizer.eos_token_id
             )
 
-        completion = self.tokenizer.decode(out[0, input_token_length:], skip_special_tokens=True)
+        new_tokens = out[:, input_token_length:]
+        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 
-        dbg("Cleaned Agent Completion:", completion)
-        return completion
+        dbg("Cleaned model completions:", completions)
+        return completions
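
For reference, a minimal usage sketch of the interface after this commit. The constructor arguments (device, dtype, seed) are assumed from the parameters referenced in the diff; the full __init__ signature is outside the shown hunks, and the model id is only a placeholder:

    # Hypothetical usage of bp_phi.llm_iface.LLM after this commit.
    # device, dtype and seed are assumed constructor parameters; they are
    # referenced above but their signature is not part of this diff.
    from bp_phi.llm_iface import LLM

    llm = LLM(model_id="google/gemma-3-1b-it", device="auto", dtype="bfloat16", seed=42)

    completions = llm.generate_json(
        system_prompt="Answer with a single JSON object.",
        user_prompt='Return {"status": "ok"} and nothing else.',
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
    )
    print(completions[0])  # one decoded string per returned sequence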