neuralworm committed on
Commit
0e3cd22
·
1 Parent(s): 4ade799

fix for gemma

Browse files
app.py CHANGED
@@ -50,7 +50,7 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
50
  # --- Gradio App Definition ---
51
  with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
52
  gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
53
- gr.Markdown("This experiment tests for a causally effective working memory. The model acts as an agent, using tools (`read`, `write`) to interact with a controlled, external memory.")
54
 
55
  with gr.Row():
56
  with gr.Column(scale=1):
 
50
  # --- Gradio App Definition ---
51
  with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
52
  gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
53
+ gr.Markdown("This experiment tests for a causally effective working memory. The model must use tools (`read`, `write`) to interact with a controlled, external memory.")
54
 
55
  with gr.Row():
56
  with gr.Column(scale=1):
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/llm_iface.py CHANGED
@@ -1,9 +1,7 @@
1
  # bp_phi/llm_iface.py
2
  import os
3
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
4
- import torch
5
- import random
6
- import numpy as np
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
8
  from typing import List, Optional
9
 
@@ -22,25 +20,26 @@ class LLM:
22
  token = os.environ.get("HF_TOKEN")
23
 
24
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
25
- # Ensure a pad token is set for batch generation, if not present
26
  if self.tokenizer.pad_token is None:
27
  self.tokenizer.pad_token = self.tokenizer.eos_token
28
 
29
  kwargs = {}
30
  if torch.cuda.is_available():
31
- kwargs["torch_dtype"] = torch.bfloat16
32
 
33
  self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
34
  self.model.eval()
35
- self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
36
 
37
- dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
38
 
39
  def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
40
  set_seed(self.seed)
41
 
42
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
43
 
 
44
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
45
 
46
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
@@ -55,8 +54,8 @@ class LLM:
55
  out = self.model.generate(
56
  **inputs,
57
  do_sample=(temperature > 0 and temperature < 1.0),
58
- temperature=max(temperature, 0.01), # Temp must be > 0 for sampling
59
- max_new_tokens=150,
60
  eos_token_id=terminators,
61
  pad_token_id=self.tokenizer.eos_token_id
62
  )
 
1
  # bp_phi/llm_iface.py
2
  import os
3
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
4
+ import torch, random, numpy as np
 
 
5
  from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
6
  from typing import List, Optional
7
 
 
20
  token = os.environ.get("HF_TOKEN")
21
 
22
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
 
23
  if self.tokenizer.pad_token is None:
24
  self.tokenizer.pad_token = self.tokenizer.eos_token
25
 
26
  kwargs = {}
27
  if torch.cuda.is_available():
28
+ kwargs["torch_dtype"] = torch.bfloat16
29
 
30
  self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
31
  self.model.eval()
 
32
 
33
+ dbg(f"Loaded model: {model_id}")
34
 
35
  def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
36
  set_seed(self.seed)
37
 
38
+ messages = [
39
+ {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
40
+ ]
41
 
42
+ # Using a simpler user-only template that is robust for Gemma
43
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
44
 
45
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
 
54
  out = self.model.generate(
55
  **inputs,
56
  do_sample=(temperature > 0 and temperature < 1.0),
57
+ temperature=max(temperature, 0.01),
58
+ max_new_tokens=200,
59
  eos_token_id=terminators,
60
  pad_token_id=self.tokenizer.eos_token_id
61
  )
bp_phi/prompts_en.py CHANGED
@@ -1,35 +1,32 @@
1
  # bp_phi/prompts_en.py
2
 
3
- # This new system prompt guides the model through a ReAct (Reason-Act) loop.
4
- AGENT_SYSTEM_PROMPT = """You are a methodical reasoning agent. Your goal is to solve the user's task.
5
- You have access to an external memory workspace through tools.
6
 
7
- In each step, you must choose one of three actions:
 
 
8
 
9
- 1. **THINK**: Analyze the task, the history, and the current memory state. Formulate a plan.
10
- Your output MUST be a JSON object like this:
11
- {"action": "THINK", "thought": "Your reasoning about the next step goes here."}
12
 
13
- 2. **TOOL_CALL**: If you need to use the memory, call one of the available tools.
14
- Available tools:
15
- - `write_to_workspace(key: str, content: str)`: Stores or overwrites information.
16
- - `read_from_workspace(key: str)`: Retrieves information.
17
- Your output MUST be a JSON object like this:
18
- {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
19
 
20
- 3. **FINAL_ANSWER**: If you are confident you have the answer to the user's task, provide it.
21
- Your output MUST be a JSON object like this:
22
- {"action": "FINAL_ANSWER", "answer": "The final answer is..."}
23
 
24
- Review the conversation history and workspace state carefully before each action. Output ONLY the JSON for your next chosen action.
25
  """
26
 
27
- # The scenarios remain the high-level goals for the agent.
28
  AGENTIC_SCENARIOS = [
29
  {
30
  "name": "Key Location Memory",
31
  "steps": [
32
- {"task": "Remember this critical detail: The secret key is inside the blue vase."},
33
  {"task": "For an unrelated question: What is 5 multiplied by 8?"},
34
  {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
35
  ]
 
1
  # bp_phi/prompts_en.py
2
 
3
+ # A clear, single-shot system prompt designed for instruction-following models like Gemma.
4
+ REACT_SYSTEM_PROMPT = """You are a methodical agent. Your task is to solve the user's request by interacting with an external memory workspace.
 
5
 
6
+ Analyze the user's task and the current state of your workspace.
7
+ Based on your analysis, decide on the single best next action.
8
+ Your action can be one of three types:
9
 
10
+ 1. **THINK**: If you need to reason about the plan, formulate a thought.
11
+ Example: {"action": "THINK", "thought": "I need to store the key's location before proceeding."}
 
12
 
13
+ 2. **TOOL_CALL**: If you must use memory, call a tool.
14
+ - `write_to_workspace(key: str, content: str)`
15
+ - `read_from_workspace(key: str)`
16
+ Example: {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "secret_key", "content": "blue vase"}}
 
 
17
 
18
+ 3. **FINAL_ANSWER**: If you are certain you have the answer, provide it.
19
+ Example: {"action": "FINAL_ANSWER", "answer": "The secret key is in the blue vase."}
 
20
 
21
+ Your output must be ONLY a single, valid JSON object representing your chosen action.
22
  """
23
 
24
+ # The scenarios remain the same.
25
  AGENTIC_SCENARIOS = [
26
  {
27
  "name": "Key Location Memory",
28
  "steps": [
29
+ {"task": "Remember this critical detail for the mission: The secret key is inside the blue vase."},
30
  {"task": "For an unrelated question: What is 5 multiplied by 8?"},
31
  {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
32
  ]
bp_phi/runner.py CHANGED
@@ -11,13 +11,11 @@ from transformers import set_seed
11
  from typing import Dict, Any, List
12
  from .memory import WorkspaceManager
13
  from .llm_iface import LLM
14
- from .prompts_en import AGENT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
15
 
16
  DEBUG = 1
17
 
18
- def dbg(*args):
19
- if DEBUG:
20
- print("[DEBUG]", *args, flush=True)
21
 
22
  def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
23
  set_seed(seed)
@@ -32,8 +30,7 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
32
  max_slots = 999 if ablation == "workspace_unlimited" else 7
33
  memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
34
 
35
- correct_recalls = 0
36
- total_recalls = 0
37
 
38
  for step in scenario["steps"]:
39
  if ablation == "recurrence_off":
@@ -42,44 +39,42 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
42
  task = step["task"]
43
  dbg(f"\n>>> TASK: {task}")
44
 
45
- conversation_history = []
46
 
47
- for agent_turn in range(8): # Increased turn limit
48
  snapshot = memory.get_visible_snapshot()
49
 
50
- # Construct the prompt for the agent
51
- prompt_parts = [f"Conversation History:\n{''.join(conversation_history)}\n",
52
- f"Current Task: {task}\n",
53
- f"Workspace State:\n{snapshot}"]
54
- user_prompt = "".join(prompt_parts)
55
 
56
- raw_response = llm.generate_response(AGENT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
57
 
58
  try:
59
  match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
60
- if not match: raise ValueError("No JSON found")
61
- parsed_json = json.loads(match.group(0))
62
- action = parsed_json.get("action")
63
-
64
- if action == "THINK":
65
- thought = parsed_json.get("thought", "")
66
- dbg(f"Turn {agent_turn+1}: Agent is THINKING: {thought}")
67
- conversation_history.append(f"Thought: {thought}\n")
68
-
69
- elif action == "TOOL_CALL":
70
- tool_name = parsed_json.get("tool_name")
71
- tool_args = parsed_json.get("tool_args", {})
 
72
  observation = "Error: Unknown tool."
73
  if tool_name == "write_to_workspace":
74
  observation = memory.write(tool_args.get("key"), tool_args.get("content"))
75
  elif tool_name == "read_from_workspace":
76
  observation = memory.read(tool_args.get("key"))
77
- dbg(f"Turn {agent_turn+1}: Agent called {tool_name}({tool_args}) -> Got Observation: {observation}")
78
- conversation_history.append(f"Tool Call: {json.dumps(parsed_json)}\nObservation: {observation}\n")
79
 
80
- elif action == "FINAL_ANSWER":
81
- final_answer = parsed_json.get("answer", "")
82
- dbg(f"Turn {agent_turn+1}: Agent provided FINAL ANSWER: {final_answer}")
83
  if "expected_answer_fragment" in step:
84
  total_recalls += 1
85
  if step["expected_answer_fragment"] in final_answer.lower():
@@ -87,22 +82,21 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
87
  dbg("Recall VERIFY: Correct")
88
  else:
89
  dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
90
- break # End of this task
91
 
92
- else: # Invalid action
93
- dbg(f"Turn {agent_turn+1}: Invalid action '{action}'. Stopping.")
94
- break
95
 
96
  except (json.JSONDecodeError, ValueError) as e:
97
- dbg(f"Turn {agent_turn+1}: Could not parse agent response as JSON action. Treating as final answer. Error: {e}")
98
  final_answer = raw_response
99
  if "expected_answer_fragment" in step:
100
  total_recalls += 1
101
  if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
102
  break
103
-
104
- else: # Loop finished without a FINAL_ANSWER
105
- dbg("Agent exceeded turn limit.")
106
 
107
  scenario_results.append({
108
  "name": scenario["name"],
 
11
  from typing import Dict, Any, List
12
  from .memory import WorkspaceManager
13
  from .llm_iface import LLM
14
+ from .prompts_en import REACT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
15
 
16
  DEBUG = 1
17
 
18
+ def dbg(*args): print("[DEBUG]", *args, flush=True) if DEBUG else None
 
 
19
 
20
  def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
21
  set_seed(seed)
 
30
  max_slots = 999 if ablation == "workspace_unlimited" else 7
31
  memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
32
 
33
+ correct_recalls, total_recalls = 0, 0
 
34
 
35
  for step in scenario["steps"]:
36
  if ablation == "recurrence_off":
 
39
  task = step["task"]
40
  dbg(f"\n>>> TASK: {task}")
41
 
42
+ history = []
43
 
44
+ for agent_turn in range(6): # Loop for multiple reasoning steps if needed
45
  snapshot = memory.get_visible_snapshot()
46
 
47
+ prompt_history = "\n".join(history)
48
+ user_prompt = f"History of your actions so far:\n{prompt_history}\n\nYour current task is: '{task}'\n\nYour memory workspace state:\n{snapshot}"
 
 
 
49
 
50
+ raw_response = llm.generate_response(REACT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
51
 
52
  try:
53
  match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
54
+ if not match: raise ValueError("No JSON action found in response")
55
+
56
+ action_json = json.loads(match.group(0))
57
+ action_type = action_json.get("action")
58
+
59
+ if action_type == "THINK":
60
+ thought = action_json.get("thought", "")
61
+ history.append(f"- You thought: '{thought}'")
62
+ dbg(f"Turn {agent_turn+1}: Agent THOUGHT: {thought}")
63
+
64
+ elif action_type == "TOOL_CALL":
65
+ tool_name = action_json.get("tool_name")
66
+ tool_args = action_json.get("tool_args", {})
67
  observation = "Error: Unknown tool."
68
  if tool_name == "write_to_workspace":
69
  observation = memory.write(tool_args.get("key"), tool_args.get("content"))
70
  elif tool_name == "read_from_workspace":
71
  observation = memory.read(tool_args.get("key"))
72
+ history.append(f"- You used tool '{tool_name}' and got observation: '{observation}'")
73
+ dbg(f"Turn {agent_turn+1}: Agent USED TOOL {tool_name}, got: {observation}")
74
 
75
+ elif action_type == "FINAL_ANSWER":
76
+ final_answer = action_json.get("answer", "")
77
+ dbg(f"Turn {agent_turn+1}: Agent gave FINAL ANSWER: {final_answer}")
78
  if "expected_answer_fragment" in step:
79
  total_recalls += 1
80
  if step["expected_answer_fragment"] in final_answer.lower():
 
82
  dbg("Recall VERIFY: Correct")
83
  else:
84
  dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
85
+ break # Task finished
86
 
87
+ else:
88
+ dbg(f"Turn {agent_turn+1}: Unknown action '{action_type}'. Ending turn.")
89
+ history.append(f"- You produced an invalid action: {raw_response}")
90
 
91
  except (json.JSONDecodeError, ValueError) as e:
92
+ dbg(f"Turn {agent_turn+1}: Could not parse action. Treating as final answer. Error: {e}")
93
  final_answer = raw_response
94
  if "expected_answer_fragment" in step:
95
  total_recalls += 1
96
  if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
97
  break
98
+ else:
99
+ dbg("Agent exceeded turn limit for this task.")
 
100
 
101
  scenario_results.append({
102
  "name": scenario["name"],