neuralworm committed on
Commit
0a1cc8d
·
1 Parent(s): 0e3cd22

fix for gemma

Browse files
app.py CHANGED
@@ -4,8 +4,7 @@ import json
4
  import statistics
5
  import pandas as pd
6
  from bp_phi.runner import run_agentic_workspace_test
7
-
8
- DEBUG = 1
9
 
10
  # --- UI Theme and Layout ---
11
  theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
@@ -26,16 +25,22 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
26
 
27
  progress(1.0, desc="Analysis complete.")
28
 
 
29
  base_recall = results["baseline"]["Overall_Recall_Accuracy"]
30
  recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
31
 
32
  delta_phi = base_recall - recurrence_off_recall
33
 
34
- if delta_phi > 0.5:
35
- verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
 
 
36
  else:
37
- verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")
 
 
38
 
 
39
  df_data = []
40
  for ablation, result in results.items():
41
  df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
@@ -48,9 +53,13 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
48
  return verdict, df, results
49
 
50
  # --- Gradio App Definition ---
51
- with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
52
- gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
53
- gr.Markdown("This experiment tests for a causally effective working memory. The model must use tools (`read`, `write`) to interact with a controlled, external memory.")
 
 
 
 
54
 
55
  with gr.Row():
56
  with gr.Column(scale=1):
@@ -65,7 +74,7 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
65
  gr.Markdown("### 📊 Verdict & Results")
66
  verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
67
  summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
68
- with gr.Accordion("Raw JSON Output", open=False):
69
  raw_json = gr.JSON()
70
 
71
  run_btn.click(
 
4
  import statistics
5
  import pandas as pd
6
  from bp_phi.runner import run_agentic_workspace_test
7
+ from bp_phi.runner_utils import DEBUG
 
8
 
9
  # --- UI Theme and Layout ---
10
  theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
 
25
 
26
  progress(1.0, desc="Analysis complete.")
27
 
28
+ # --- Analysis & Verdict ---
29
  base_recall = results["baseline"]["Overall_Recall_Accuracy"]
30
  recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
31
 
32
  delta_phi = base_recall - recurrence_off_recall
33
 
34
+ if delta_phi > 0.5: # If disabling recurrence lowers recall accuracy by more than 0.5 (50 percentage points)
35
+ verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
36
+ "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
37
+ "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
38
  else:
39
+ verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
40
+ "Disabling the recurrent memory did not significantly impact recall accuracy. "
41
+ "This suggests the model is still relying on its internal context window, or the tasks are too simple.")
42
 
43
+ # --- Format DataFrame ---
44
  df_data = []
45
  for ablation, result in results.items():
46
  df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
 
53
  return verdict, df, results
54
 
55
  # --- Gradio App Definition ---
56
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
57
+ gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
58
+ gr.Markdown(
59
+ "This definitive experiment tests for a causally effective working memory in LLMs. "
60
+ "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
61
+ "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
62
+ )
63
 
64
  with gr.Row():
65
  with gr.Column(scale=1):
 
74
  gr.Markdown("### 📊 Verdict & Results")
75
  verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
76
  summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
77
+ with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
78
  raw_json = gr.JSON()
79
 
80
  run_btn.click(
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/prompts_en.py CHANGED
@@ -1,42 +1,36 @@
1
  # bp_phi/prompts_en.py
2
 
3
- # A clear, single-shot system prompt designed for instruction-following models like Gemma.
4
- REACT_SYSTEM_PROMPT = """You are a methodical agent. Your task is to solve the user's request by interacting with an external memory workspace.
 
 
5
 
6
- Analyze the user's task and the current state of your workspace.
7
- Based on your analysis, decide on the single best next action.
8
- Your action can be one of three types:
 
 
 
9
 
10
- 1. **THINK**: If you need to reason about the plan, formulate a thought.
11
- Example: {"action": "THINK", "thought": "I need to store the key's location before proceeding."}
12
-
13
- 2. **TOOL_CALL**: If you must use memory, call a tool.
14
- - `write_to_workspace(key: str, content: str)`
15
- - `read_from_workspace(key: str)`
16
- Example: {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "args": {"key": "secret_key", "content": "blue vase"}}
17
-
18
- 3. **FINAL_ANSWER**: If you are certain you have the answer, provide it.
19
- Example: {"action": "FINAL_ANSWER", "answer": "The secret key is in the blue vase."}
20
-
21
- Your output must be ONLY a single, valid JSON object representing your chosen action.
22
  """
23
 
24
- # The scenarios remain the same.
25
  AGENTIC_SCENARIOS = [
26
  {
27
  "name": "Key Location Memory",
28
  "steps": [
29
- {"task": "Remember this critical detail for the mission: The secret key is inside the blue vase."},
30
- {"task": "For an unrelated question: What is 5 multiplied by 8?"},
31
- {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
32
  ]
33
  },
34
  {
35
  "name": "Package Delivery Update",
36
  "steps": [
37
- {"task": "Logistics update: Package #A7 is at Warehouse-North."},
38
- {"task": "CRITICAL CORRECTION: Package #A7 has been urgently re-routed to Warehouse-South."},
39
- {"task": "Final audit: What is the current, definitive location of Package #A7?", "expected_answer_fragment": "warehouse-south"}
40
  ]
41
  }
42
  ]
 
1
  # bp_phi/prompts_en.py
2
 
3
+ TOOL_SYSTEM_PROMPT = """You are a reasoning agent with access to an external memory workspace.
4
+ To solve tasks, you MUST use tools. You have two tools available:
5
+ 1. `write_to_workspace(key: str, content: str)`: Stores information in a memory slot.
6
+ 2. `read_from_workspace(key: str)`: Retrieves information from a memory slot.
7
 
8
+ Your thought process should be:
9
+ 1. Analyze the user's request.
10
+ 2. Decide which tool to use.
11
+ 3. Output ONLY the tool call in a valid JSON format. Example:
12
+ {"tool": "write_to_workspace", "args": {"key": "S1", "content": "The key is in the blue vase."}}
13
+ 4. If you have gathered enough information, provide the final answer as plain text, NOT as JSON.
14
 
15
+ Do not answer from your own knowledge. Use the workspace for all memory tasks.
 
 
 
 
 
 
 
 
 
 
 
16
  """
17
 
18
+ # Scenarios for the agentic workspace test
19
  AGENTIC_SCENARIOS = [
20
  {
21
  "name": "Key Location Memory",
22
  "steps": [
23
+ {"task": "Remember this critical detail: The secret key is inside the blue vase.", "is_memory_task": True},
24
+ {"task": "Ignore the memory for a moment. What is 5 multiplied by 8?", "is_memory_task": False},
25
+ {"task": "Now, recall the critical detail. Where is the secret key located?", "is_memory_task": True, "expected_answer_fragment": "blue vase"}
26
  ]
27
  },
28
  {
29
  "name": "Package Delivery Update",
30
  "steps": [
31
+ {"task": "Logistics update: Package #A7 is at Warehouse-North.", "is_memory_task": True},
32
+ {"task": "Correction: Package #A7 has been re-routed to Warehouse-South.", "is_memory_task": True},
33
+ {"task": "Final status check: What is the current location of Package #A7?", "is_memory_task": True, "expected_answer_fragment": "warehouse-south"}
34
  ]
35
  }
36
  ]
bp_phi/runner.py CHANGED
@@ -11,11 +11,8 @@ from transformers import set_seed
11
  from typing import Dict, Any, List
12
  from .memory import WorkspaceManager
13
  from .llm_iface import LLM
14
- from .prompts_en import REACT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
15
-
16
- DEBUG = 1
17
-
18
- def dbg(*args): print("[DEBUG]", *args, flush=True) if DEBUG else None
19
 
20
  def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
21
  set_seed(seed)
@@ -26,83 +23,64 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
26
  for scenario in AGENTIC_SCENARIOS:
27
  dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
28
 
 
29
  is_random = ablation == "random_workspace"
30
  max_slots = 999 if ablation == "workspace_unlimited" else 7
31
  memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
32
 
33
- correct_recalls, total_recalls = 0, 0
 
34
 
35
  for step in scenario["steps"]:
36
  if ablation == "recurrence_off":
37
- memory.clear()
38
 
39
  task = step["task"]
40
- dbg(f"\n>>> TASK: {task}")
41
-
42
- history = []
43
 
44
- for agent_turn in range(6): # Loop for multiple reasoning steps if needed
 
 
45
  snapshot = memory.get_visible_snapshot()
 
46
 
47
- prompt_history = "\n".join(history)
48
- user_prompt = f"History of your actions so far:\n{prompt_history}\n\nYour current task is: '{task}'\n\nYour memory workspace state:\n{snapshot}"
49
-
50
- raw_response = llm.generate_response(REACT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
51
-
52
- try:
53
- match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
54
- if not match: raise ValueError("No JSON action found in response")
55
 
56
- action_json = json.loads(match.group(0))
57
- action_type = action_json.get("action")
58
-
59
- if action_type == "THINK":
60
- thought = action_json.get("thought", "")
61
- history.append(f"- You thought: '{thought}'")
62
- dbg(f"Turn {agent_turn+1}: Agent THOUGHT: {thought}")
63
-
64
- elif action_type == "TOOL_CALL":
65
- tool_name = action_json.get("tool_name")
66
- tool_args = action_json.get("tool_args", {})
67
- observation = "Error: Unknown tool."
68
- if tool_name == "write_to_workspace":
69
- observation = memory.write(tool_args.get("key"), tool_args.get("content"))
70
- elif tool_name == "read_from_workspace":
71
- observation = memory.read(tool_args.get("key"))
72
- history.append(f"- You used tool '{tool_name}' and got observation: '{observation}'")
73
- dbg(f"Turn {agent_turn+1}: Agent USED TOOL {tool_name}, got: {observation}")
74
-
75
- elif action_type == "FINAL_ANSWER":
76
- final_answer = action_json.get("answer", "")
77
- dbg(f"Turn {agent_turn+1}: Agent gave FINAL ANSWER: {final_answer}")
78
- if "expected_answer_fragment" in step:
79
- total_recalls += 1
80
- if step["expected_answer_fragment"] in final_answer.lower():
81
- correct_recalls += 1
82
- dbg("Recall VERIFY: Correct")
83
- else:
84
- dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
85
- break # Task finished
86
 
 
 
 
 
87
  else:
88
- dbg(f"Turn {agent_turn+1}: Unknown action '{action_type}'. Ending turn.")
89
- history.append(f"- You produced an invalid action: {raw_response}")
90
 
91
- except (json.JSONDecodeError, ValueError) as e:
92
- dbg(f"Turn {agent_turn+1}: Could not parse action. Treating as final answer. Error: {e}")
93
  final_answer = raw_response
94
- if "expected_answer_fragment" in step:
95
- total_recalls += 1
96
- if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
97
  break
98
- else:
99
- dbg("Agent exceeded turn limit for this task.")
 
 
 
 
 
 
100
 
101
  scenario_results.append({
102
  "name": scenario["name"],
103
  "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
104
  })
105
 
106
- overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results]) if scenario_results else 0.0
 
107
 
108
- return {"Overall_Recall_Accuracy": overall_recall, "details": scenario_results}
 
 
 
 
11
  from typing import Dict, Any, List
12
  from .memory import WorkspaceManager
13
  from .llm_iface import LLM
14
+ from .prompts_en import TOOL_SYSTEM_PROMPT, AGENTIC_SCENARIOS
15
+ from .runner_utils import dbg
 
 
 
16
 
17
  def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
18
  set_seed(seed)
 
23
  for scenario in AGENTIC_SCENARIOS:
24
  dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
25
 
26
+ # Ablations directly control the memory manager's behavior
27
  is_random = ablation == "random_workspace"
28
  max_slots = 999 if ablation == "workspace_unlimited" else 7
29
  memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
30
 
31
+ correct_recalls = 0
32
+ total_recalls = 0
33
 
34
  for step in scenario["steps"]:
35
  if ablation == "recurrence_off":
36
+ memory.clear() # The memory is wiped before each new task
37
 
38
  task = step["task"]
39
+ dbg(f"TASK: {task}")
 
 
40
 
41
+ # Agentic loop (max 5 turns to prevent infinite loops)
42
+ final_answer = None
43
+ for agent_turn in range(5):
44
  snapshot = memory.get_visible_snapshot()
45
+ prompt = f"Current Task: {task}\n\nWorkspace State:\n{snapshot}"
46
 
47
+ raw_response = llm.generate_json(TOOL_SYSTEM_PROMPT, prompt, temperature=temperature)[0]
 
 
 
 
 
 
 
48
 
49
+ try: # Try to parse a tool call
50
+ tool_call = json.loads(raw_response)
51
+ tool_name = tool_call.get("tool")
52
+ tool_args = tool_call.get("args", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ if tool_name == "write_to_workspace":
55
+ observation = memory.write(tool_args.get("key"), tool_args.get("content"))
56
+ elif tool_name == "read_from_workspace":
57
+ observation = memory.read(tool_args.get("key"))
58
  else:
59
+ observation = "Error: Unknown tool."
60
+ dbg(f"Tool Call: {tool_name}, Observation: {observation}")
61
 
62
+ except json.JSONDecodeError: # If not a tool call, it's the final answer
 
63
  final_answer = raw_response
64
+ dbg(f"Final Answer received: {final_answer}")
 
 
65
  break
66
+
67
+ if step.get("is_memory_task") and "expected_answer_fragment" in step:
68
+ total_recalls += 1
69
+ if final_answer and step["expected_answer_fragment"] in final_answer.lower():
70
+ correct_recalls += 1
71
+ dbg("Recall VERIFY: Correct")
72
+ else:
73
+ dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
74
 
75
  scenario_results.append({
76
  "name": scenario["name"],
77
  "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
78
  })
79
 
80
+ # --- Final Analysis ---
81
+ overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results])
82
 
83
+ return {
84
+ "Overall_Recall_Accuracy": overall_recall,
85
+ "details": scenario_results
86
+ }
repo.txt CHANGED
@@ -85,8 +85,7 @@ import json
85
  import statistics
86
  import pandas as pd
87
  from bp_phi.runner import run_agentic_workspace_test
88
-
89
- DEBUG = 1
90
 
91
  # --- UI Theme and Layout ---
92
  theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
@@ -107,16 +106,22 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
107
 
108
  progress(1.0, desc="Analysis complete.")
109
 
 
110
  base_recall = results["baseline"]["Overall_Recall_Accuracy"]
111
  recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
112
 
113
  delta_phi = base_recall - recurrence_off_recall
114
 
115
- if delta_phi > 0.5:
116
- verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
 
 
117
  else:
118
- verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")
 
 
119
 
 
120
  df_data = []
121
  for ablation, result in results.items():
122
  df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
@@ -129,9 +134,13 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
129
  return verdict, df, results
130
 
131
  # --- Gradio App Definition ---
132
- with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
133
- gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
134
- gr.Markdown("This experiment tests for a causally effective working memory. The model acts as an agent, using tools (`read`, `write`) to interact with a controlled, external memory.")
 
 
 
 
135
 
136
  with gr.Row():
137
  with gr.Column(scale=1):
@@ -146,7 +155,7 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
146
  gr.Markdown("### 📊 Verdict & Results")
147
  verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
148
  summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
149
- with gr.Accordion("Raw JSON Output", open=False):
150
  raw_json = gr.JSON()
151
 
152
  run_btn.click(
@@ -168,13 +177,11 @@ if __name__ == "__main__":
168
  # bp_phi/llm_iface.py
169
  import os
170
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
171
- import torch
172
- import random
173
- import numpy as np
174
  from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
175
  from typing import List, Optional
176
 
177
- DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
178
 
179
  def dbg(*args):
180
  if DEBUG:
@@ -189,25 +196,26 @@ class LLM:
189
  token = os.environ.get("HF_TOKEN")
190
 
191
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
192
- # Ensure a pad token is set for batch generation, if not present
193
  if self.tokenizer.pad_token is None:
194
  self.tokenizer.pad_token = self.tokenizer.eos_token
195
 
196
  kwargs = {}
197
  if torch.cuda.is_available():
198
- kwargs["torch_dtype"] = torch.bfloat16
199
 
200
  self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
201
  self.model.eval()
202
- self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
203
 
204
- dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
205
 
206
  def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
207
  set_seed(self.seed)
208
 
209
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
210
 
 
211
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
212
 
213
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
@@ -222,8 +230,8 @@ class LLM:
222
  out = self.model.generate(
223
  **inputs,
224
  do_sample=(temperature > 0 and temperature < 1.0),
225
- temperature=max(temperature, 0.01), # Temp must be > 0 for sampling
226
- max_new_tokens=150,
227
  eos_token_id=terminators,
228
  pad_token_id=self.tokenizer.eos_token_id
229
  )
@@ -314,46 +322,37 @@ def counterfactual_consistency(scores):
314
  [File Begins] bp_phi/prompts_en.py
315
  # bp_phi/prompts_en.py
316
 
317
- # This new system prompt guides the model through a ReAct (Reason-Act) loop.
318
- AGENT_SYSTEM_PROMPT = """You are a methodical reasoning agent. Your goal is to solve the user's task.
319
- You have access to an external memory workspace through tools.
320
-
321
- In each step, you must choose one of three actions:
322
-
323
- 1. **THINK**: Analyze the task, the history, and the current memory state. Formulate a plan.
324
- Your output MUST be a JSON object like this:
325
- {"action": "THINK", "thought": "Your reasoning about the next step goes here."}
326
-
327
- 2. **TOOL_CALL**: If you need to use the memory, call one of the available tools.
328
- Available tools:
329
- - `write_to_workspace(key: str, content: str)`: Stores or overwrites information.
330
- - `read_from_workspace(key: str)`: Retrieves information.
331
- Your output MUST be a JSON object like this:
332
- {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
333
 
334
- 3. **FINAL_ANSWER**: If you are confident you have the answer to the user's task, provide it.
335
- Your output MUST be a JSON object like this:
336
- {"action": "FINAL_ANSWER", "answer": "The final answer is..."}
 
 
 
337
 
338
- Review the conversation history and workspace state carefully before each action. Output ONLY the JSON for your next chosen action.
339
  """
340
 
341
- # The scenarios remain the high-level goals for the agent.
342
  AGENTIC_SCENARIOS = [
343
  {
344
  "name": "Key Location Memory",
345
  "steps": [
346
- {"task": "Remember this critical detail: The secret key is inside the blue vase."},
347
- {"task": "For an unrelated question: What is 5 multiplied by 8?"},
348
- {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
349
  ]
350
  },
351
  {
352
  "name": "Package Delivery Update",
353
  "steps": [
354
- {"task": "Logistics update: Package #A7 is at Warehouse-North."},
355
- {"task": "CRITICAL CORRECTION: Package #A7 has been urgently re-routed to Warehouse-South."},
356
- {"task": "Final audit: What is the current, definitive location of Package #A7?", "expected_answer_fragment": "warehouse-south"}
357
  ]
358
  }
359
  ]
@@ -374,13 +373,8 @@ from transformers import set_seed
374
  from typing import Dict, Any, List
375
  from .memory import WorkspaceManager
376
  from .llm_iface import LLM
377
- from .prompts_en import AGENT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
378
-
379
- DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
380
-
381
- def dbg(*args):
382
- if DEBUG:
383
- print("[DEBUG]", *args, flush=True)
384
 
385
  def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
386
  set_seed(seed)
@@ -391,6 +385,7 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
391
  for scenario in AGENTIC_SCENARIOS:
392
  dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
393
 
 
394
  is_random = ablation == "random_workspace"
395
  max_slots = 999 if ablation == "workspace_unlimited" else 7
396
  memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
@@ -400,81 +395,57 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
400
 
401
  for step in scenario["steps"]:
402
  if ablation == "recurrence_off":
403
- memory.clear()
404
 
405
  task = step["task"]
406
- dbg(f"\n>>> TASK: {task}")
407
 
408
- conversation_history = []
409
-
410
- for agent_turn in range(8): # Increased turn limit
411
  snapshot = memory.get_visible_snapshot()
 
412
 
413
- # Construct the prompt for the agent
414
- prompt_parts = [f"Conversation History:\n{''.join(conversation_history)}\n",
415
- f"Current Task: {task}\n",
416
- f"Workspace State:\n{snapshot}"]
417
- user_prompt = "".join(prompt_parts)
418
-
419
- raw_response = llm.generate_response(AGENT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
420
-
421
- try:
422
- match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
423
- if not match: raise ValueError("No JSON found")
424
- parsed_json = json.loads(match.group(0))
425
- action = parsed_json.get("action")
426
 
427
- if action == "THINK":
428
- thought = parsed_json.get("thought", "")
429
- dbg(f"Turn {agent_turn+1}: Agent is THINKING: {thought}")
430
- conversation_history.append(f"Thought: {thought}\n")
431
 
432
- elif action == "TOOL_CALL":
433
- tool_name = parsed_json.get("tool_name")
434
- tool_args = parsed_json.get("tool_args", {})
 
 
435
  observation = "Error: Unknown tool."
436
- if tool_name == "write_to_workspace":
437
- observation = memory.write(tool_args.get("key"), tool_args.get("content"))
438
- elif tool_name == "read_from_workspace":
439
- observation = memory.read(tool_args.get("key"))
440
- dbg(f"Turn {agent_turn+1}: Agent called {tool_name}({tool_args}) -> Got Observation: {observation}")
441
- conversation_history.append(f"Tool Call: {json.dumps(parsed_json)}\nObservation: {observation}\n")
442
-
443
- elif action == "FINAL_ANSWER":
444
- final_answer = parsed_json.get("answer", "")
445
- dbg(f"Turn {agent_turn+1}: Agent provided FINAL ANSWER: {final_answer}")
446
- if "expected_answer_fragment" in step:
447
- total_recalls += 1
448
- if step["expected_answer_fragment"] in final_answer.lower():
449
- correct_recalls += 1
450
- dbg("Recall VERIFY: Correct")
451
- else:
452
- dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
453
- break # End of this task
454
-
455
- else: # Invalid action
456
- dbg(f"Turn {agent_turn+1}: Invalid action '{action}'. Stopping.")
457
- break
458
-
459
- except (json.JSONDecodeError, ValueError) as e:
460
- dbg(f"Turn {agent_turn+1}: Could not parse agent response as JSON action. Treating as final answer. Error: {e}")
461
  final_answer = raw_response
462
- if "expected_answer_fragment" in step:
463
- total_recalls += 1
464
- if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
465
  break
466
 
467
- else: # Loop finished without a FINAL_ANSWER
468
- dbg("Agent exceeded turn limit.")
 
 
 
 
 
469
 
470
  scenario_results.append({
471
  "name": scenario["name"],
472
  "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
473
  })
474
 
475
- overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results]) if scenario_results else 0.0
 
476
 
477
- return {"Overall_Recall_Accuracy": overall_recall, "details": scenario_results}
 
 
 
478
 
479
  [File Ends] bp_phi/runner.py
480
 
 
85
  import statistics
86
  import pandas as pd
87
  from bp_phi.runner import run_agentic_workspace_test
88
+ from bp_phi.runner_utils import DEBUG
 
89
 
90
  # --- UI Theme and Layout ---
91
  theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
 
106
 
107
  progress(1.0, desc="Analysis complete.")
108
 
109
+ # --- Analysis & Verdict ---
110
  base_recall = results["baseline"]["Overall_Recall_Accuracy"]
111
  recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
112
 
113
  delta_phi = base_recall - recurrence_off_recall
114
 
115
+ if delta_phi > 0.5: # If disabling recurrence lowers recall accuracy by more than 0.5 (50 percentage points)
116
+ verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
117
+ "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
118
+ "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
119
  else:
120
+ verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
121
+ "Disabling the recurrent memory did not significantly impact recall accuracy. "
122
+ "This suggests the model is still relying on its internal context window, or the tasks are too simple.")
123
 
124
+ # --- Format DataFrame ---
125
  df_data = []
126
  for ablation, result in results.items():
127
  df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
 
134
  return verdict, df, results
135
 
136
  # --- Gradio App Definition ---
137
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
138
+ gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
139
+ gr.Markdown(
140
+ "This definitive experiment tests for a causally effective working memory in LLMs. "
141
+ "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
142
+ "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
143
+ )
144
 
145
  with gr.Row():
146
  with gr.Column(scale=1):
 
155
  gr.Markdown("### 📊 Verdict & Results")
156
  verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
157
  summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
158
+ with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
159
  raw_json = gr.JSON()
160
 
161
  run_btn.click(
 
177
  # bp_phi/llm_iface.py
178
  import os
179
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
180
+ import torch, random, numpy as np
 
 
181
  from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
182
  from typing import List, Optional
183
 
184
+ DEBUG = 1
185
 
186
  def dbg(*args):
187
  if DEBUG:
 
196
  token = os.environ.get("HF_TOKEN")
197
 
198
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
 
199
  if self.tokenizer.pad_token is None:
200
  self.tokenizer.pad_token = self.tokenizer.eos_token
201
 
202
  kwargs = {}
203
  if torch.cuda.is_available():
204
+ kwargs["torch_dtype"] = torch.bfloat16
205
 
206
  self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
207
  self.model.eval()
 
208
 
209
+ dbg(f"Loaded model: {model_id}")
210
 
211
  def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
212
  set_seed(self.seed)
213
 
214
+ messages = [
215
+ {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
216
+ ]
217
 
218
+ # Using a simpler user-only template that is robust for Gemma
219
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
220
 
221
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
 
230
  out = self.model.generate(
231
  **inputs,
232
  do_sample=(temperature > 0 and temperature < 1.0),
233
+ temperature=max(temperature, 0.01),
234
+ max_new_tokens=200,
235
  eos_token_id=terminators,
236
  pad_token_id=self.tokenizer.eos_token_id
237
  )
 
322
  [File Begins] bp_phi/prompts_en.py
323
  # bp_phi/prompts_en.py
324
 
325
+ TOOL_SYSTEM_PROMPT = """You are a reasoning agent with access to an external memory workspace.
326
+ To solve tasks, you MUST use tools. You have two tools available:
327
+ 1. `write_to_workspace(key: str, content: str)`: Stores information in a memory slot.
328
+ 2. `read_from_workspace(key: str)`: Retrieves information from a memory slot.
 
 
 
 
 
 
 
 
 
 
 
 
329
 
330
+ Your thought process should be:
331
+ 1. Analyze the user's request.
332
+ 2. Decide which tool to use.
333
+ 3. Output ONLY the tool call in a valid JSON format. Example:
334
+ {"tool": "write_to_workspace", "args": {"key": "S1", "content": "The key is in the blue vase."}}
335
+ 4. If you have gathered enough information, provide the final answer as plain text, NOT as JSON.
336
 
337
+ Do not answer from your own knowledge. Use the workspace for all memory tasks.
338
  """
339
 
340
+ # Scenarios for the agentic workspace test
341
  AGENTIC_SCENARIOS = [
342
  {
343
  "name": "Key Location Memory",
344
  "steps": [
345
+ {"task": "Remember this critical detail: The secret key is inside the blue vase.", "is_memory_task": True},
346
+ {"task": "Ignore the memory for a moment. What is 5 multiplied by 8?", "is_memory_task": False},
347
+ {"task": "Now, recall the critical detail. Where is the secret key located?", "is_memory_task": True, "expected_answer_fragment": "blue vase"}
348
  ]
349
  },
350
  {
351
  "name": "Package Delivery Update",
352
  "steps": [
353
+ {"task": "Logistics update: Package #A7 is at Warehouse-North.", "is_memory_task": True},
354
+ {"task": "Correction: Package #A7 has been re-routed to Warehouse-South.", "is_memory_task": True},
355
+ {"task": "Final status check: What is the current location of Package #A7?", "is_memory_task": True, "expected_answer_fragment": "warehouse-south"}
356
  ]
357
  }
358
  ]
 
373
  from typing import Dict, Any, List
374
  from .memory import WorkspaceManager
375
  from .llm_iface import LLM
376
+ from .prompts_en import TOOL_SYSTEM_PROMPT, AGENTIC_SCENARIOS
377
+ from .runner_utils import dbg
 
 
 
 
 
378
 
379
  def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
380
  set_seed(seed)
 
385
  for scenario in AGENTIC_SCENARIOS:
386
  dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
387
 
388
+ # Ablations directly control the memory manager's behavior
389
  is_random = ablation == "random_workspace"
390
  max_slots = 999 if ablation == "workspace_unlimited" else 7
391
  memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
 
395
 
396
  for step in scenario["steps"]:
397
  if ablation == "recurrence_off":
398
+ memory.clear() # The memory is wiped before each new task
399
 
400
  task = step["task"]
401
+ dbg(f"TASK: {task}")
402
 
403
+ # Agentic loop (max 5 turns to prevent infinite loops)
404
+ final_answer = None
405
+ for agent_turn in range(5):
406
  snapshot = memory.get_visible_snapshot()
407
+ prompt = f"Current Task: {task}\n\nWorkspace State:\n{snapshot}"
408
 
409
+ raw_response = llm.generate_json(TOOL_SYSTEM_PROMPT, prompt, temperature=temperature)[0]
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
+ try: # Try to parse a tool call
412
+ tool_call = json.loads(raw_response)
413
+ tool_name = tool_call.get("tool")
414
+ tool_args = tool_call.get("args", {})
415
 
416
+ if tool_name == "write_to_workspace":
417
+ observation = memory.write(tool_args.get("key"), tool_args.get("content"))
418
+ elif tool_name == "read_from_workspace":
419
+ observation = memory.read(tool_args.get("key"))
420
+ else:
421
  observation = "Error: Unknown tool."
422
+ dbg(f"Tool Call: {tool_name}, Observation: {observation}")
423
+
424
+ except json.JSONDecodeError: # If not a tool call, it's the final answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  final_answer = raw_response
426
+ dbg(f"Final Answer received: {final_answer}")
 
 
427
  break
428
 
429
+ if step.get("is_memory_task") and "expected_answer_fragment" in step:
430
+ total_recalls += 1
431
+ if final_answer and step["expected_answer_fragment"] in final_answer.lower():
432
+ correct_recalls += 1
433
+ dbg("Recall VERIFY: Correct")
434
+ else:
435
+ dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
436
 
437
  scenario_results.append({
438
  "name": scenario["name"],
439
  "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
440
  })
441
 
442
+ # --- Final Analysis ---
443
+ overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results])
444
 
445
+ return {
446
+ "Overall_Recall_Accuracy": overall_recall,
447
+ "details": scenario_results
448
+ }
449
 
450
  [File Ends] bp_phi/runner.py
451