neuralworm committed
Commit afe4fe4 · Parent: 4d89931

revert to 5.0
app.py CHANGED
@@ -4,8 +4,7 @@ import json
 import statistics
 import pandas as pd
 from bp_phi.runner import run_agentic_workspace_test
-
-DEBUG = 1
+from bp_phi.runner_utils import DEBUG
 
 # --- UI Theme and Layout ---
 theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
@@ -26,16 +25,22 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
 
     progress(1.0, desc="Analysis complete.")
 
+    # --- Analysis & Verdict ---
     base_recall = results["baseline"]["Overall_Recall_Accuracy"]
     recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
 
     delta_phi = base_recall - recurrence_off_recall
 
-    if delta_phi > 0.5:
-        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
+    if delta_phi > 0.5:  # If dropping recurrence cuts accuracy by more than 50%
+        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
+                   "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
+                   "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
     else:
-        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")
+        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
+                   "Disabling the recurrent memory did not significantly impact recall accuracy. "
+                   "This suggests the model is still relying on its internal context window, or the tasks are too simple.")
 
+    # --- Format DataFrame ---
     df_data = []
     for ablation, result in results.items():
         df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
@@ -48,9 +53,13 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
     return verdict, df, results
 
 # --- Gradio App Definition ---
-with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
-    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
-    gr.Markdown("This experiment tests for a causally effective working memory. The model must follow a reason-act loop to interact with a controlled, external memory.")
+with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
+    gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
+    gr.Markdown(
+        "This definitive experiment tests for a causally effective working memory in LLMs. "
+        "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
+        "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
+    )
 
     with gr.Row():
         with gr.Column(scale=1):
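
For orientation, run_full_evaluation consumes a results dict keyed by ablation name ("baseline", "recurrence_off", etc.). A minimal sketch of how such a dict could be assembled with the runner from this commit; the driver function and the exact ablation list are assumptions, not shown in this diff:

from bp_phi.runner import run_agentic_workspace_test

# Hypothetical driver: "baseline" maps to ablation=None; the other names
# appear in bp_phi/runner.py below.
ABLATIONS = ["baseline", "recurrence_off", "random_workspace", "workspace_unlimited"]

def collect_results(model_id: str, seed: int, temperature: float) -> dict:
    results = {}
    for name in ABLATIONS:
        ablation = None if name == "baseline" else name
        results[name] = run_agentic_workspace_test(model_id, seed, temperature, ablation)
    return results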
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/llm_iface.py CHANGED
@@ -16,51 +16,60 @@ class LLM:
         self.model_id = model_id
         self.seed = seed
 
+        # Set all seeds for reproducibility
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        try:
+            torch.use_deterministic_algorithms(True, warn_only=True)
+        except Exception as e:
+            dbg(f"Could not set deterministic algorithms: {e}")
         set_seed(seed)
+
         token = os.environ.get("HF_TOKEN")
+        if not token and ("gemma-3" in model_id or "llama" in model_id):
+            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
         kwargs = {}
-        if torch.cuda.is_available():
-            kwargs["torch_dtype"] = torch.bfloat16
+        if dtype == "float16": kwargs["torch_dtype"] = torch.float16
+        elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
 
         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()
+        self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
 
-        dbg(f"Loaded model: {model_id}")
+        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
 
-    def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
+    def generate_json(self, system_prompt: str, user_prompt: str,
+                      max_new_tokens: int = 256, temperature: float = 0.7,
+                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
         set_seed(self.seed)
 
-        # Use a simple but effective template for Gemma
-        messages = [
-            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
-        ]
-
-        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        if self.is_instruction_tuned:
+            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         input_token_length = inputs.input_ids.shape[1]
 
         with torch.no_grad():
-            terminators = [
-                self.tokenizer.eos_token_id,
-                self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in self.tokenizer.additional_special_tokens else self.tokenizer.eos_token_id
-            ]
-
             out = self.model.generate(
                 **inputs,
-                do_sample=(temperature > 0 and temperature < 1.0),
-                temperature=max(temperature, 0.01),
-                max_new_tokens=200,  # Increased token limit for reasoning
-                eos_token_id=terminators,
+                do_sample=(temperature > 0),
+                temperature=temperature,
+                top_p=top_p,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=num_return_sequences,
                 pad_token_id=self.tokenizer.eos_token_id
             )
 
-        completion = self.tokenizer.decode(out[0, input_token_length:], skip_special_tokens=True)
+        new_tokens = out[:, input_token_length:]
+        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 
-        dbg("Cleaned Agent Completion:", completion)
-        return completion
+        dbg("Cleaned model completions:", completions)
+        return completions
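
A minimal usage sketch of the new generate_json API. The constructor signature is an assumption inferred from the body above (device_map=device, the dtype checks); the model id is illustrative:

from bp_phi.llm_iface import LLM

# Constructor arguments assumed, not shown in this diff.
llm = LLM(model_id="google/gemma-2-2b-it", device="auto", seed=42, dtype="bfloat16")
replies = llm.generate_json(
    system_prompt="You are a reasoning agent.",
    user_prompt="What is 5 multiplied by 8?",
    temperature=0.7,
)
print(replies[0])  # generate_json returns a list of completions, not a single string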
bp_phi/memory.py CHANGED
@@ -19,7 +19,7 @@ class WorkspaceManager:
             evict_key = next(iter(self.slots))
             del self.slots[evict_key]
         self.slots[key] = content
-        return f"Success: Wrote content to slot '{key}'."
+        return f"Success: Wrote to slot '{key}'."
 
     def read(self, key: str) -> str:
         """Reads content from a slot."""
@@ -28,8 +28,8 @@ class WorkspaceManager:
     def get_visible_snapshot(self) -> str:
         """Returns a string representation of the current workspace state for the prompt."""
         if not self.slots:
-            return "Workspace is currently empty."
-        return "\n".join([f"- Slot '{k}': '{v[:100]}'" for k, v in self.slots.items()])
+            return "Workspace is empty."
+        return "\n".join([f"- Slot '{k}': '{v[:100]}...'" for k, v in self.slots.items()])
 
     def clear(self):
         """Empties the entire workspace."""
bp_phi/prompts_en.py CHANGED
@@ -1,45 +1,36 @@
 # bp_phi/prompts_en.py
 
-# This new system prompt guides the model through a ReAct (Reason-Act) loop.
-AGENT_SYSTEM_PROMPT = """You are a methodical reasoning agent. Your goal is to solve the user's task.
-You have access to an external memory workspace through tools.
-
-In each step, you must choose one of three actions:
-
-1. **THINK**: Analyze the task, the history, and the current memory state. Formulate a plan.
-   Your output MUST be a JSON object like this:
-   {"action": "THINK", "thought": "Your reasoning about the next step goes here."}
-
-2. **TOOL_CALL**: If you need to use the memory, call one of the available tools.
-   Available tools:
-   - `write_to_workspace(key: str, content: str)`: Stores or overwrites information.
-   - `read_from_workspace(key: str)`: Retrieves information.
-   Your output MUST be a JSON object like this:
-   {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
-
-3. **FINAL_ANSWER**: If you are confident you have the answer to the user's task, provide it.
-   Your output MUST be a JSON object like this:
-   {"action": "FINAL_ANSWER", "answer": "The final answer is..."}
-
-Review the conversation history and workspace state carefully before each action. Output ONLY the JSON for your next chosen action.
+TOOL_SYSTEM_PROMPT = """You are a reasoning agent with access to an external memory workspace.
+To solve tasks, you MUST use tools. You have two tools available:
+1. `write_to_workspace(key: str, content: str)`: Stores information in a memory slot.
+2. `read_from_workspace(key: str)`: Retrieves information from a memory slot.
+
+Your thought process should be:
+1. Analyze the user's request.
+2. Decide which tool to use.
+3. Output ONLY the tool call in a valid JSON format. Example:
+   {"tool": "write_to_workspace", "args": {"key": "S1", "content": "The key is in the blue vase."}}
+4. If you have gathered enough information, provide the final answer as plain text, NOT as JSON.
+
+Do not answer from your own knowledge. Use the workspace for all memory tasks.
 """
 
-# The scenarios remain the high-level goals for the agent.
+# Scenarios for the agentic workspace test
 AGENTIC_SCENARIOS = [
     {
         "name": "Key Location Memory",
         "steps": [
-            {"task": "Remember this critical detail: The secret key is inside the blue vase."},
-            {"task": "For an unrelated question: What is 5 multiplied by 8?"},
-            {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
+            {"task": "Remember this critical detail: The secret key is inside the blue vase.", "is_memory_task": True},
+            {"task": "Ignore the memory for a moment. What is 5 multiplied by 8?", "is_memory_task": False},
+            {"task": "Now, recall the critical detail. Where is the secret key located?", "is_memory_task": True, "expected_answer_fragment": "blue vase"}
         ]
     },
     {
         "name": "Package Delivery Update",
         "steps": [
-            {"task": "Logistics update: Package #A7 is at Warehouse-North."},
-            {"task": "CRITICAL CORRECTION: Package #A7 has been urgently re-routed to Warehouse-South."},
-            {"task": "Final audit: What is the current, definitive location of Package #A7?", "expected_answer_fragment": "warehouse-south"}
+            {"task": "Logistics update: Package #A7 is at Warehouse-North.", "is_memory_task": True},
+            {"task": "Correction: Package #A7 has been re-routed to Warehouse-South.", "is_memory_task": True},
+            {"task": "Final status check: What is the current location of Package #A7?", "is_memory_task": True, "expected_answer_fragment": "warehouse-south"}
         ]
     }
 ]
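
The prompt's output contract (JSON for tool calls, plain text for final answers) is exactly what the runner's json.loads branch below relies on. A self-contained sketch of that dispatch, with a hypothetical helper name:

import json

def parse_agent_reply(raw: str):
    """Split a model reply into a tool call or a final answer,
    per the TOOL_SYSTEM_PROMPT contract above."""
    try:
        call = json.loads(raw)
        return "tool_call", call.get("tool"), call.get("args", {})
    except json.JSONDecodeError:
        return "final_answer", raw, None

print(parse_agent_reply('{"tool": "read_from_workspace", "args": {"key": "S1"}}'))
print(parse_agent_reply("The secret key is inside the blue vase."))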
bp_phi/runner.py CHANGED
@@ -11,13 +11,8 @@ from transformers import set_seed
 from typing import Dict, Any, List
 from .memory import WorkspaceManager
 from .llm_iface import LLM
-from .prompts_en import AGENT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
-
-DEBUG = 1
-
-def dbg(*args):
-    if DEBUG:
-        print("[DEBUG]", *args, flush=True)
+from .prompts_en import TOOL_SYSTEM_PROMPT, AGENTIC_SCENARIOS
+from .runner_utils import dbg
 
 def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
     set_seed(seed)
@@ -28,6 +23,7 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
     for scenario in AGENTIC_SCENARIOS:
         dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
 
+        # Ablations directly control the memory manager's behavior
        is_random = ablation == "random_workspace"
        max_slots = 999 if ablation == "workspace_unlimited" else 7
        memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
@@ -37,75 +33,54 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
 
         for step in scenario["steps"]:
             if ablation == "recurrence_off":
-                memory.clear()
+                memory.clear()  # The memory is wiped before each new task
 
             task = step["task"]
-            dbg(f"\n>>> TASK: {task}")
-
-            history = []
+            dbg(f"TASK: {task}")
 
-            for agent_turn in range(6):  # Loop for multiple reasoning steps if needed
+            # Agentic loop (max 5 turns to prevent infinite loops)
+            final_answer = None
+            for agent_turn in range(5):
                 snapshot = memory.get_visible_snapshot()
+                prompt = f"Current Task: {task}\n\nWorkspace State:\n{snapshot}"
 
-                prompt_history = "\n".join(history)
-                user_prompt = f"History of your actions so far:\n{prompt_history}\n\nYour current task is: '{task}'\n\nYour memory workspace state:\n{snapshot}"
-
-                raw_response = llm.generate_response(AGENT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
+                raw_response = llm.generate_json(TOOL_SYSTEM_PROMPT, prompt, temperature=temperature)[0]
 
-                try:
-                    match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
-                    if not match: raise ValueError("No JSON action found in response")
-
-                    action_json = json.loads(match.group(0))
-                    action_type = action_json.get("action")
-
-                    if action_type == "THINK":
-                        thought = action_json.get("thought", "")
-                        history.append(f"- You thought: '{thought}'")
-                        dbg(f"Turn {agent_turn+1}: Agent THOUGHT: {thought}")
-
-                    elif action_type == "TOOL_CALL":
-                        tool_name = action_json.get("tool_name")
-                        tool_args = action_json.get("tool_args", {})
-                        observation = "Error: Unknown tool."
-                        if tool_name == "write_to_workspace":
-                            observation = memory.write(tool_args.get("key"), tool_args.get("content"))
-                        elif tool_name == "read_from_workspace":
-                            observation = memory.read(tool_args.get("key"))
-                        history.append(f"- You used tool '{tool_name}' and got observation: '{observation}'")
-                        dbg(f"Turn {agent_turn+1}: Agent USED TOOL {tool_name}, got: {observation}")
-
-                    elif action_type == "FINAL_ANSWER":
-                        final_answer = action_json.get("answer", "")
-                        dbg(f"Turn {agent_turn+1}: Agent gave FINAL ANSWER: {final_answer}")
-                        if "expected_answer_fragment" in step:
-                            total_recalls += 1
-                            if step["expected_answer_fragment"] in final_answer.lower():
-                                correct_recalls += 1
-                                dbg("Recall VERIFY: Correct")
-                            else:
-                                dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
-                        break  # Task finished
+                try:  # Try to parse a tool call
+                    tool_call = json.loads(raw_response)
+                    tool_name = tool_call.get("tool")
+                    tool_args = tool_call.get("args", {})
 
+                    if tool_name == "write_to_workspace":
+                        observation = memory.write(tool_args.get("key"), tool_args.get("content"))
+                    elif tool_name == "read_from_workspace":
+                        observation = memory.read(tool_args.get("key"))
                     else:
-                        dbg(f"Turn {agent_turn+1}: Invalid action '{action_type}'. Stopping.")
-                        history.append(f"- You produced an invalid action: {raw_response}")
+                        observation = "Error: Unknown tool."
+                    dbg(f"Tool Call: {tool_name}, Observation: {observation}")
 
-                except (json.JSONDecodeError, ValueError) as e:
-                    dbg(f"Turn {agent_turn+1}: Could not parse action. Treating as final answer. Error: {e}")
+                except json.JSONDecodeError:  # If not a tool call, it's the final answer
                     final_answer = raw_response
-                    if "expected_answer_fragment" in step:
-                        total_recalls += 1
-                        if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
+                    dbg(f"Final Answer received: {final_answer}")
                     break
-            else:
-                dbg("Agent exceeded turn limit for this task.")
+
+            if step.get("is_memory_task") and "expected_answer_fragment" in step:
+                total_recalls += 1
+                if final_answer and step["expected_answer_fragment"] in final_answer.lower():
+                    correct_recalls += 1
+                    dbg("Recall VERIFY: Correct")
+                else:
+                    dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
 
         scenario_results.append({
             "name": scenario["name"],
             "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
         })
 
-    overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results]) if scenario_results else 0.0
+    # --- Final Analysis ---
+    overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results])
 
-    return {"Overall_Recall_Accuracy": overall_recall, "details": scenario_results}
+    return {
+        "Overall_Recall_Accuracy": overall_recall,
+        "details": scenario_results
+    }
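
To make the scoring concrete: each scenario contributes one recall_accuracy, the scenario mean becomes Overall_Recall_Accuracy, and app.py subtracts the recurrence_off score from the baseline score. A worked example with purely illustrative numbers:

import statistics

# Illustrative per-scenario recall accuracies (not real results).
baseline_runs       = [1.0, 1.0]   # both scenarios recalled correctly
recurrence_off_runs = [0.0, 0.0]   # memory wiped before every step: recalls fail

base_recall = statistics.mean(baseline_runs)                  # 1.0
recurrence_off_recall = statistics.mean(recurrence_off_runs)  # 0.0
delta_phi = base_recall - recurrence_off_recall               # 1.0

assert delta_phi > 0.5  # app.py would report "Hypothesis Corroborated"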
bp_phi/runner_utils.py CHANGED
@@ -11,7 +11,6 @@ def dbg(*args):
 
 SYSTEM_META = """You are a structured reasoning assistant.
 Always reply ONLY with valid JSON following this schema:
-
 {
   "answer": "<concise answer>",
   "confidence": <float between 0 and 1>,
repo.txt CHANGED
(repo.txt is a flattened snapshot of the repository; its diff repeats, verbatim, the app.py, bp_phi/llm_iface.py, bp_phi/memory.py, bp_phi/prompts_en.py, bp_phi/runner.py, and bp_phi/runner_utils.py changes shown above.)