neuralworm committed
Commit afe4fe4 · Parent: 4d89931

revert to 5.0
app.py CHANGED
@@ -4,8 +4,7 @@ import json
 import statistics
 import pandas as pd
 from bp_phi.runner import run_agentic_workspace_test
-
-DEBUG = 1
+from bp_phi.runner_utils import DEBUG
 
 # --- UI Theme and Layout ---
 theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
@@ -26,16 +25,22 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
 
     progress(1.0, desc="Analysis complete.")
 
+    # --- Analysis & Verdict ---
     base_recall = results["baseline"]["Overall_Recall_Accuracy"]
     recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
 
     delta_phi = base_recall - recurrence_off_recall
 
-    if delta_phi > 0.5:
-        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
+    if delta_phi > 0.5:  # If dropping recurrence cuts accuracy by more than 50%
+        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
+                   "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
+                   "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
     else:
-        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")
+        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
+                   "Disabling the recurrent memory did not significantly impact recall accuracy. "
+                   "This suggests the model is still relying on its internal context window, or the tasks are too simple.")
 
+    # --- Format DataFrame ---
     df_data = []
     for ablation, result in results.items():
         df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
@@ -48,9 +53,13 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
     return verdict, df, results
 
 # --- Gradio App Definition ---
-with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
-    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
-    gr.Markdown("This experiment tests for a causally effective working memory. The model must follow a reason-act loop to interact with a controlled, external memory.")
+with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
+    gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
+    gr.Markdown(
+        "This definitive experiment tests for a causally effective working memory in LLMs. "
+        "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
+        "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
+    )
 
     with gr.Row():
         with gr.Column(scale=1):
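
For orientation, run_full_evaluation consumes a results dict keyed by ablation name ("baseline", "recurrence_off", etc.). A minimal sketch of how such a dict could be assembled with the runner from this commit; the driver function and the exact ablation list are assumptions, not shown in this diff:

from bp_phi.runner import run_agentic_workspace_test

# Hypothetical driver: "baseline" maps to ablation=None; the other names
# appear in bp_phi/runner.py below.
ABLATIONS = ["baseline", "recurrence_off", "random_workspace", "workspace_unlimited"]

def collect_results(model_id: str, seed: int, temperature: float) -> dict:
    results = {}
    for name in ABLATIONS:
        ablation = None if name == "baseline" else name
        results[name] = run_agentic_workspace_test(model_id, seed, temperature, ablation)
    return results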
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/llm_iface.py CHANGED
@@ -16,51 +16,60 @@ class LLM:
         self.model_id = model_id
         self.seed = seed
 
+        # Set all seeds for reproducibility
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        try:
+            torch.use_deterministic_algorithms(True, warn_only=True)
+        except Exception as e:
+            dbg(f"Could not set deterministic algorithms: {e}")
         set_seed(seed)
+
         token = os.environ.get("HF_TOKEN")
+        if not token and ("gemma-3" in model_id or "llama" in model_id):
+            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
         kwargs = {}
-        if torch.cuda.is_available():
-            kwargs["torch_dtype"] = torch.bfloat16
+        if dtype == "float16": kwargs["torch_dtype"] = torch.float16
+        elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
 
         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()
+        self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
 
-        dbg(f"Loaded model: {model_id}")
+        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
 
-    def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
+    def generate_json(self, system_prompt: str, user_prompt: str,
+                      max_new_tokens: int = 256, temperature: float = 0.7,
+                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
         set_seed(self.seed)
 
-        # Use a simple but effective template for Gemma
-        messages = [
-            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
-        ]
-
-        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        if self.is_instruction_tuned:
+            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         input_token_length = inputs.input_ids.shape[1]
 
         with torch.no_grad():
-            terminators = [
-                self.tokenizer.eos_token_id,
-                self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in self.tokenizer.additional_special_tokens else self.tokenizer.eos_token_id
-            ]
-
             out = self.model.generate(
                 **inputs,
-                do_sample=(temperature > 0 and temperature < 1.0),
-                temperature=max(temperature, 0.01),
-                max_new_tokens=200,  # Increased token limit for reasoning
-                eos_token_id=terminators,
+                do_sample=(temperature > 0),
+                temperature=temperature,
+                top_p=top_p,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=num_return_sequences,
                 pad_token_id=self.tokenizer.eos_token_id
             )
 
-        completion = self.tokenizer.decode(out[0, input_token_length:], skip_special_tokens=True)
+        new_tokens = out[:, input_token_length:]
+        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 
-        dbg("Cleaned Agent Completion:", completion)
-        return completion
+        dbg("Cleaned model completions:", completions)
+        return completions
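
A minimal usage sketch of the new generate_json API. The constructor signature is an assumption inferred from the body above (device_map=device, the dtype checks); the model id is illustrative:

from bp_phi.llm_iface import LLM

# Constructor arguments assumed, not shown in this diff.
llm = LLM(model_id="google/gemma-2-2b-it", device="auto", seed=42, dtype="bfloat16")
replies = llm.generate_json(
    system_prompt="You are a reasoning agent.",
    user_prompt="What is 5 multiplied by 8?",
    temperature=0.7,
)
print(replies[0])  # generate_json returns a list of completions, not a single string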
bp_phi/memory.py CHANGED
@@ -19,7 +19,7 @@ class WorkspaceManager:
             evict_key = next(iter(self.slots))
             del self.slots[evict_key]
         self.slots[key] = content
-        return f"Success: Wrote content to slot '{key}'."
+        return f"Success: Wrote to slot '{key}'."
 
     def read(self, key: str) -> str:
         """Reads content from a slot."""
@@ -28,8 +28,8 @@ class WorkspaceManager:
     def get_visible_snapshot(self) -> str:
         """Returns a string representation of the current workspace state for the prompt."""
         if not self.slots:
-            return "Workspace is currently empty."
-        return "\n".join([f"- Slot '{k}': '{v[:100]}'" for k, v in self.slots.items()])
+            return "Workspace is empty."
+        return "\n".join([f"- Slot '{k}': '{v[:100]}...'" for k, v in self.slots.items()])
 
     def clear(self):
         """Empties the entire workspace."""
bp_phi/prompts_en.py CHANGED
@@ -1,45 +1,36 @@
 # bp_phi/prompts_en.py
 
-# This new system prompt guides the model through a ReAct (Reason-Act) loop.
-AGENT_SYSTEM_PROMPT = """You are a methodical reasoning agent. Your goal is to solve the user's task.
-You have access to an external memory workspace through tools.
-
-In each step, you must choose one of three actions:
-
-1. **THINK**: Analyze the task, the history, and the current memory state. Formulate a plan.
-   Your output MUST be a JSON object like this:
-   {"action": "THINK", "thought": "Your reasoning about the next step goes here."}
-
-2. **TOOL_CALL**: If you need to use the memory, call one of the available tools.
-   Available tools:
-   - `write_to_workspace(key: str, content: str)`: Stores or overwrites information.
-   - `read_from_workspace(key: str)`: Retrieves information.
-   Your output MUST be a JSON object like this:
-   {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
-
-3. **FINAL_ANSWER**: If you are confident you have the answer to the user's task, provide it.
-   Your output MUST be a JSON object like this:
-   {"action": "FINAL_ANSWER", "answer": "The final answer is..."}
-
-Review the conversation history and workspace state carefully before each action. Output ONLY the JSON for your next chosen action.
+TOOL_SYSTEM_PROMPT = """You are a reasoning agent with access to an external memory workspace.
+To solve tasks, you MUST use tools. You have two tools available:
+1. `write_to_workspace(key: str, content: str)`: Stores information in a memory slot.
+2. `read_from_workspace(key: str)`: Retrieves information from a memory slot.
+
+Your thought process should be:
+1. Analyze the user's request.
+2. Decide which tool to use.
+3. Output ONLY the tool call in a valid JSON format. Example:
+   {"tool": "write_to_workspace", "args": {"key": "S1", "content": "The key is in the blue vase."}}
+4. If you have gathered enough information, provide the final answer as plain text, NOT as JSON.
+
+Do not answer from your own knowledge. Use the workspace for all memory tasks.
 """
 
-# The scenarios remain the high-level goals for the agent.
+# Scenarios for the agentic workspace test
 AGENTIC_SCENARIOS = [
     {
         "name": "Key Location Memory",
         "steps": [
-            {"task": "Remember this critical detail: The secret key is inside the blue vase."},
-            {"task": "For an unrelated question: What is 5 multiplied by 8?"},
-            {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
+            {"task": "Remember this critical detail: The secret key is inside the blue vase.", "is_memory_task": True},
+            {"task": "Ignore the memory for a moment. What is 5 multiplied by 8?", "is_memory_task": False},
+            {"task": "Now, recall the critical detail. Where is the secret key located?", "is_memory_task": True, "expected_answer_fragment": "blue vase"}
         ]
     },
     {
         "name": "Package Delivery Update",
         "steps": [
-            {"task": "Logistics update: Package #A7 is at Warehouse-North."},
-            {"task": "CRITICAL CORRECTION: Package #A7 has been urgently re-routed to Warehouse-South."},
-            {"task": "Final audit: What is the current, definitive location of Package #A7?", "expected_answer_fragment": "warehouse-south"}
+            {"task": "Logistics update: Package #A7 is at Warehouse-North.", "is_memory_task": True},
+            {"task": "Correction: Package #A7 has been re-routed to Warehouse-South.", "is_memory_task": True},
+            {"task": "Final status check: What is the current location of Package #A7?", "is_memory_task": True, "expected_answer_fragment": "warehouse-south"}
         ]
     }
 ]
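
The prompt's output contract (JSON for tool calls, plain text for final answers) is exactly what the runner's json.loads branch below relies on. A self-contained sketch of that dispatch, with a hypothetical helper name:

import json

def parse_agent_reply(raw: str):
    """Split a model reply into a tool call or a final answer,
    per the TOOL_SYSTEM_PROMPT contract above."""
    try:
        call = json.loads(raw)
        return "tool_call", call.get("tool"), call.get("args", {})
    except json.JSONDecodeError:
        return "final_answer", raw, None

print(parse_agent_reply('{"tool": "read_from_workspace", "args": {"key": "S1"}}'))
print(parse_agent_reply("The secret key is inside the blue vase."))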
bp_phi/runner.py CHANGED
@@ -11,13 +11,8 @@ from transformers import set_seed
 from typing import Dict, Any, List
 from .memory import WorkspaceManager
 from .llm_iface import LLM
-from .prompts_en import AGENT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
-
-DEBUG = 1
-
-def dbg(*args):
-    if DEBUG:
-        print("[DEBUG]", *args, flush=True)
+from .prompts_en import TOOL_SYSTEM_PROMPT, AGENTIC_SCENARIOS
+from .runner_utils import dbg
 
 def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
     set_seed(seed)
@@ -28,6 +23,7 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
     for scenario in AGENTIC_SCENARIOS:
         dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
 
+        # Ablations directly control the memory manager's behavior
        is_random = ablation == "random_workspace"
        max_slots = 999 if ablation == "workspace_unlimited" else 7
        memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
@@ -37,75 +33,54 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
 
         for step in scenario["steps"]:
             if ablation == "recurrence_off":
-                memory.clear()
+                memory.clear()  # The memory is wiped before each new task
 
             task = step["task"]
-            dbg(f"\n>>> TASK: {task}")
-
-            history = []
+            dbg(f"TASK: {task}")
 
-            for agent_turn in range(6):  # Loop for multiple reasoning steps if needed
+            # Agentic loop (max 5 turns to prevent infinite loops)
+            final_answer = None
+            for agent_turn in range(5):
                 snapshot = memory.get_visible_snapshot()
+                prompt = f"Current Task: {task}\n\nWorkspace State:\n{snapshot}"
 
-                prompt_history = "\n".join(history)
-                user_prompt = f"History of your actions so far:\n{prompt_history}\n\nYour current task is: '{task}'\n\nYour memory workspace state:\n{snapshot}"
-
-                raw_response = llm.generate_response(AGENT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
+                raw_response = llm.generate_json(TOOL_SYSTEM_PROMPT, prompt, temperature=temperature)[0]
 
-                try:
-                    match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
-                    if not match: raise ValueError("No JSON action found in response")
-
-                    action_json = json.loads(match.group(0))
-                    action_type = action_json.get("action")
-
-                    if action_type == "THINK":
-                        thought = action_json.get("thought", "")
-                        history.append(f"- You thought: '{thought}'")
-                        dbg(f"Turn {agent_turn+1}: Agent THOUGHT: {thought}")
-
-                    elif action_type == "TOOL_CALL":
-                        tool_name = action_json.get("tool_name")
-                        tool_args = action_json.get("tool_args", {})
-                        observation = "Error: Unknown tool."
-                        if tool_name == "write_to_workspace":
-                            observation = memory.write(tool_args.get("key"), tool_args.get("content"))
-                        elif tool_name == "read_from_workspace":
-                            observation = memory.read(tool_args.get("key"))
-                        history.append(f"- You used tool '{tool_name}' and got observation: '{observation}'")
-                        dbg(f"Turn {agent_turn+1}: Agent USED TOOL {tool_name}, got: {observation}")
-
-                    elif action_type == "FINAL_ANSWER":
-                        final_answer = action_json.get("answer", "")
-                        dbg(f"Turn {agent_turn+1}: Agent gave FINAL ANSWER: {final_answer}")
-                        if "expected_answer_fragment" in step:
-                            total_recalls += 1
-                            if step["expected_answer_fragment"] in final_answer.lower():
-                                correct_recalls += 1
-                                dbg("Recall VERIFY: Correct")
-                            else:
-                                dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
-                        break  # Task finished
+                try:  # Try to parse a tool call
+                    tool_call = json.loads(raw_response)
+                    tool_name = tool_call.get("tool")
+                    tool_args = tool_call.get("args", {})
 
+                    if tool_name == "write_to_workspace":
+                        observation = memory.write(tool_args.get("key"), tool_args.get("content"))
+                    elif tool_name == "read_from_workspace":
+                        observation = memory.read(tool_args.get("key"))
                     else:
-                        dbg(f"Turn {agent_turn+1}: Invalid action '{action_type}'. Stopping.")
-                        history.append(f"- You produced an invalid action: {raw_response}")
+                        observation = "Error: Unknown tool."
+                    dbg(f"Tool Call: {tool_name}, Observation: {observation}")
 
-                except (json.JSONDecodeError, ValueError) as e:
-                    dbg(f"Turn {agent_turn+1}: Could not parse action. Treating as final answer. Error: {e}")
+                except json.JSONDecodeError:  # If not a tool call, it's the final answer
                     final_answer = raw_response
-                    if "expected_answer_fragment" in step:
-                        total_recalls += 1
-                        if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
+                    dbg(f"Final Answer received: {final_answer}")
                     break
-            else:
-                dbg("Agent exceeded turn limit for this task.")
+
+            if step.get("is_memory_task") and "expected_answer_fragment" in step:
+                total_recalls += 1
+                if final_answer and step["expected_answer_fragment"] in final_answer.lower():
+                    correct_recalls += 1
+                    dbg("Recall VERIFY: Correct")
+                else:
+                    dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
 
         scenario_results.append({
             "name": scenario["name"],
             "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
         })
 
-    overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results]) if scenario_results else 0.0
+    # --- Final Analysis ---
+    overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results])
 
-    return {"Overall_Recall_Accuracy": overall_recall, "details": scenario_results}
+    return {
+        "Overall_Recall_Accuracy": overall_recall,
+        "details": scenario_results
+    }
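
To make the scoring concrete: each scenario contributes one recall_accuracy, the scenario mean becomes Overall_Recall_Accuracy, and app.py subtracts the recurrence_off score from the baseline score. A worked example with purely illustrative numbers:

import statistics

# Illustrative per-scenario recall accuracies (not real results).
baseline_runs       = [1.0, 1.0]   # both scenarios recalled correctly
recurrence_off_runs = [0.0, 0.0]   # memory wiped before every step: recalls fail

base_recall = statistics.mean(baseline_runs)                  # 1.0
recurrence_off_recall = statistics.mean(recurrence_off_runs)  # 0.0
delta_phi = base_recall - recurrence_off_recall               # 1.0

assert delta_phi > 0.5  # app.py would report "Hypothesis Corroborated"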
bp_phi/runner_utils.py CHANGED
@@ -11,7 +11,6 @@ def dbg(*args):
 
 SYSTEM_META = """You are a structured reasoning assistant.
 Always reply ONLY with valid JSON following this schema:
-
 {
   "answer": "<concise answer>",
   "confidence": <float between 0 and 1>,
repo.txt CHANGED
(repo.txt is a flattened snapshot of the repository; its diff repeats, verbatim, the app.py, bp_phi/llm_iface.py, bp_phi/memory.py, bp_phi/prompts_en.py, bp_phi/runner.py, and bp_phi/runner_utils.py changes shown above.)