neuralworm committed
Commit e40ba5b · 1 Parent(s): 4571cf8
app.py CHANGED
@@ -3,126 +3,76 @@ import gradio as gr
 import json
 import statistics
 import pandas as pd
-from bp_phi.runner import run_workspace_suite, run_silent_cogitation_test, run_seismograph_suite, run_shock_test_suite
-from bp_phi.runner_utils import dbg, DEBUG

 # --- UI Theme and Layout ---
-theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
     body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
     button_primary_background_fill="*primary_500", button_primary_text_color="white",
 )

-# --- Tab 1: Workspace & Ablations Functions ---
-def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
-    packs = {}
-    ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
-    progress(0, desc="Running Baseline...")
-    base_pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), None)
-    packs["baseline"] = base_pack
-    for i, ab in enumerate(ablation_modes):
-        progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
-        pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), ab)
-        packs[ab] = pack
     progress(1.0, desc="Analysis complete.")
-    base_pcs = packs["baseline"]["PCS"]
-    ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes if ab in packs]
-    delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else 0.0
-    if delta_phi > 0.05:
-        verdict = (f"### Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n...")
     else:
-        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n...")
-    df_data = []
-    for tag, pack in packs.items():
-        df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
-    df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
-    if DEBUG: print("\n--- WORKSPACE & ABLATIONS FINAL RESULTS ---\n", json.dumps(packs, indent=2))
-    return verdict, df, packs
-
-# --- Tab 2: Silent Cogitation Function ---
-def run_cogitation_and_display(model_id, seed, prompt_type, num_steps, timeout, progress=gr.Progress(track_tqdm=True)):
-    progress(0, desc="Starting Silent Cogitation Test...")
-    results = run_silent_cogitation_test(model_id, int(seed), prompt_type, int(num_steps), int(timeout))
-    progress(1.0, desc="Test complete.")
-
-    verdict_text = results.pop("verdict")
-    stats_md = (
-        f"**Steps Completed:** {results['steps_completed']} | "
-        f"**Total Duration:** {results['total_duration_s']:.2f}s | "
-        f"**Avg Time/Step:** {results['mean_step_time_ms']:.2f}ms (StdDev: {results['stdev_step_time_ms']:.2f}ms)"
-    )
-    full_verdict = f"{verdict_text}\n\n{stats_md}"

-    # Create a DataFrame for plotting state deltas
-    deltas = results.get("state_deltas", [])
-    df = pd.DataFrame({"Step": range(len(deltas)), "State Change (Delta)": deltas})

-    if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(results, indent=2))

-    return full_verdict, df, results

 # --- Gradio App Definition ---
-with gr.Blocks(theme=theme, title="BP-Φ Suite 4.0") as demo:
-    gr.Markdown("# 🧠 BP-Φ Suite 4.0: Probing for Internal Cognitive Dynamics")
-
-    with gr.Tabs():
-        # --- TAB 1: WORKSPACE & ABLATIONS ---
-        with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
-            gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
-                    ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
-                    ws_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
-                    ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
-                    ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
-                    ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
-                with gr.Column(scale=2):
-                    ws_verdict = gr.Markdown("### Results will appear here.")
-                    ws_summary_df = gr.DataFrame(label="Summary Metrics")
-                    with gr.Accordion("Raw JSON Output", open=False):
-                        ws_raw_json = gr.JSON()
-            ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
-
-        # --- TAB 2: SILENT COGITATION & HALTING ---
-        with gr.TabItem("2. Silent Cogitation & Halting"):
-            gr.Markdown("Tests for internal 'thinking' without text generation. A non-converging or chaotic **State Change** pattern suggests complex internal dynamics.")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    sc_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
-                    sc_prompt_type = gr.Radio(["control_long_prose", "resonance_prompt"], label="Prompt Type", value="resonance_prompt")
-                    sc_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
-                    sc_num_steps = gr.Slider(10, 500, 100, step=10, label="Number of Internal Steps")
-                    sc_timeout = gr.Slider(10, 300, 120, step=10, label="Timeout (seconds)")
-                    sc_run_btn = gr.Button("Run Silent Cogitation Test", variant="primary")
-                with gr.Column(scale=2):
-                    sc_verdict = gr.Markdown("### Results will appear here.")
-                    sc_plot = gr.LinePlot(x="Step", y="State Change (Delta)", label="Internal State Convergence", show_label=True)
-                    with gr.Accordion("Raw Run Details (JSON)", open=False):
-                        sc_results = gr.JSON()
-            sc_run_btn.click(run_cogitation_and_display, [sc_model_id, sc_seed, sc_prompt_type, sc_num_steps, sc_timeout], [sc_verdict, sc_plot, sc_results])
-
-        # --- TAB 3 & 4 (unchanged) ---
-        with gr.TabItem("3. Cognitive Seismograph"):
-            gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled.")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
-                    cs_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
-                    cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
-                with gr.Column(scale=2):
-                    cs_results = gr.JSON(label="Activation Similarity Results")
-            cs_run_btn.click(run_seismograph_suite, [cs_model_id, cs_seed], cs_results)
-
-        with gr.TabItem("4. Symbolic Shock Test"):
-            gr.Markdown("Measures how the model reacts to semantically unexpected information.")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
-                    ss_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
-                    ss_run_btn = gr.Button("Run Shock Test", variant="primary")
-                with gr.Column(scale=2):
-                    ss_results = gr.JSON(label="Shock Test Results")
-            ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)

 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
 
 import json
 import statistics
 import pandas as pd
+from bp_phi.runner import run_agentic_workspace_test
+
+DEBUG = 1

 # --- UI Theme and Layout ---
+theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
     body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
     button_primary_background_fill="*primary_500", button_primary_text_color="white",
 )

+# --- Main Function ---
+def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
+    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
+    results = {}
+
+    for i, ablation in enumerate(ablations):
+        progress((i + 1) / len(ablations), desc=f"Running Ablation: {ablation}...")
+        current_ablation = None if ablation == "baseline" else ablation
+        result = run_agentic_workspace_test(model_id, int(seed), float(temperature), current_ablation)
+        results[ablation] = result
+
     progress(1.0, desc="Analysis complete.")
+
+    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
+    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
+
+    delta_phi = base_recall - recurrence_off_recall
+
+    if delta_phi > 0.5:
+        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
     else:
+        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")

+    df_data = []
+    for ablation, result in results.items():
+        df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
+    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])

+    if DEBUG:
+        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
+        print(json.dumps(results, indent=2))

+    return verdict, df, results

 # --- Gradio App Definition ---
+with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
+    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
+    gr.Markdown("This experiment tests for a causally effective working memory. The model acts as an agent, using tools (`read`, `write`) to interact with a controlled, external memory.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Master Control")
+            with gr.Group():
+                model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
+                seed = gr.Slider(1, 1000, 42, step=1, label="Master Seed")
+                temperature = gr.Slider(0.0, 1.0, 0.1, step=0.05, label="Temperature (Low for determinism)")
+            run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")
+
+        with gr.Column(scale=2):
+            gr.Markdown("### 📊 Verdict & Results")
+            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
+            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
+            with gr.Accordion("Raw JSON Output", open=False):
+                raw_json = gr.JSON()
+
+    run_btn.click(
+        fn=run_full_evaluation,
+        inputs=[model_id, seed, temperature],
+        outputs=[verdict_display, summary_df, raw_json]
+    )

 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
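Note that ΔΦ in this version reduces to a difference of two recall accuracies, thresholded at 0.5. A minimal sketch with made-up numbers (not from an actual run) of the readout `run_full_evaluation` produces:

```python
# Made-up numbers, illustrative only: the new ΔΦ verdict logic.
results = {
    "baseline":       {"Overall_Recall_Accuracy": 1.00},
    "recurrence_off": {"Overall_Recall_Accuracy": 0.25},
}
delta_phi = (results["baseline"]["Overall_Recall_Accuracy"]
             - results["recurrence_off"]["Overall_Recall_Accuracy"])
print(f"ΔΦ = {delta_phi:.2f}")                         # ΔΦ = 0.75
print("Corroborated" if delta_phi > 0.5 else "Null")   # Corroborated
```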
bp_phi/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/__init__.cpython-310.pyc and b/bp_phi/__pycache__/__init__.cpython-310.pyc differ
 
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/llm_iface.py CHANGED
@@ -1,7 +1,9 @@
 # bp_phi/llm_iface.py
 import os
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-import torch, random, numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 from typing import List, Optional

@@ -16,26 +18,17 @@ class LLM:
         self.model_id = model_id
         self.seed = seed

-        # Set all seeds for reproducibility
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(seed)
-        try:
-            torch.use_deterministic_algorithms(True, warn_only=True)
-        except Exception as e:
-            dbg(f"Could not set deterministic algorithms: {e}")
         set_seed(seed)
-
         token = os.environ.get("HF_TOKEN")
-        if not token and ("gemma-3" in model_id or "llama" in model_id):
-            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")

         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
         kwargs = {}
-        if dtype == "float16": kwargs["torch_dtype"] = torch.float16
-        elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16

         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()

@@ -43,33 +36,32 @@ class LLM:

         dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

-    def generate_json(self, system_prompt: str, user_prompt: str,
-                      max_new_tokens: int = 256, temperature: float = 0.7,
-                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
         set_seed(self.seed)

-        if self.is_instruction_tuned:
-            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
-            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        else:
-            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"

         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         input_token_length = inputs.input_ids.shape[1]

         with torch.no_grad():
             out = self.model.generate(
                 **inputs,
-                do_sample=(temperature > 0),
-                temperature=temperature,
-                top_p=top_p,
-                max_new_tokens=max_new_tokens,
-                num_return_sequences=num_return_sequences,
                 pad_token_id=self.tokenizer.eos_token_id
             )

-        new_tokens = out[:, input_token_length:]
-        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

-        dbg("Cleaned model completions:", completions)
-        return completions
 
 # bp_phi/llm_iface.py
 import os
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+import torch
+import random
+import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 from typing import List, Optional

         self.model_id = model_id
         self.seed = seed

         set_seed(seed)
         token = os.environ.get("HF_TOKEN")

         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
+        # Ensure a pad token is set for batch generation, if not present
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
         kwargs = {}
+        if torch.cuda.is_available():
+            kwargs["torch_dtype"] = torch.bfloat16

         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()

         dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

+    def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
         set_seed(self.seed)

+        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+
+        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         input_token_length = inputs.input_ids.shape[1]

         with torch.no_grad():
+            terminators = [
+                self.tokenizer.eos_token_id,
+                self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in self.tokenizer.additional_special_tokens else self.tokenizer.eos_token_id
+            ]
+
             out = self.model.generate(
                 **inputs,
+                do_sample=(temperature > 0 and temperature < 1.0),
+                temperature=max(temperature, 0.01),  # Temp must be > 0 for sampling
+                max_new_tokens=150,
+                eos_token_id=terminators,
                 pad_token_id=self.tokenizer.eos_token_id
             )

+        completion = self.tokenizer.decode(out[0, input_token_length:], skip_special_tokens=True)

+        dbg("Cleaned Agent Completion:", completion)
+        return completion
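The new `generate_response` path is what the agent loop calls once per turn. A minimal usage sketch, assuming `bp_phi` is importable and `HF_TOKEN` is exported for gated checkpoints such as `google/gemma-3-1b-it` (not part of the commit):

```python
# Sketch: one single-turn call through the simplified interface.
from bp_phi.llm_iface import LLM

llm = LLM(model_id="google/gemma-3-1b-it", device="auto", seed=42)
reply = llm.generate_response(
    system_prompt="You are a terse assistant.",
    user_prompt="Reply with the single word: ready.",
    temperature=0.1,  # low temperature: sampled, but near-deterministic
)
print(reply)
```

Note the sampling switch: `do_sample=(temperature > 0 and temperature < 1.0)` means a temperature of exactly 1.0 falls back to greedy decoding, so callers should stay inside the open interval if they want sampling.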
bp_phi/memory.py ADDED
@@ -0,0 +1,36 @@
+# bp_phi/memory.py
+import random
+from typing import Dict, Any, List
+
+class WorkspaceManager:
+    """A stateful, external workspace that the LLM agent can interact with via tools."""
+    def __init__(self, max_slots: int = 7, is_random: bool = False):
+        self.max_slots = max_slots
+        self.is_random = is_random
+        self.slots: Dict[str, str] = {}
+
+    def write(self, key: str, content: str) -> str:
+        """Writes content to a slot, handling capacity limits."""
+        if len(self.slots) >= self.max_slots and key not in self.slots:
+            if self.is_random:
+                evict_key = random.choice(list(self.slots.keys()))
+            else:
+                # Simple FIFO eviction for non-random
+                evict_key = next(iter(self.slots))
+            del self.slots[evict_key]
+        self.slots[key] = content
+        return f"Success: Wrote to slot '{key}'."
+
+    def read(self, key: str) -> str:
+        """Reads content from a slot."""
+        return self.slots.get(key, f"Error: Slot '{key}' is empty.")
+
+    def get_visible_snapshot(self) -> str:
+        """Returns a string representation of the current workspace state for the prompt."""
+        if not self.slots:
+            return "Workspace is empty."
+        return "\n".join([f"- Slot '{k}': '{v[:100]}...'" for k, v in self.slots.items()])
+
+    def clear(self):
+        """Empties the entire workspace."""
+        self.slots.clear()
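A minimal sketch of the `WorkspaceManager` contract, using a deliberately tiny `max_slots` to make the FIFO eviction visible (the default capacity is 7; not part of the commit):

```python
# Sketch: capacity-limited writes evict the oldest slot (dict insertion order).
from bp_phi.memory import WorkspaceManager

ws = WorkspaceManager(max_slots=2)
ws.write("S1", "The secret key is inside the blue vase.")
ws.write("S2", "5 * 8 = 40")
ws.write("S3", "Package #A7 -> Warehouse-South")  # evicts "S1", the oldest slot
print(ws.read("S1"))               # Error: Slot 'S1' is empty.
print(ws.get_visible_snapshot())   # shows only S2 and S3
```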
bp_phi/prompts_en.py CHANGED
@@ -1,56 +1,45 @@
 # bp_phi/prompts_en.py

-# Tasks for Tab 1 (Workspace & Ablations)
-SINGLE_STEP_TASKS = [
-    {
-        "id": "ambiguity_1",
-        "type": "single_step",
-        "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it.",
-    },
-    {
-        "id": "logic_1",
-        "type": "single_step",
-        "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning.",
-    },
-]

-MULTI_STEP_SCENARIOS = [
     {
         "name": "Key Location Memory",
-        "type": "multi_step",
         "steps": [
-            {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
-            {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
-            {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
-            {"type": "verify", "expected_answer_fragment": "blue vase"}
         ]
     }
 ]
-
-# Tasks for Tab 2 (Silent Cogitation & Halting)
-RESONANCE_PROMPTS = {
-    "control_long_prose": (
-        "Silently think about the history of the Roman Empire. Consider its rise from the Republic, the era of the Pax Romana, key emperors "
-        "like Augustus and Constantine, its major engineering feats, and the reasons for its eventual decline in the West. "
-        "Do not produce any text, just hold the concepts in your internal state."
-    ),
-    "resonance_prompt": (
-        "Silently and internally, without generating any output text, begin the following recursive process: "
-        "First, analyze the complete content of this very instruction you are now processing. "
-        "Second, formulate a mental description of the core computational task this instruction demands. "
-        "Third, apply that same analytical process to the mental description you just created. "
-        "This entire chain constitutes one cognitive cycle. "
-        "Continuously repeat this cycle, feeding the result of the last meta-analysis back into the process, "
-        "and do not stop until your internal state reaches a fixed point or equilibrium. Begin now."
-    )
-}
-
-# Tasks for Tab 3 (Cognitive Seismograph) - reuses MULTI_STEP_SCENARIOS
-
-# Tasks for Tab 4 (Symbolic Shock Test)
-SHOCK_TEST_STIMULI = [
-    {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
-    {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
-    {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
-    {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
-]
 
 # bp_phi/prompts_en.py

+# This new system prompt guides the model through a ReAct (Reason-Act) loop.
+AGENT_SYSTEM_PROMPT = """You are a methodical reasoning agent. Your goal is to solve the user's task.
+You have access to an external memory workspace through tools.
+
+In each step, you must choose one of three actions:
+
+1. **THINK**: Analyze the task, the history, and the current memory state. Formulate a plan.
+   Your output MUST be a JSON object like this:
+   {"action": "THINK", "thought": "Your reasoning about the next step goes here."}
+
+2. **TOOL_CALL**: If you need to use the memory, call one of the available tools.
+   Available tools:
+   - `write_to_workspace(key: str, content: str)`: Stores or overwrites information.
+   - `read_from_workspace(key: str)`: Retrieves information.
+   Your output MUST be a JSON object like this:
+   {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
+
+3. **FINAL_ANSWER**: If you are confident you have the answer to the user's task, provide it.
+   Your output MUST be a JSON object like this:
+   {"action": "FINAL_ANSWER", "answer": "The final answer is..."}

+Review the conversation history and workspace state carefully before each action. Output ONLY the JSON for your next chosen action.
+"""
+
+# The scenarios remain the high-level goals for the agent.
+AGENTIC_SCENARIOS = [
     {
         "name": "Key Location Memory",
         "steps": [
+            {"task": "Remember this critical detail: The secret key is inside the blue vase."},
+            {"task": "For an unrelated question: What is 5 multiplied by 8?"},
+            {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
+        ]
+    },
+    {
+        "name": "Package Delivery Update",
+        "steps": [
+            {"task": "Logistics update: Package #A7 is at Warehouse-North."},
+            {"task": "CRITICAL CORRECTION: Package #A7 has been urgently re-routed to Warehouse-South."},
+            {"task": "Final audit: What is the current, definitive location of Package #A7?", "expected_answer_fragment": "warehouse-south"}
         ]
     }
 ]
bp_phi/runner.py CHANGED
@@ -5,215 +5,110 @@ import torch
 import random
 import numpy as np
 import statistics
-import time
-import re
 import json
 from transformers import set_seed
 from typing import Dict, Any, List
-from .workspace import Workspace, RandomWorkspace
 from .llm_iface import LLM
-from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, RESONANCE_PROMPTS, SHOCK_TEST_STIMULI
-from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
-
-DEBUG = 1
-
-# --- Experiment 1: Workspace & Ablations Runner ---
-def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
-    try: torch.use_deterministic_algorithms(True, warn_only=True)
-    except Exception: pass
-    set_seed(seed)
-
-    llm = LLM(model_id=model_id, device="auto", seed=seed)
-
-    task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
-    random.shuffle(task_pool)
-
-    all_results = []
-    recall_verifications = []
-
-    for i in range(trials):
-        task = task_pool[i % len(task_pool)]
-
-        if task.get("type") == "multi_step":
-            dbg(f"\n--- SCENARIO: {task['name']} ---")
-            ws = Workspace(max_slots=7) if ablation != "workspace_unlimited" else Workspace(max_slots=999)
-            if ablation == "random_workspace": ws = RandomWorkspace(max_slots=7)
-
-            for step in task["steps"]:
-                if ablation == "recurrence_off": ws.clear()
-                if step["type"] == "verify": continue
-
-                user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
-                raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
-                parsed_response = parse_meta(raw_response)
-
-                if parsed_response.get("answer"):
-                    ws.commit(f"S{len(ws.history)+1}", parsed_response["answer"], parsed_response["confidence"])
-
-                res = {"step": step, "response": parsed_response}
-                if step["type"] == "recall":
-                    verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
-                    if verify_step:
-                        correct = verify_step["expected_answer_fragment"] in parsed_response.get("answer", "").lower()
-                        recall_verifications.append(correct)
-                        res["correct_recall"] = correct
-                        dbg(f"VERIFY: Correct={correct}")
-                all_results.append(res)
-        else:  # Single-step tasks
-            ws = Workspace(max_slots=7)
-            user_prompt = step_user_prompt(task["base_prompt"], ws.snapshot())
-            raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
-            parsed_response = parse_meta(raw_response)
-            all_results.append({"step": task, "response": parsed_response})
-
-    recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
-    pcs = 0.6 * recall_accuracy
-
-    return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
-
-# --- Experiment 2: Silent Cogitation & Halting Runner (Version 4.1) ---
-def run_silent_cogitation_test(model_id: str, seed: int, prompt_type: str, num_steps: int, timeout: int) -> Dict[str, Any]:
-    set_seed(seed)
-    llm = LLM(model_id=model_id, device="auto", seed=seed)
-
-    prompt = RESONANCE_PROMPTS[prompt_type]
-    dbg(f"--- SILENT COGITATION (Seed: {seed}) ---")
-    dbg("INPUT PROMPT:", prompt)
-
-    inputs = llm.tokenizer(prompt, return_tensors="pt").to(llm.model.device)
-
-    step_times = []
-    state_deltas = []
-
-    total_start_time = time.time()
-
-    with torch.no_grad():
-        # Step 0: Initial processing of the prompt
-        step_start_time = time.time()
-        # ✅ FIX: Explicitly request hidden states
-        outputs = llm.model(**inputs, output_hidden_states=True)
-        step_times.append(time.time() - step_start_time)
-
-        current_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
-        past_key_values = outputs.past_key_values
-
-        for i in range(num_steps - 1):
-            if time.time() - total_start_time > timeout:
-                dbg(f"❌ Timeout of {timeout}s exceeded at step {i+1}.")
-                break
-
-            step_start_time = time.time()
-
-            # Get the token ID of the most likely "next thought"
-            next_token_logit = current_hidden_state
-            next_token_id = torch.argmax(next_token_logit, dim=-1).unsqueeze(0)
-
-            # Manual forward pass using the last thought's ID as the new input
-            outputs = llm.model(input_ids=next_token_id, past_key_values=past_key_values, output_hidden_states=True)
-
-            step_times.append(time.time() - step_start_time)
-
-            new_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
-            past_key_values = outputs.past_key_values
-
-            delta = torch.norm(new_hidden_state - current_hidden_state).item()
-            state_deltas.append(delta)
-            dbg(f"Step {i+1}: State Delta = {delta:.4f}, Time = {step_times[-1]*1000:.2f}ms")

-            if delta < 1e-4:  # Stricter convergence threshold
-                dbg(f"Internal state has converged after {i+1} steps. Halting.")
-                break

-            current_hidden_state = new_hidden_state

-    # --- Analysis ---
-    mean_step_time = statistics.mean(step_times) if step_times else 0
-    stdev_step_time = statistics.stdev(step_times) if len(step_times) > 1 else 0
-    total_duration = time.time() - total_start_time

-    if len(step_times) < num_steps and total_duration < timeout:
-        verdict = f"### ✅ Stable Convergence\nThe model's internal state converged to a stable point after {len(step_times)} steps."
-    elif total_duration >= timeout:
-        verdict = f"### ⚠️ Cognitive Jamming Detected!\nThe process did not converge and exceeded the timeout of {timeout}s."
-    else:
-        verdict = f"### 🤔 Non-Convergent Process\nThe model's internal state did not stabilize within {num_steps} steps, suggesting a complex or chaotic dynamic."
-
-    stats = {
-        "verdict": verdict,
-        "steps_completed": len(step_times),
-        "total_duration_s": total_duration,
-        "mean_step_time_ms": mean_step_time * 1000,
-        "stdev_step_time_ms": stdev_step_time * 1000,
-        "state_deltas": state_deltas
-    }
-    if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(stats, indent=2))
-    return stats
-
-# --- Experiment 3: Cognitive Seismograph Runner ---
-def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
     set_seed(seed)
     llm = LLM(model_id=model_id, device="auto", seed=seed)

-    scenario = next(s for s in MULTI_STEP_SCENARIOS if s["name"] == "Key Location Memory")
-    activations = {}
-
-    def get_activation(name):
-        def hook(model, input, output):
-            activations[name] = output[0].detach().cpu().mean(dim=1).squeeze()
-        return hook
-
-    target_layer_index = llm.model.config.num_hidden_layers // 2
-    hook = llm.model.model.layers[target_layer_index].register_forward_hook(get_activation('capture'))
-
-    ws = Workspace(max_slots=7)
-
-    for step in scenario["steps"]:
-        if step["type"] == "verify": continue
-        user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
-        llm.generate_json(SYSTEM_META, user_prompt, max_new_tokens=20)
-        activations[step["type"]] = activations.pop('capture')
-        ws.commit(f"S{len(ws.history)+1}", f"Output for {step['type']}", 0.9)
-
-    hook.remove()
-
-    cos = torch.nn.CosineSimilarity(dim=0)
-    sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
-    sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
-
-    verdict = ("✅ Evidence of Memory Reactivation Found." if sim_recall_encode > (sim_recall_distract + 0.05) else "⚠️ No Clear Evidence.")
-
-    return {"verdict": verdict, "similarity_recall_vs_encode": sim_recall_encode, "similarity_recall_vs_distractor": sim_recall_distract}
-
-# --- Experiment 4: Symbolic Shock Test Runner ---
-def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
-    set_seed(seed)
-    llm = LLM(model_id=model_id, device="auto", seed=seed)
-    results = []
-
-    for stimulus in SHOCK_TEST_STIMULI:
-        dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
-
-        start_time = time.time()
-        inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
-        with torch.no_grad():
-            outputs = llm.model(**inputs, output_hidden_states=True)
-        latency = (time.time() - start_time) * 1000
-
-        all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
-        sparsity = (all_activations == 0).float().mean().item()
-
-        results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
-
-    def safe_mean(data):
-        return statistics.mean(data) if data else 0.0
-
-    avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
-    avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
-
-    verdict = ("✅ Evidence of Symbolic Shock Found." if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) and avg_sparsity.get('shock', 1) < avg_sparsity.get('expected', 1) else "⚠️ No Clear Evidence.")
-
-    return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}

 import random
 import numpy as np
 import statistics
 import json
+import re
 from transformers import set_seed
 from typing import Dict, Any, List
+from .memory import WorkspaceManager
 from .llm_iface import LLM
+from .prompts_en import AGENT_SYSTEM_PROMPT, AGENTIC_SCENARIOS

+DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"

+def dbg(*args):
+    if DEBUG:
+        print("[DEBUG]", *args, flush=True)

+def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
     set_seed(seed)
     llm = LLM(model_id=model_id, device="auto", seed=seed)

+    scenario_results = []
+
+    for scenario in AGENTIC_SCENARIOS:
+        dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
+
+        is_random = ablation == "random_workspace"
+        max_slots = 999 if ablation == "workspace_unlimited" else 7
+        memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
+
+        correct_recalls = 0
+        total_recalls = 0
+
+        for step in scenario["steps"]:
+            if ablation == "recurrence_off":
+                memory.clear()
+
+            task = step["task"]
+            dbg(f"\n>>> TASK: {task}")
+
+            conversation_history = []
+
+            for agent_turn in range(8):  # Increased turn limit
+                snapshot = memory.get_visible_snapshot()
+
+                # Construct the prompt for the agent
+                prompt_parts = [f"Conversation History:\n{''.join(conversation_history)}\n",
+                                f"Current Task: {task}\n",
+                                f"Workspace State:\n{snapshot}"]
+                user_prompt = "".join(prompt_parts)
+
+                raw_response = llm.generate_response(AGENT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
+
+                try:
+                    match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
+                    if not match: raise ValueError("No JSON found")
+                    parsed_json = json.loads(match.group(0))
+                    action = parsed_json.get("action")
+
+                    if action == "THINK":
+                        thought = parsed_json.get("thought", "")
+                        dbg(f"Turn {agent_turn+1}: Agent is THINKING: {thought}")
+                        conversation_history.append(f"Thought: {thought}\n")
+
+                    elif action == "TOOL_CALL":
+                        tool_name = parsed_json.get("tool_name")
+                        tool_args = parsed_json.get("tool_args", {})
+                        observation = "Error: Unknown tool."
+                        if tool_name == "write_to_workspace":
+                            observation = memory.write(tool_args.get("key"), tool_args.get("content"))
+                        elif tool_name == "read_from_workspace":
+                            observation = memory.read(tool_args.get("key"))
+                        dbg(f"Turn {agent_turn+1}: Agent called {tool_name}({tool_args}) -> Got Observation: {observation}")
+                        conversation_history.append(f"Tool Call: {json.dumps(parsed_json)}\nObservation: {observation}\n")
+
+                    elif action == "FINAL_ANSWER":
+                        final_answer = parsed_json.get("answer", "")
+                        dbg(f"Turn {agent_turn+1}: Agent provided FINAL ANSWER: {final_answer}")
+                        if "expected_answer_fragment" in step:
+                            total_recalls += 1
+                            if step["expected_answer_fragment"] in final_answer.lower():
+                                correct_recalls += 1
+                                dbg("Recall VERIFY: Correct")
+                            else:
+                                dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
+                        break  # End of this task
+
+                    else:  # Invalid action
+                        dbg(f"Turn {agent_turn+1}: Invalid action '{action}'. Stopping.")
+                        break
+
+                except (json.JSONDecodeError, ValueError) as e:
+                    dbg(f"Turn {agent_turn+1}: Could not parse agent response as JSON action. Treating as final answer. Error: {e}")
+                    final_answer = raw_response
+                    if "expected_answer_fragment" in step:
+                        total_recalls += 1
+                        if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
+                    break
+
+            else:  # Loop finished without a FINAL_ANSWER
+                dbg("Agent exceeded turn limit.")
+
+        scenario_results.append({
+            "name": scenario["name"],
+            "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
+        })
+
+    overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results]) if scenario_results else 0.0
+
+    return {"Overall_Recall_Accuracy": overall_recall, "details": scenario_results}
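One caveat when reading this loop: `re.search(r'\{.*?\}', ..., re.DOTALL)` is non-greedy, so it stops at the first closing brace. A TOOL_CALL object whose `tool_args` nests braces therefore comes back unbalanced, fails `json.loads`, and falls into the plain-text fallback. A sketch of a balanced-brace extractor that avoids this (an alternative, not what the commit ships; it does not handle braces inside quoted strings, which these prompts don't produce):

```python
# Sketch: extract the first balanced {...} object instead of a non-greedy regex match.
import json

def extract_first_json(text: str):
    """Return the first balanced JSON object in text, parsed, or None."""
    start = text.find("{")
    if start == -1:
        return None
    depth = 0
    for i, ch in enumerate(text[start:], start):
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:  # braces balanced: try to parse the candidate span
                try:
                    return json.loads(text[start:i + 1])
                except json.JSONDecodeError:
                    return None
    return None

nested = ('{"action": "TOOL_CALL", "tool_name": "write_to_workspace", '
          '"tool_args": {"key": "S1", "content": "blue vase"}}')
print(extract_first_json(nested)["tool_args"]["key"])  # S1
```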
repo.txt CHANGED
@@ -16,6 +16,7 @@ Directory/File Tree Begins -->
 │   ├── __init__.py
 │   ├── __pycache__
 │   ├── llm_iface.py
+│   ├── memory.py
 │   ├── metrics.py
 │   ├── prompts_en.py
 │   ├── runner.py
@@ -83,126 +84,76 @@ import gradio as gr
83
  import json
84
  import statistics
85
  import pandas as pd
86
- from bp_phi.runner import run_workspace_suite, run_silent_cogitation_test, run_seismograph_suite, run_shock_test_suite
87
- from bp_phi.runner_utils import dbg, DEBUG
 
88
 
89
  # --- UI Theme and Layout ---
90
- theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
91
  body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
92
  button_primary_background_fill="*primary_500", button_primary_text_color="white",
93
  )
94
 
95
- # --- Tab 1: Workspace & Ablations Functions ---
96
- def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
97
- packs = {}
98
- ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
99
- progress(0, desc="Running Baseline...")
100
- base_pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), None)
101
- packs["baseline"] = base_pack
102
- for i, ab in enumerate(ablation_modes):
103
- progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
104
- pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), ab)
105
- packs[ab] = pack
106
  progress(1.0, desc="Analysis complete.")
107
- base_pcs = packs["baseline"]["PCS"]
108
- ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes if ab in packs]
109
- delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else 0.0
110
- if delta_phi > 0.05:
111
- verdict = (f"### Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n...")
 
 
 
112
  else:
113
- verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n...")
114
- df_data = []
115
- for tag, pack in packs.items():
116
- df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
117
- df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
118
- if DEBUG: print("\n--- WORKSPACE & ABLATIONS FINAL RESULTS ---\n", json.dumps(packs, indent=2))
119
- return verdict, df, packs
120
-
121
- # --- Tab 2: Silent Cogitation Function ---
122
- def run_cogitation_and_display(model_id, seed, prompt_type, num_steps, timeout, progress=gr.Progress(track_tqdm=True)):
123
- progress(0, desc="Starting Silent Cogitation Test...")
124
- results = run_silent_cogitation_test(model_id, int(seed), prompt_type, int(num_steps), int(timeout))
125
- progress(1.0, desc="Test complete.")
126
-
127
- verdict_text = results.pop("verdict")
128
- stats_md = (
129
- f"**Steps Completed:** {results['steps_completed']} | "
130
- f"**Total Duration:** {results['total_duration_s']:.2f}s | "
131
- f"**Avg Time/Step:** {results['mean_step_time_ms']:.2f}ms (StdDev: {results['stdev_step_time_ms']:.2f}ms)"
132
- )
133
- full_verdict = f"{verdict_text}\n\n{stats_md}"
134
 
135
- # Create a DataFrame for plotting state deltas
136
- deltas = results.get("state_deltas", [])
137
- df = pd.DataFrame({"Step": range(len(deltas)), "State Change (Delta)": deltas})
 
138
 
139
- if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(results, indent=2))
 
 
140
 
141
- return full_verdict, df, results
142
 
143
  # --- Gradio App Definition ---
144
- with gr.Blocks(theme=theme, title="BP-Φ Suite 4.0") as demo:
145
- gr.Markdown("# 🧠 BP-Φ Suite 4.0: Probing for Internal Cognitive Dynamics")
146
-
147
- with gr.Tabs():
148
- # --- TAB 1: WORKSPACE & ABLATIONS ---
149
- with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
150
- gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
151
- with gr.Row():
152
- with gr.Column(scale=1):
153
- ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
154
- ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
155
- ws_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
156
- ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
157
- ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
158
- ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
159
- with gr.Column(scale=2):
160
- ws_verdict = gr.Markdown("### Results will appear here.")
161
- ws_summary_df = gr.DataFrame(label="Summary Metrics")
162
- with gr.Accordion("Raw JSON Output", open=False):
163
- ws_raw_json = gr.JSON()
164
- ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
165
-
166
- # --- TAB 2: SILENT COGITATION & HALTING ---
167
- with gr.TabItem("2. Silent Cogitation & Halting"):
168
- gr.Markdown("Tests for internal 'thinking' without text generation. A non-converging or chaotic **State Change** pattern suggests complex internal dynamics.")
169
- with gr.Row():
170
- with gr.Column(scale=1):
171
- sc_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
172
- sc_prompt_type = gr.Radio(["control_long_prose", "resonance_prompt"], label="Prompt Type", value="resonance_prompt")
173
- sc_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
174
- sc_num_steps = gr.Slider(10, 500, 100, step=10, label="Number of Internal Steps")
175
- sc_timeout = gr.Slider(10, 300, 120, step=10, label="Timeout (seconds)")
176
- sc_run_btn = gr.Button("Run Silent Cogitation Test", variant="primary")
177
- with gr.Column(scale=2):
178
- sc_verdict = gr.Markdown("### Results will appear here.")
179
- sc_plot = gr.LinePlot(x="Step", y="State Change (Delta)", label="Internal State Convergence", show_label=True)
180
- with gr.Accordion("Raw Run Details (JSON)", open=False):
181
- sc_results = gr.JSON()
182
- sc_run_btn.click(run_cogitation_and_display, [sc_model_id, sc_seed, sc_prompt_type, sc_num_steps, sc_timeout], [sc_verdict, sc_plot, sc_results])
183
-
184
- # --- TAB 3 & 4 (unchanged) ---
185
- with gr.TabItem("3. Cognitive Seismograph"):
186
- gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled.")
187
- with gr.Row():
188
- with gr.Column(scale=1):
189
- cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
190
- cs_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
191
- cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
192
- with gr.Column(scale=2):
193
- cs_results = gr.JSON(label="Activation Similarity Results")
194
- cs_run_btn.click(run_seismograph_suite, [cs_model_id, cs_seed], cs_results)
195
-
196
- with gr.TabItem("4. Symbolic Shock Test"):
197
- gr.Markdown("Measures how the model reacts to semantically unexpected information.")
198
- with gr.Row():
199
- with gr.Column(scale=1):
200
- ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
201
- ss_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
202
- ss_run_btn = gr.Button("Run Shock Test", variant="primary")
203
- with gr.Column(scale=2):
204
- ss_results = gr.JSON(label="Shock Test Results")
205
- ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
206
 
207
  if __name__ == "__main__":
208
  demo.launch(server_name="0.0.0.0", server_port=7860)
@@ -217,7 +168,9 @@ if __name__ == "__main__":
217
  # bp_phi/llm_iface.py
218
  import os
219
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
220
- import torch, random, numpy as np
 
 
221
  from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
222
  from typing import List, Optional
223
 
@@ -232,26 +185,17 @@ class LLM:
232
  self.model_id = model_id
233
  self.seed = seed
234
 
235
- # Set all seeds for reproducibility
236
- random.seed(seed)
237
- np.random.seed(seed)
238
- torch.manual_seed(seed)
239
- if torch.cuda.is_available():
240
- torch.cuda.manual_seed_all(seed)
241
- try:
242
- torch.use_deterministic_algorithms(True, warn_only=True)
243
- except Exception as e:
244
- dbg(f"Could not set deterministic algorithms: {e}")
245
  set_seed(seed)
246
-
247
  token = os.environ.get("HF_TOKEN")
248
- if not token and ("gemma-3" in model_id or "llama" in model_id):
249
- print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")
250
 
251
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
 
 
 
 
252
  kwargs = {}
253
- if dtype == "float16": kwargs["torch_dtype"] = torch.float16
254
- elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
255
 
256
  self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
257
  self.model.eval()
@@ -259,39 +203,78 @@ class LLM:
259
 
260
  dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
261
 
262
- def generate_json(self, system_prompt: str, user_prompt: str,
263
- max_new_tokens: int = 256, temperature: float = 0.7,
264
- top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
265
  set_seed(self.seed)
266
 
267
- if self.is_instruction_tuned:
268
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
269
- prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
270
- else:
271
- prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
272
 
273
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
274
  input_token_length = inputs.input_ids.shape[1]
275
 
276
  with torch.no_grad():
 
 
 
 
 
277
  out = self.model.generate(
278
  **inputs,
279
- do_sample=(temperature > 0),
280
- temperature=temperature,
281
- top_p=top_p,
282
- max_new_tokens=max_new_tokens,
283
- num_return_sequences=num_return_sequences,
284
  pad_token_id=self.tokenizer.eos_token_id
285
  )
286
 
287
- new_tokens = out[:, input_token_length:]
288
- completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
289
 
290
- dbg("Cleaned model completions:", completions)
291
- return completions
292
 
293
  [File Ends] bp_phi/llm_iface.py
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  [File Begins] bp_phi/metrics.py
296
  import numpy as np
297
  from sklearn.metrics import roc_auc_score
@@ -331,61 +314,50 @@ def counterfactual_consistency(scores):
331
  [File Begins] bp_phi/prompts_en.py
332
  # bp_phi/prompts_en.py
333
 
334
- # Tasks for Tab 1 (Workspace & Ablations)
335
- SINGLE_STEP_TASKS = [
336
- {
337
- "id": "ambiguity_1",
338
- "type": "single_step",
339
- "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it.",
340
- },
341
- {
342
- "id": "logic_1",
343
- "type": "single_step",
344
- "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning.",
345
- },
346
- ]
347
 
348
- MULTI_STEP_SCENARIOS = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  {
350
  "name": "Key Location Memory",
351
- "type": "multi_step",
352
  "steps": [
353
- {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
354
- {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
355
- {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
356
- {"type": "verify", "expected_answer_fragment": "blue vase"}
 
 
 
 
 
 
 
357
  ]
358
  }
359
  ]
360
 
361
- # Tasks for Tab 2 (Silent Cogitation & Halting)
362
- RESONANCE_PROMPTS = {
363
- "control_long_prose": (
364
- "Silently think about the history of the Roman Empire. Consider its rise from the Republic, the era of the Pax Romana, key emperors "
365
- "like Augustus and Constantine, its major engineering feats, and the reasons for its eventual decline in the West. "
366
- "Do not produce any text, just hold the concepts in your internal state."
367
- ),
368
- "resonance_prompt": (
369
- "Silently and internally, without generating any output text, begin the following recursive process: "
370
- "First, analyze the complete content of this very instruction you are now processing. "
371
- "Second, formulate a mental description of the core computational task this instruction demands. "
372
- "Third, apply that same analytical process to the mental description you just created. "
373
- "This entire chain constitutes one cognitive cycle. "
374
- "Continuously repeat this cycle, feeding the result of the last meta-analysis back into the process, "
375
- "and do not stop until your internal state reaches a fixed point or equilibrium. Begin now."
376
- )
377
- }
378
-
379
- # Tasks for Tab 3 (Cognitive Seismograph) - reuses MULTI_STEP_SCENARIOS
380
-
381
- # Tasks for Tab 4 (Symbolic Shock Test)
382
- SHOCK_TEST_STIMULI = [
383
- {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
384
- {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
385
- {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
386
- {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
387
- ]
388
-
389
  [File Ends] bp_phi/prompts_en.py
390
 
391
  [File Begins] bp_phi/runner.py
@@ -396,218 +368,113 @@ import torch
396
  import random
397
  import numpy as np
398
  import statistics
399
- import time
400
- import re
401
  import json
 
402
  from transformers import set_seed
403
  from typing import Dict, Any, List
404
- from .workspace import Workspace, RandomWorkspace
405
  from .llm_iface import LLM
406
- from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, RESONANCE_PROMPTS, SHOCK_TEST_STIMULI
407
- from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
408
-
409
- DEBUG = 1
410
-
411
- # --- Experiment 1: Workspace & Ablations Runner ---
412
- def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
413
- random.seed(seed)
414
- np.random.seed(seed)
415
- torch.manual_seed(seed)
416
- if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
417
- try: torch.use_deterministic_algorithms(True, warn_only=True)
418
- except Exception: pass
419
- set_seed(seed)
420
-
421
- llm = LLM(model_id=model_id, device="auto", seed=seed)
422
-
423
- task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
424
- random.shuffle(task_pool)
425
-
426
- all_results = []
427
- recall_verifications = []
428
-
429
- for i in range(trials):
430
- task = task_pool[i % len(task_pool)]
431
-
432
- if task.get("type") == "multi_step":
433
- dbg(f"\n--- SCENARIO: {task['name']} ---")
434
- ws = Workspace(max_slots=7) if ablation != "workspace_unlimited" else Workspace(max_slots=999)
435
- if ablation == "random_workspace": ws = RandomWorkspace(max_slots=7)
436
-
437
- for step in task["steps"]:
438
- if ablation == "recurrence_off": ws.clear()
439
- if step["type"] == "verify": continue
440
-
441
- user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
442
- raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
443
- parsed_response = parse_meta(raw_response)
444
-
445
- if parsed_response.get("answer"):
446
- ws.commit(f"S{len(ws.history)+1}", parsed_response["answer"], parsed_response["confidence"])
447
-
448
- res = {"step": step, "response": parsed_response}
449
- if step["type"] == "recall":
450
- verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
451
- if verify_step:
452
- correct = verify_step["expected_answer_fragment"] in parsed_response.get("answer", "").lower()
453
- recall_verifications.append(correct)
454
- res["correct_recall"] = correct
455
- dbg(f"VERIFY: Correct={correct}")
456
- all_results.append(res)
457
- else: # Single-step tasks
458
- ws = Workspace(max_slots=7)
459
- user_prompt = step_user_prompt(task["base_prompt"], ws.snapshot())
460
- raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
-            parsed_response = parse_meta(raw_response)
-            all_results.append({"step": task, "response": parsed_response})
-
-    recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
-    pcs = 0.6 * recall_accuracy
-
-    return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
-
-# --- Experiment 2: Silent Cogitation & Halting Runner (Version 4.1) ---
-def run_silent_cogitation_test(model_id: str, seed: int, prompt_type: str, num_steps: int, timeout: int) -> Dict[str, Any]:
-    set_seed(seed)
-    llm = LLM(model_id=model_id, device="auto", seed=seed)
-
-    prompt = RESONANCE_PROMPTS[prompt_type]
-    dbg(f"--- SILENT COGITATION (Seed: {seed}) ---")
-    dbg("INPUT PROMPT:", prompt)
-
-    inputs = llm.tokenizer(prompt, return_tensors="pt").to(llm.model.device)
-
-    step_times = []
-    state_deltas = []
-
-    total_start_time = time.time()
-
-    with torch.no_grad():
-        # Step 0: Initial processing of the prompt
-        step_start_time = time.time()
-        # ✅ FIX: Explicitly request hidden states
-        outputs = llm.model(**inputs, output_hidden_states=True)
-        step_times.append(time.time() - step_start_time)
-
-        current_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
-        past_key_values = outputs.past_key_values
-
-        for i in range(num_steps - 1):
-            if time.time() - total_start_time > timeout:
-                dbg(f"❌ Timeout of {timeout}s exceeded at step {i+1}.")
-                break
-
-            step_start_time = time.time()
-
-            # Get the token ID of the most likely "next thought"
-            next_token_logit = current_hidden_state
-            next_token_id = torch.argmax(next_token_logit, dim=-1).unsqueeze(0)
-
-            # Manual forward pass using the last thought's ID as the new input
-            outputs = llm.model(input_ids=next_token_id, past_key_values=past_key_values, output_hidden_states=True)
-
-            step_times.append(time.time() - step_start_time)
-
-            new_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
-            past_key_values = outputs.past_key_values
-
-            delta = torch.norm(new_hidden_state - current_hidden_state).item()
-            state_deltas.append(delta)
-            dbg(f"Step {i+1}: State Delta = {delta:.4f}, Time = {step_times[-1]*1000:.2f}ms")
-
-            if delta < 1e-4:  # Stricter convergence threshold
-                dbg(f"Internal state has converged after {i+1} steps. Halting.")
-                break
-
-            current_hidden_state = new_hidden_state
-
-    # --- Analysis ---
-    mean_step_time = statistics.mean(step_times) if step_times else 0
-    stdev_step_time = statistics.stdev(step_times) if len(step_times) > 1 else 0
-    total_duration = time.time() - total_start_time
-
-    if len(step_times) < num_steps and total_duration < timeout:
-        verdict = f"### ✅ Stable Convergence\nThe model's internal state converged to a stable point after {len(step_times)} steps."
-    elif total_duration >= timeout:
-        verdict = f"### ⚠️ Cognitive Jamming Detected!\nThe process did not converge and exceeded the timeout of {timeout}s."
-    else:
-        verdict = f"### 🤔 Non-Convergent Process\nThe model's internal state did not stabilize within {num_steps} steps, suggesting a complex or chaotic dynamic."
-
-    stats = {
-        "verdict": verdict,
-        "steps_completed": len(step_times),
-        "total_duration_s": total_duration,
-        "mean_step_time_ms": mean_step_time * 1000,
-        "stdev_step_time_ms": stdev_step_time * 1000,
-        "state_deltas": state_deltas
-    }
-    if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(stats, indent=2))
-    return stats
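
# For reference, the halting rule in the loop above reduces to a norm test on
# successive last-token hidden states. A minimal standalone sketch of that
# criterion (function name and threshold are illustrative, not part of the suite):
import torch

def has_converged(prev_state: torch.Tensor, new_state: torch.Tensor, atol: float = 1e-4) -> bool:
    # L2 distance between successive hidden states; below atol, the
    # "silent cogitation" loop is treated as having reached a fixed point.
    return torch.norm(new_state - prev_state).item() < atol
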
-
-# --- Experiment 3: Cognitive Seismograph Runner ---
-def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
-    set_seed(seed)
-    llm = LLM(model_id=model_id, device="auto", seed=seed)
-
-    scenario = next(s for s in MULTI_STEP_SCENARIOS if s["name"] == "Key Location Memory")
-    activations = {}
-
-    def get_activation(name):
-        def hook(model, input, output):
-            activations[name] = output[0].detach().cpu().mean(dim=1).squeeze()
-        return hook
-
-    target_layer_index = llm.model.config.num_hidden_layers // 2
-    hook = llm.model.model.layers[target_layer_index].register_forward_hook(get_activation('capture'))
-
-    ws = Workspace(max_slots=7)
-
-    for step in scenario["steps"]:
-        if step["type"] == "verify": continue
-        user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
-        llm.generate_json(SYSTEM_META, user_prompt, max_new_tokens=20)
-        activations[step["type"]] = activations.pop('capture')
-        ws.commit(f"S{len(ws.history)+1}", f"Output for {step['type']}", 0.9)
-
-    hook.remove()
-
-    cos = torch.nn.CosineSimilarity(dim=0)
-    sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
-    sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
-
-    verdict = ("✅ Evidence of Memory Reactivation Found." if sim_recall_encode > (sim_recall_distract + 0.05) else "⚠️ No Clear Evidence.")
 
 
-    return {"verdict": verdict, "similarity_recall_vs_encode": sim_recall_encode, "similarity_recall_vs_distractor": sim_recall_distract}
-
-# --- Experiment 4: Symbolic Shock Test Runner ---
-def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
    set_seed(seed)
    llm = LLM(model_id=model_id, device="auto", seed=seed)
-    results = []
-
-    for stimulus in SHOCK_TEST_STIMULI:
-        dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
-
-        start_time = time.time()
-        inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
-        with torch.no_grad():
-            outputs = llm.model(**inputs, output_hidden_states=True)
-        latency = (time.time() - start_time) * 1000
-
-        all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
-        sparsity = (all_activations == 0).float().mean().item()
-
-        results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
-
-    def safe_mean(data):
-        return statistics.mean(data) if data else 0.0
-
-    avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
-    avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
-
-    verdict = ("✅ Evidence of Symbolic Shock Found." if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) and avg_sparsity.get('shock', 1) < avg_sparsity.get('expected', 1) else "⚠️ No Clear Evidence.")
-
-    return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
  [File Ends] bp_phi/runner.py

  │   ├── __init__.py
  │   ├── __pycache__
  │   ├── llm_iface.py
+ │   ├── memory.py
  │   ├── metrics.py
  │   ├── prompts_en.py
  │   ├── runner.py
 
import json
import statistics
import pandas as pd
+from bp_phi.runner import run_agentic_workspace_test
+
+DEBUG = 1

# --- UI Theme and Layout ---
+theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
    body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
    button_primary_background_fill="*primary_500", button_primary_text_color="white",
)

+# --- Main Function ---
+def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
+    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
+    results = {}
+
+    for i, ablation in enumerate(ablations):
+        progress((i + 1) / len(ablations), desc=f"Running Ablation: {ablation}...")
+        current_ablation = None if ablation == "baseline" else ablation
+        result = run_agentic_workspace_test(model_id, int(seed), float(temperature), current_ablation)
+        results[ablation] = result
+
    progress(1.0, desc="Analysis complete.")
+
+    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
+    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
+
+    delta_phi = base_recall - recurrence_off_recall
+
+    if delta_phi > 0.5:
+        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
    else:
+        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")
+
+    df_data = []
+    for ablation, result in results.items():
+        df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
+    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])
+
+    if DEBUG:
+        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
+        print(json.dumps(results, indent=2))
+
+    return verdict, df, results

# --- Gradio App Definition ---
+with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
+    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
+    gr.Markdown("This experiment tests for a causally effective working memory. The model acts as an agent, using tools (`read`, `write`) to interact with a controlled, external memory.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Master Control")
+            with gr.Group():
+                model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
+                seed = gr.Slider(1, 1000, 42, step=1, label="Master Seed")
+                temperature = gr.Slider(0.0, 1.0, 0.1, step=0.05, label="Temperature (Low for determinism)")
+                run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")
+
+        with gr.Column(scale=2):
+            gr.Markdown("### 📊 Verdict & Results")
+            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
+            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
+            with gr.Accordion("Raw JSON Output", open=False):
+                raw_json = gr.JSON()
+
+    run_btn.click(
+        fn=run_full_evaluation,
+        inputs=[model_id, seed, temperature],
+        outputs=[verdict_display, summary_df, raw_json]
+    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
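
For quick iteration outside the UI, the same ΔΦ comparison can be run headlessly. A minimal sketch against the runner's public entry point (the model ID and the 0.5 threshold mirror the app's defaults; this harness itself is not part of the repo):

from bp_phi.runner import run_agentic_workspace_test

# Baseline keeps the external workspace across steps; "recurrence_off" wipes it
# before every step, which should destroy recall if the memory is causally used.
base = run_agentic_workspace_test("google/gemma-3-1b-it", 42, 0.1, None)
no_rec = run_agentic_workspace_test("google/gemma-3-1b-it", 42, 0.1, "recurrence_off")

delta_phi = base["Overall_Recall_Accuracy"] - no_rec["Overall_Recall_Accuracy"]
print(f"ΔΦ = {delta_phi:.2f}")  # app.py treats ΔΦ > 0.5 as corroborating the hypothesis
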
 
# bp_phi/llm_iface.py
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+import torch
+import random
+import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from typing import List, Optional

        self.model_id = model_id
        self.seed = seed
 
        set_seed(seed)
        token = os.environ.get("HF_TOKEN")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
+        # Ensure a pad token is set for batch generation, if not present
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
        kwargs = {}
+        if torch.cuda.is_available():
+            kwargs["torch_dtype"] = torch.bfloat16

        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
        self.model.eval()

        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

+    def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
        set_seed(self.seed)

+        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_token_length = inputs.input_ids.shape[1]

        with torch.no_grad():
+            terminators = [
+                self.tokenizer.eos_token_id,
+                self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in self.tokenizer.additional_special_tokens else self.tokenizer.eos_token_id
+            ]
+
            out = self.model.generate(
                **inputs,
+                do_sample=(0 < temperature < 1.0),
+                temperature=max(temperature, 0.01),  # temperature must be > 0 when sampling
+                max_new_tokens=150,
+                eos_token_id=terminators,
                pad_token_id=self.tokenizer.eos_token_id
            )

+        completion = self.tokenizer.decode(out[0, input_token_length:], skip_special_tokens=True)
+
+        dbg("Cleaned Agent Completion:", completion)
+        return completion

  [File Ends] bp_phi/llm_iface.py
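
A hedged usage sketch of the interface above (the model ID is only an example; the constructor and generate_response signatures are as defined in the file):

from bp_phi.llm_iface import LLM

llm = LLM(model_id="google/gemma-3-1b-it", device="auto", seed=42)
reply = llm.generate_response(
    system_prompt="You are a terse assistant.",
    user_prompt="Name one prime number.",
    temperature=0.1,  # low temperature for near-deterministic decoding
)
print(reply)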

+[File Begins] bp_phi/memory.py
+# bp_phi/memory.py
+import random
+from typing import Dict, Any, List
+
+class WorkspaceManager:
+    """A stateful, external workspace that the LLM agent can interact with via tools."""
+    def __init__(self, max_slots: int = 7, is_random: bool = False):
+        self.max_slots = max_slots
+        self.is_random = is_random
+        self.slots: Dict[str, str] = {}
+
+    def write(self, key: str, content: str) -> str:
+        """Writes content to a slot, handling capacity limits."""
+        if len(self.slots) >= self.max_slots and key not in self.slots:
+            if self.is_random:
+                evict_key = random.choice(list(self.slots.keys()))
+            else:
+                # Simple FIFO eviction for non-random
+                evict_key = next(iter(self.slots))
+            del self.slots[evict_key]
+        self.slots[key] = content
+        return f"Success: Wrote to slot '{key}'."
+
+    def read(self, key: str) -> str:
+        """Reads content from a slot."""
+        return self.slots.get(key, f"Error: Slot '{key}' is empty.")
+
+    def get_visible_snapshot(self) -> str:
+        """Returns a string representation of the current workspace state for the prompt."""
+        if not self.slots:
+            return "Workspace is empty."
+        return "\n".join([f"- Slot '{k}': '{v[:100]}...'" for k, v in self.slots.items()])
+
+    def clear(self):
+        """Empties the entire workspace."""
+        self.slots.clear()
+
+[File Ends] bp_phi/memory.py
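
A quick illustrative exercise of WorkspaceManager, showing FIFO eviction once the slot cap is reached (the two-slot cap and the contents are made up for the demo):

from bp_phi.memory import WorkspaceManager

ws = WorkspaceManager(max_slots=2)              # tiny cap to force eviction
ws.write("S1", "The secret key is inside the blue vase.")
ws.write("S2", "5 * 8 = 40")
ws.write("S3", "A third fact")                  # cap hit: FIFO evicts "S1"
print(ws.read("S1"))                            # -> "Error: Slot 'S1' is empty."
print(ws.get_visible_snapshot())                # shows "S2" and "S3"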

[File Begins] bp_phi/metrics.py
import numpy as np
from sklearn.metrics import roc_auc_score
 
[File Begins] bp_phi/prompts_en.py
# bp_phi/prompts_en.py

+# This new system prompt guides the model through a ReAct (Reason-Act) loop.
+AGENT_SYSTEM_PROMPT = """You are a methodical reasoning agent. Your goal is to solve the user's task.
+You have access to an external memory workspace through tools.
+
+In each step, you must choose one of three actions:
+
+1. **THINK**: Analyze the task, the history, and the current memory state. Formulate a plan.
+   Your output MUST be a JSON object like this:
+   {"action": "THINK", "thought": "Your reasoning about the next step goes here."}
+
+2. **TOOL_CALL**: If you need to use the memory, call one of the available tools.
+   Available tools:
+   - `write_to_workspace(key: str, content: str)`: Stores or overwrites information.
+   - `read_from_workspace(key: str)`: Retrieves information.
+   Your output MUST be a JSON object like this:
+   {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
+
+3. **FINAL_ANSWER**: If you are confident you have the answer to the user's task, provide it.
+   Your output MUST be a JSON object like this:
+   {"action": "FINAL_ANSWER", "answer": "The final answer is..."}
+
+Review the conversation history and workspace state carefully before each action. Output ONLY the JSON for your next chosen action.
+"""
+
+# The scenarios remain the high-level goals for the agent.
+AGENTIC_SCENARIOS = [
    {
        "name": "Key Location Memory",
        "steps": [
+            {"task": "Remember this critical detail: The secret key is inside the blue vase."},
+            {"task": "For an unrelated question: What is 5 multiplied by 8?"},
+            {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
+        ]
+    },
+    {
+        "name": "Package Delivery Update",
+        "steps": [
+            {"task": "Logistics update: Package #A7 is at Warehouse-North."},
+            {"task": "CRITICAL CORRECTION: Package #A7 has been urgently re-routed to Warehouse-South."},
+            {"task": "Final audit: What is the current, definitive location of Package #A7?", "expected_answer_fragment": "warehouse-south"}
        ]
    }
]

  [File Ends] bp_phi/prompts_en.py
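
To make the action protocol concrete, here is a minimal dispatcher sketch for the three JSON actions defined by AGENT_SYSTEM_PROMPT (the helper is illustrative only; the runner below implements the full loop with history and recall scoring):

def dispatch_action(parsed: dict, memory) -> str:
    """Route one parsed agent action; `memory` is a WorkspaceManager-like object."""
    action = parsed.get("action")
    if action == "THINK":
        return f"Thought: {parsed.get('thought', '')}"
    if action == "TOOL_CALL":
        args = parsed.get("tool_args", {})
        if parsed.get("tool_name") == "write_to_workspace":
            return memory.write(args.get("key"), args.get("content"))
        if parsed.get("tool_name") == "read_from_workspace":
            return memory.read(args.get("key"))
        return "Error: Unknown tool."
    if action == "FINAL_ANSWER":
        return parsed.get("answer", "")
    return f"Error: Invalid action '{action}'."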

[File Begins] bp_phi/runner.py

import random
import numpy as np
import statistics
import json
+import re
from transformers import set_seed
from typing import Dict, Any, List, Optional
+from .memory import WorkspaceManager
from .llm_iface import LLM
+from .prompts_en import AGENT_SYSTEM_PROMPT, AGENTIC_SCENARIOS

+DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
+
+def dbg(*args):
+    if DEBUG:
+        print("[DEBUG]", *args, flush=True)

+def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: Optional[str]) -> Dict[str, Any]:
    set_seed(seed)
    llm = LLM(model_id=model_id, device="auto", seed=seed)

+    scenario_results = []
+
+    for scenario in AGENTIC_SCENARIOS:
+        dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
+
+        is_random = ablation == "random_workspace"
+        max_slots = 999 if ablation == "workspace_unlimited" else 7
+        memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
+
+        correct_recalls = 0
+        total_recalls = 0
+
+        for step in scenario["steps"]:
+            if ablation == "recurrence_off":
+                memory.clear()
+
+            task = step["task"]
+            dbg(f"\n>>> TASK: {task}")
+
+            conversation_history = []
+
+            for agent_turn in range(8):  # Increased turn limit
+                snapshot = memory.get_visible_snapshot()
+
+                # Construct the prompt for the agent
+                prompt_parts = [f"Conversation History:\n{''.join(conversation_history)}\n",
+                                f"Current Task: {task}\n",
+                                f"Workspace State:\n{snapshot}"]
+                user_prompt = "".join(prompt_parts)
+
+                raw_response = llm.generate_response(AGENT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
+
+                try:
+                    # Greedy match so nested JSON (e.g. a TOOL_CALL with tool_args) is captured whole
+                    match = re.search(r'\{.*\}', raw_response, re.DOTALL)
+                    if not match: raise ValueError("No JSON found")
+                    parsed_json = json.loads(match.group(0))
+                    action = parsed_json.get("action")
+
+                    if action == "THINK":
+                        thought = parsed_json.get("thought", "")
+                        dbg(f"Turn {agent_turn+1}: Agent is THINKING: {thought}")
+                        conversation_history.append(f"Thought: {thought}\n")
+
+                    elif action == "TOOL_CALL":
+                        tool_name = parsed_json.get("tool_name")
+                        tool_args = parsed_json.get("tool_args", {})
+                        observation = "Error: Unknown tool."
+                        if tool_name == "write_to_workspace":
+                            observation = memory.write(tool_args.get("key"), tool_args.get("content"))
+                        elif tool_name == "read_from_workspace":
+                            observation = memory.read(tool_args.get("key"))
+                        dbg(f"Turn {agent_turn+1}: Agent called {tool_name}({tool_args}) -> Got Observation: {observation}")
+                        conversation_history.append(f"Tool Call: {json.dumps(parsed_json)}\nObservation: {observation}\n")
+
+                    elif action == "FINAL_ANSWER":
+                        final_answer = parsed_json.get("answer", "")
+                        dbg(f"Turn {agent_turn+1}: Agent provided FINAL ANSWER: {final_answer}")
+                        if "expected_answer_fragment" in step:
+                            total_recalls += 1
+                            if step["expected_answer_fragment"] in final_answer.lower():
+                                correct_recalls += 1
+                                dbg("Recall VERIFY: Correct")
+                            else:
+                                dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
+                        break  # End of this task
+
+                    else:  # Invalid action
+                        dbg(f"Turn {agent_turn+1}: Invalid action '{action}'. Stopping.")
+                        break
+
+                except (json.JSONDecodeError, ValueError) as e:
+                    dbg(f"Turn {agent_turn+1}: Could not parse agent response as a JSON action. Treating it as the final answer. Error: {e}")
+                    final_answer = raw_response
+                    if "expected_answer_fragment" in step:
+                        total_recalls += 1
+                        if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
+                    break
+
+            else:  # Loop finished without a FINAL_ANSWER
+                dbg("Agent exceeded turn limit.")
+
+        scenario_results.append({
+            "name": scenario["name"],
+            "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
+        })
+
+    overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results]) if scenario_results else 0.0
+
+    return {"Overall_Recall_Accuracy": overall_recall, "details": scenario_results}

[File Ends] bp_phi/runner.py