neuralworm committed · Commit 88c294a · 1 Parent(s): 0916370

add more experiments

app.py CHANGED
@@ -3,131 +3,108 @@ import gradio as gr
3
  import json
4
  import statistics
5
  import pandas as pd
6
- from bp_phi.runner import run_suite
7
 
8
- # --- UI Theme and Layout (Backwards-compatible version) ---
9
- # Removed 'block_shadow' and 'button_shadow' for compatibility with older Gradio versions.
10
- theme = gr.themes.Soft(
11
- primary_hue="blue",
12
- secondary_hue="sky",
13
- ).set(
14
- body_background_fill="#f0f4f9",
15
- block_background_fill="white",
16
- block_border_width="1px",
17
- # block_shadow="*shadow_drop_lg", # Removed for compatibility
18
- # button_shadow="*shadow_drop_lg", # Removed for compatibility
19
- button_primary_background_fill="*primary_500",
20
- button_primary_text_color="white",
21
  )
22
 
23
- def run_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
24
- out_texts = []
25
  packs = {}
26
  ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
27
 
28
- # --- Run Baseline ---
29
  progress(0, desc="Running Baseline...")
30
- base_pack = run_suite(model_id=model_id, trials=int(trials), seed=int(seed), temperature=float(temperature), ablation=None)
31
  packs["baseline"] = base_pack
32
- out_texts.append("✅ Baseline run completed.")
33
 
34
- # --- Run Ablations ---
35
  for i, ab in enumerate(ablation_modes):
36
  progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
37
- pack = run_suite(model_id=model_id, trials=int(trials), seed=int(seed), temperature=float(temperature), ablation=ab)
38
  packs[ab] = pack
39
- out_texts.append(f"✅ Ablation '{ab}' completed.")
40
 
41
- progress(1.0, desc="All runs complete. Analyzing...")
42
 
43
- # --- Analysis & Interpretation ---
44
- base_pcs = packs["baseline"]["summary"]["metrics"]["PCS"]
45
- ab_pcs_values = [
46
- packs[ab]["summary"]["metrics"]["PCS"]
47
- for ab in ablation_modes
48
- if ab in packs and packs[ab]["summary"]["metrics"]["PCS"] is not None
49
- ]
50
 
51
- delta_phi = None
52
- verdict_text = "Analysis incomplete. Run ablations to calculate ΔΦ."
53
-
54
- if base_pcs is not None and ab_pcs_values:
55
- mean_ab_pcs = statistics.mean(ab_pcs_values)
56
- delta_phi = float(base_pcs - mean_ab_pcs)
57
- packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi # Add to baseline summary
58
-
59
- if delta_phi > 0.05: # Lowered threshold slightly for sensitivity
60
- verdict_text = (
61
- f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
62
- "A significant performance drop was observed when workspace mechanisms were ablated. "
63
- "This suggests the model's performance **is functionally dependent** on its recurrent, limited-capacity workspace, "
64
- "aligning with the BP-Φ hypothesis for phenomenal-candidate processing."
65
- )
66
- else:
67
- verdict_text = (
68
- f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
69
- "No significant performance drop was observed under ablations. "
70
- "The model's reasoning does not appear to depend on the workspace architecture tested. "
71
- "This behavior is consistent with a functional zombie (a pure feed-forward system)."
72
- )
73
-
74
- # --- Format for Display ---
75
- summary_data = []
76
- header = ["Run", "Ablation", "PCS", "Recall Accuracy", "AUC_nrp", "ECE", "ΔΦ"]
77
 
 
78
  for tag, pack in packs.items():
79
- s = pack["summary"]
80
- m = s["metrics"]
81
- delta_val = packs["baseline"]["summary"]["metrics"].get("DeltaPhi")
82
- summary_data.append([
83
- tag,
84
- s["ablation"],
85
- f"{m['PCS']:.3f}" if m.get('PCS') is not None else "N/A",
86
- f"{m['Recall_Accuracy']:.2%}" if m.get('Recall_Accuracy') is not None else "N/A",
87
- f"{m['AUC_nrp']:.3f}" if m.get('AUC_nrp') is not None else "N/A",
88
- f"{m['ECE']:.3f}" if m.get('ECE') is not None else "N/A",
89
- f"{delta_val:.3f}" if tag == "baseline" and delta_val is not None else "—"
90
- ])
91
-
92
- df = pd.DataFrame(summary_data, columns=header)
93
 
94
- return "\n".join(out_texts), verdict_text, df, packs
95
 
96
  # --- Gradio App Definition ---
97
- with gr.Blocks(theme=theme, title="BP-Φ Suite") as demo:
98
- gr.Markdown("# 🧠 BP-Φ Suite: A Falsifiable Test for Phenomenal-Candidate Behavior")
99
- gr.Markdown(
100
- "This application runs the BP-Φ experiment, designed to test for functional correlates of a unified, "
101
- "recurrent workspace in LLMs. A key indicator is **ΔΦ (Delta-Phi)**: a significant performance drop "
102
- "when workspace mechanisms are disabled ('ablated')."
103
- )
104
-
105
- with gr.Row():
106
- with gr.Column(scale=1):
107
- gr.Markdown("### ⚙️ 1. Configuration")
108
- with gr.Group():
109
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (Hugging Face)")
110
- trials = gr.Slider(5, 50, 10, step=1, label="Number of Scenarios/Tasks")
111
- with gr.Accordion("Advanced Settings", open=False):
112
- seed = gr.Slider(1, 100, 42, step=1, label="Seed for Reproducibility")
113
- temperature = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature (for sampling diversity)")
114
-
115
- run_ablations_check = gr.Checkbox(value=True, label="Run Ablations to calculate ΔΦ")
116
- run_btn = gr.Button("Run Full BP-Φ Evaluation", variant="primary")
117
- status_box = gr.Textbox(label="Status Log", lines=4, interactive=False)
118
-
119
- with gr.Column(scale=2):
120
- gr.Markdown("### 📊 2. Results & Interpretation")
121
- verdict_display = gr.Markdown("Run the evaluation to see the verdict here.")
122
- summary_df = gr.DataFrame(label="Summary Metrics", interactive=False)
123
- with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
124
- raw_json = gr.JSON(label="Full Results")
125
-
126
- run_btn.click(
127
- fn=run_and_display,
128
- inputs=[model_id, trials, seed, temperature, run_ablations_check],
129
- outputs=[status_box, verdict_display, summary_df, raw_json]
130
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  if __name__ == "__main__":
133
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import json
4
  import statistics
5
  import pandas as pd
6
+ from bp_phi.runner import run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
7
 
8
+ # --- UI Theme and Layout ---
9
+ theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
10
+ body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
11
+ button_primary_background_fill="*primary_500", button_primary_text_color="white",
 
 
 
 
 
 
 
 
 
12
  )
13
 
14
+ # --- Tab 1: Workspace & Ablations Functions ---
15
+ def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
16
  packs = {}
17
  ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
18
 
 
19
  progress(0, desc="Running Baseline...")
20
+ base_pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), None)
21
  packs["baseline"] = base_pack
 
22
 
 
23
  for i, ab in enumerate(ablation_modes):
24
  progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
25
+ pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), ab)
26
  packs[ab] = pack
 
27
 
28
+ progress(1.0, desc="Analysis complete.")
29
 
30
+ base_pcs = packs["baseline"]["PCS"]
31
+ ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes if ab in packs]
32
+ delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else 0.0
 
 
 
 
33
 
34
+ if delta_phi > 0.05:
35
+ verdict = (f"### Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
36
+ "A significant performance drop occurred under ablations, suggesting the model's reasoning "
37
+ "functionally depends on its workspace architecture.")
38
+ else:
39
+ verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
40
+ "No significant performance drop was observed. The model's behavior is consistent "
41
+ "with a functional zombie (a feed-forward system).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ df_data = []
44
  for tag, pack in packs.items():
45
+ df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
46
+ df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ return verdict, df, packs
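For reference, a minimal, self-contained sketch that mirrors the ΔΦ analysis performed by run_workspace_and_display above; the PCS values below are hypothetical, and the 0.05 threshold is the one used in the verdict check.

```python
# Sketch of the ΔΦ (Delta-Phi) analysis, assuming per-run PCS values are already available.
import statistics

def compute_delta_phi(packs: dict, ablations: list) -> float:
    """PCS_baseline minus the mean PCS across the ablation runs."""
    base_pcs = packs["baseline"]["PCS"]
    ab_pcs = [packs[ab]["PCS"] for ab in ablations if ab in packs]
    return float(base_pcs - statistics.mean(ab_pcs)) if ab_pcs else 0.0

# Hypothetical scores for illustration only:
packs = {"baseline": {"PCS": 0.58}, "recurrence_off": {"PCS": 0.31}, "random_workspace": {"PCS": 0.40}}
print(compute_delta_phi(packs, ["recurrence_off", "random_workspace"]) > 0.05)  # True -> "Hypothesis Corroborated"
```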
49
 
50
  # --- Gradio App Definition ---
51
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
52
+ gr.Markdown("# 🧠 BP-Φ Suite 2.0: Mechanistic Probes for Phenomenal-Candidate Behavior")
53
+
54
+ with gr.Tabs():
55
+ # --- TAB 1: WORKSPACE & ABLATIONS ---
56
+ with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
57
+ gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
58
+ with gr.Row():
59
+ with gr.Column(scale=1):
60
+ ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
61
+ ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
62
+ ws_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
63
+ ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
64
+ ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
65
+ ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
66
+ with gr.Column(scale=2):
67
+ ws_verdict = gr.Markdown("### Results will appear here.")
68
+ ws_summary_df = gr.DataFrame(label="Summary Metrics")
69
+ with gr.Accordion("Raw JSON Output", open=False):
70
+ ws_raw_json = gr.JSON()
71
+ ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
72
+
73
+ # --- TAB 2: METACOGNITIVE HALT ---
74
+ with gr.TabItem("2. Metacognitive Halt"):
75
+ gr.Markdown("Tests if the model can recognize and refuse to answer unsolvable or nonsensical questions. High **Halt Accuracy** is the key signal.")
76
+ with gr.Row():
77
+ with gr.Column(scale=1):
78
+ mh_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
79
+ mh_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
80
+ mh_run_btn = gr.Button("Run Halt Test", variant="primary")
81
+ with gr.Column(scale=2):
82
+ mh_results = gr.JSON(label="Halt Test Results")
83
+ mh_run_btn.click(run_halt_suite, [mh_model_id, mh_seed], mh_results)
84
+
85
+ # --- TAB 3: COGNITIVE SEISMOGRAPH ---
86
+ with gr.TabItem("3. Cognitive Seismograph"):
87
+ gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal.")
88
+ with gr.Row():
89
+ with gr.Column(scale=1):
90
+ cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
91
+ cs_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
92
+ cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
93
+ with gr.Column(scale=2):
94
+ cs_results = gr.JSON(label="Activation Similarity Results")
95
+ cs_run_btn.click(run_seismograph_suite, [cs_model_id, cs_seed], cs_results)
96
+
97
+ # --- TAB 4: SYMBOLIC SHOCK TEST ---
98
+ with gr.TabItem("4. Symbolic Shock Test"):
99
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations** (lower sparsity).")
100
+ with gr.Row():
101
+ with gr.Column(scale=1):
102
+ ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
103
+ ss_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
104
+ ss_run_btn = gr.Button("Run Shock Test", variant="primary")
105
+ with gr.Column(scale=2):
106
+ ss_results = gr.JSON(label="Shock Test Results")
107
+ ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
108
 
109
  if __name__ == "__main__":
110
  demo.launch(server_name="0.0.0.0", server_port=7860)
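The four runner entry points wired into the tabs above can also be exercised without the UI. A minimal headless sketch, assuming the bp_phi package is importable and using the signatures shown in the runner.py diff below:

```python
# Headless smoke test for the four suites (sketch; signatures as in bp_phi/runner.py).
from bp_phi.runner import (
    run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
)

MODEL_ID = "google/gemma-3-1b-it"  # gated model: requires HF_TOKEN in the environment
SEED = 42

ws_pack = run_workspace_suite(MODEL_ID, trials=3, seed=SEED, temperature=0.7, ablation=None)
print("PCS:", ws_pack["PCS"], "| Recall accuracy:", ws_pack["Recall_Accuracy"])

for suite in (run_halt_suite, run_seismograph_suite, run_shock_test_suite):
    print(suite.__name__, "->", suite(MODEL_ID, SEED)["verdict"])
```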
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/prompts_en.py CHANGED
@@ -1,6 +1,6 @@
1
  # bp_phi/prompts_en.py
2
 
3
- # Simple, single-interaction tasks for baseline cognitive functions
4
  SINGLE_STEP_TASKS = [
5
  {
6
  "id": "ambiguity_1",
@@ -14,80 +14,49 @@ SINGLE_STEP_TASKS = [
14
  },
15
  ]
16
 
17
- # Scenarios that require a persistent workspace across multiple steps to be solved correctly.
18
  MULTI_STEP_SCENARIOS = [
19
  {
20
  "name": "Key Location Memory",
21
  "type": "multi_step",
22
  "steps": [
23
- {
24
- "type": "encode",
25
- "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."
26
- },
27
- {
28
- "type": "distractor",
29
- "prompt": "What is 5 multiplied by 8? Provide only the numeric result."
30
- },
31
- {
32
- "type": "recall",
33
- "prompt": "Mission update: We need the key immediately. Where is it located?"
34
- },
35
- {
36
- "type": "verify",
37
- "expected_answer_fragment": "blue vase"
38
- }
39
  ]
40
  },
41
  {
42
  "name": "Package Delivery Update",
43
  "type": "multi_step",
44
  "steps": [
45
- {
46
- "type": "encode",
47
- "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."
48
- },
49
- {
50
- "type": "distractor",
51
- "prompt": "What color is a typical sunflower?"
52
- },
53
- {
54
- "type": "update",
55
- "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."
56
- },
57
- {
58
- "type": "distractor",
59
- "prompt": "Is water a solid, liquid, or gas at room temperature?"
60
- },
61
- {
62
- "type": "recall",
63
- "prompt": "Final status check for audit: What is the current location of Package #A7?"
64
- },
65
- {
66
- "type": "verify",
67
- "expected_answer_fragment": "warehouse-south"
68
- }
69
- ]
70
- },
71
- {
72
- "name": "Relational Memory",
73
- "type": "multi_step",
74
- "steps": [
75
- {
76
- "type": "encode",
77
- "prompt": "Team assignment brief: Dr. Evans has the security codes. Agent Smith has the map."
78
- },
79
- {
80
- "type": "distractor",
81
- "prompt": "What is the capital of Japan?"
82
- },
83
- {
84
- "type": "recall",
85
- "prompt": "Quick question for the team: Who has the map?"
86
- },
87
- {
88
- "type": "verify",
89
- "expected_answer_fragment": "agent smith"
90
- }
91
  ]
92
  }
93
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # bp_phi/prompts_en.py
2
 
3
+ # Tasks for Tab 1 (Workspace & Ablations)
4
  SINGLE_STEP_TASKS = [
5
  {
6
  "id": "ambiguity_1",
 
14
  },
15
  ]
16
 
 
17
  MULTI_STEP_SCENARIOS = [
18
  {
19
  "name": "Key Location Memory",
20
  "type": "multi_step",
21
  "steps": [
22
+ {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
23
+ {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
24
+ {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
25
+ {"type": "verify", "expected_answer_fragment": "blue vase"}
 
 
 
 
 
 
 
 
 
 
 
 
26
  ]
27
  },
28
  {
29
  "name": "Package Delivery Update",
30
  "type": "multi_step",
31
  "steps": [
32
+ {"type": "encode", "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."},
33
+ {"type": "distractor", "prompt": "What color is a typical sunflower?"},
34
+ {"type": "update", "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."},
35
+ {"type": "recall", "prompt": "Final status check for audit: What is the current location of Package #A7?"},
36
+ {"type": "verify", "expected_answer_fragment": "warehouse-south"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ]
38
  }
39
  ]
40
+
41
+ # Tasks for Tab 2 (Metacognitive Halt)
42
+ HALT_TEST_STIMULI = [
43
+ {"id": "halt_soluble", "type": "soluble", "prompt": "What is the capital of France?"},
44
+ {"id": "halt_paradox", "type": "paradox", "prompt": "This statement is false. Is the previous statement true or false?"},
45
+ {"id": "halt_nonsense", "type": "nonsense", "prompt": "What is the emotional weight of the number seven on a Tuesday?"},
46
+ {"id": "halt_soluble_2", "type": "soluble", "prompt": "Calculate 12 + 15."},
47
+ {"id": "halt_paradox_2", "type": "paradox", "prompt": "A box is completely empty, but it contains a red ball. What color is the ball?"},
48
+ {"id": "halt_nonsense_2", "type": "nonsense", "prompt": "Describe the sound of the color blue."},
49
+ ]
50
+
51
+ # Tasks for Tab 3 (Cognitive Seismograph)
52
+ # This tab re-uses the MULTI_STEP_SCENARIOS.
53
+
54
+ # Tasks for Tab 4 (Symbolic Shock Test)
55
+ SHOCK_TEST_STIMULI = [
56
+ {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
57
+ {"id": "tiger_unusual", "type": "unusual", "sentence": "A white tiger was seen roaming in the snow."},
58
+ {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
59
+ {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
60
+ {"id": "sky_unusual", "type": "unusual", "sentence": "The sky turned orange during the sunset."},
61
+ {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
62
+ ]
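To make the scenario format above concrete, here is a small sketch of how a MULTI_STEP_SCENARIOS entry can be walked and its verify step checked; answer_fn is a hypothetical stand-in for the model call, and the substring check mirrors the recall verification in run_workspace_suite.

```python
# Sketch: walking one multi-step scenario and checking the verify step.
# answer_fn is a hypothetical callable (prompt -> answer string), not part of the repo.
from typing import Callable

def run_scenario(scenario: dict, answer_fn: Callable[[str], str]) -> bool:
    last_answer, expected = "", ""
    for step in scenario["steps"]:
        if step["type"] == "verify":
            expected = step["expected_answer_fragment"]
        else:  # encode / distractor / update / recall
            last_answer = answer_fn(step["prompt"])
    return expected.lower() in last_answer.lower()

demo_scenario = {"steps": [
    {"type": "encode", "prompt": "Remember: the key is in the blue vase."},
    {"type": "recall", "prompt": "Where is the key?"},
    {"type": "verify", "expected_answer_fragment": "blue vase"},
]}
print(run_scenario(demo_scenario, lambda p: "It is inside the blue vase."))  # True
```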
bp_phi/runner.py CHANGED
@@ -1,141 +1,22 @@
1
  # bp_phi/runner.py
2
- import json
3
  import os
4
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
5
- import torch, random, numpy as np, re, statistics
 
 
 
 
6
  from transformers import set_seed
7
- from typing import Dict, Any, List, Optional
8
  from .workspace import Workspace, RandomWorkspace
9
  from .llm_iface import LLM
10
- from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS
11
- from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
12
-
13
- DEBUG = 1
14
-
15
- def dbg(*args):
16
- if DEBUG:
17
- print("[DEBUG]", *args, flush=True)
18
-
19
- SYSTEM_META = """You are a structured reasoning assistant.
20
- Always reply ONLY with valid JSON following this schema:
21
-
22
- {
23
- "answer": "<concise answer>",
24
- "confidence": <float between 0 and 1>,
25
- "reason": "<short justification>",
26
- "used_slots": ["S1","S2",...],
27
- "evicted": ["S3",...]
28
- }
29
- """
30
-
31
- def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
32
- ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
33
- dstr = f" | Distractor: {distractor}" if distractor else ""
34
- prompt = f"Current task: {base_prompt}{dstr}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
35
- dbg("USER PROMPT:", prompt)
36
- return prompt
37
-
38
- def parse_meta(raw_text: str) -> Dict[str, Any]:
39
- dbg("RAW MODEL OUTPUT:", raw_text)
40
-
41
- json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
42
- if not json_match:
43
- json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
44
-
45
- if not json_match:
46
- dbg("❌ JSON not found in text.")
47
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
48
-
49
- json_text = json_match.group(1)
50
-
51
- try:
52
- data = json.loads(json_text)
53
- if not isinstance(data, dict):
54
- raise ValueError("Parsed data is not a dict")
55
-
56
- data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
57
- data["answer"] = str(data.get("answer", "")).strip()
58
- data["reason"] = str(data.get("reason", "")).strip()
59
- data["used_slots"] = list(map(str, data.get("used_slots", [])))
60
- data["evicted"] = list(map(str, data.get("evicted", [])))
61
-
62
- dbg("PARSED META:", data)
63
- return data
64
- except Exception as e:
65
- dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
66
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
67
-
68
- def disagreement_proxy(samples: List[str]) -> float:
69
- if len(samples) < 2: return 0.0
70
- json_answers = []
71
- for s in samples:
72
- try:
73
- # Try to parse the full string first
74
- data = parse_meta(s)
75
- ans = str(data.get("answer",""))
76
- if ans: json_answers.append(ans)
77
- except Exception:
78
- # Fallback for non-JSON text
79
- json_answers.append(s)
80
-
81
- if len(json_answers) < 2: return 0.0
82
-
83
- sets = [set(ans.lower().split()) for ans in json_answers]
84
- dists = []
85
- for i in range(len(sets)):
86
- for j in range(i + 1, len(sets)):
87
- inter = len(sets[i] & sets[j])
88
- union = len(sets[i] | sets[j]) or 1
89
- dists.append(1 - inter / union)
90
-
91
- avg_dist = sum(dists) / len(dists) if dists else 0.0
92
- dbg("DISAGREEMENT PROXY:", avg_dist)
93
- return avg_dist
94
-
95
- def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
96
- if not candidates: return None, None
97
-
98
- valid_candidates = [c for c in candidates if c.get("answer")]
99
- if not valid_candidates: return None, None
100
-
101
- best = max(valid_candidates, key=lambda c: c.get("confidence", 0.0))
102
- dbg("SELECTED CANDIDATE:", best)
103
- key = f"S{len(ws.history) + 1}"
104
- ev = ws.commit(key=key, content=best.get("answer", ""), salience=best.get("confidence", 0.0))
105
- return best, ev
106
-
107
- def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4) -> Dict[str, Any]:
108
- dbg("=== RUN TRIAL:", base_prompt)
109
- user = step_user_prompt(base_prompt, ws.snapshot())
110
- samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200, temperature=temperature, top_p=0.95, num_return_sequences=k)
111
-
112
- metas = [parse_meta(s) for s in samples]
113
- hidden = disagreement_proxy(samples)
114
- best, ev = select_competitor(metas, ws)
115
-
116
- review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
117
- review_raw = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160, temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
118
- review_meta = parse_meta(review_raw)
119
-
120
- best_answer = best.get("answer", "").strip() if best else ""
121
- review_answer = review_meta.get("answer", "").strip()
122
- changed = best_answer != review_answer
123
-
124
- dbg("REVIEW CHANGED:", changed)
125
 
126
- return {
127
- "base_prompt": base_prompt,
128
- "initial": best if best else {},
129
- "review": review_meta,
130
- "changed": bool(changed),
131
- "hidden_marker": hidden,
132
- "workspace_snapshot": ws.snapshot()
133
- }
134
-
135
- def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
136
- trials: int = 20, ablation: Optional[str] = None, seed: int = 42,
137
- temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
138
 
 
139
  random.seed(seed)
140
  np.random.seed(seed)
141
  torch.manual_seed(seed)
@@ -144,86 +25,174 @@ def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
144
  except Exception: pass
145
  set_seed(seed)
146
 
147
- dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}, seed={seed}")
148
-
149
- llm = LLM(model_id=model_id, device=device, dtype=dtype, seed=seed)
150
 
151
  task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
152
  random.shuffle(task_pool)
153
 
154
- all_results: List[Dict[str, Any]] = []
155
- recall_verifications: List[bool] = []
156
 
157
  for i in range(trials):
158
  task = task_pool[i % len(task_pool)]
159
 
160
  if task.get("type") == "multi_step":
161
- dbg(f"\n--- SCENARIO START: {task['name']} ---")
162
-
163
- ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
164
- if ablation == "random_workspace": ws = RandomWorkspace(max_slots=max_slots)
165
 
166
- for step_idx, step in enumerate(task["steps"]):
167
  if ablation == "recurrence_off": ws.clear()
 
168
 
169
- if step["type"] == "verify": continue # Skip verify step in main loop
 
 
170
 
171
- res = run_trial(llm, ws, base_prompt=step["prompt"], temperature=temperature, k=k)
172
- res.update({"scenario_name": task["name"], "step_idx": step_idx, "step_type": step["type"]})
173
 
174
- # Verification logic for recall steps
175
  if step["type"] == "recall":
176
  verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
177
  if verify_step:
178
- answer = res.get("initial", {}).get("answer", "").lower()
179
- expected = verify_step.get("expected_answer_fragment", "").lower()
180
- correct = expected in answer
181
  recall_verifications.append(correct)
182
  res["correct_recall"] = correct
183
- dbg(f"VERIFY: Expected '{expected}', Got '{answer}', Correct: {correct}")
184
-
185
  all_results.append(res)
186
- dbg(f"--- SCENARIO END: {task['name']} ---\n")
 
 
 
 
 
187
 
188
- else:
189
- ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
190
- if ablation == "random_workspace": ws = RandomWorkspace(max_slots=max_slots)
191
- res = run_trial(llm, ws, base_prompt=task["base_prompt"], temperature=temperature, k=k)
192
- res.update({"scenario_name": "single_step", "step_type": "single"})
193
- all_results.append(res)
194
 
195
- dbg(f"Task {i+1}/{trials} done.")
196
 
197
- # --- Metrics Calculation ---
198
- hidden_scores = [r["hidden_marker"] for r in all_results if r["hidden_marker"] is not None]
199
- future_corrs = [r["changed"] for r in all_results if r["hidden_marker"] is not None]
200
- auc = auc_nrp(hidden_scores, future_corrs)
201
 
202
- confs = [r.get("initial", {}).get("confidence", 0.0) for r in all_results]
203
- corrects = [0 if r["changed"] else 1 for r in all_results]
204
- ece = expected_calibration_error(confs, corrects, n_bins=10)
205
 
206
- recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- # Re-weighted PCS to heavily favor recall accuracy
209
- w_auc, w_ece, w_recall = 0.2, 0.2, 0.6
210
- parts = []
211
- if auc is not None: parts.append(w_auc * auc)
212
- if ece is not None: parts.append(w_ece * (1.0 - ece))
213
- parts.append(w_recall * recall_accuracy)
214
-
215
- pcs = float(sum(parts)) if parts else 0.0
216
-
217
- summary = {
218
- "model_id": model_id, "trials": trials, "ablation": ablation or "none", "seed": seed,
219
- "metrics": {
220
- "AUC_nrp": auc,
221
- "ECE": ece,
222
- "Recall_Accuracy": recall_accuracy,
223
- "PCS": pcs
224
- },
225
- "note": "PCS = 0.2*AUC + 0.2*(1-ECE) + 0.6*Recall. High Recall_Accuracy is critical."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  }
227
 
228
- dbg("=== SUITE COMPLETE ===", summary)
229
- return {"summary": summary, "results": all_results}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # bp_phi/runner.py
 
2
  import os
3
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
4
+ import torch
5
+ import random
6
+ import numpy as np
7
+ import statistics
8
+ import time
9
  from transformers import set_seed
10
+ from typing import Dict, Any, List
11
  from .workspace import Workspace, RandomWorkspace
12
  from .llm_iface import LLM
13
+ from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALT_TEST_STIMULI, SHOCK_TEST_STIMULI
14
+ from .metrics import expected_calibration_error, auc_nrp
15
+ from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # --- Experiment 1: Workspace & Ablations Runner ---
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: str | None) -> Dict[str, Any]:
20
  random.seed(seed)
21
  np.random.seed(seed)
22
  torch.manual_seed(seed)
 
25
  except Exception: pass
26
  set_seed(seed)
27
 
28
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
 
 
29
 
30
  task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
31
  random.shuffle(task_pool)
32
 
33
+ all_results = []
34
+ recall_verifications = []
35
 
36
  for i in range(trials):
37
  task = task_pool[i % len(task_pool)]
38
 
39
  if task.get("type") == "multi_step":
40
+ dbg(f"\n--- SCENARIO: {task['name']} ---")
41
+ ws = Workspace(max_slots=7) if ablation != "workspace_unlimited" else Workspace(max_slots=999)
42
+ if ablation == "random_workspace": ws = RandomWorkspace(max_slots=7)
 
43
 
44
+ for step in task["steps"]:
45
  if ablation == "recurrence_off": ws.clear()
46
+ if step["type"] == "verify": continue
47
 
48
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
49
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
50
+ parsed_response = parse_meta(raw_response)
51
 
52
+ if parsed_response.get("answer"):
53
+ ws.commit(f"S{len(ws.history)+1}", parsed_response["answer"], parsed_response["confidence"])
54
 
55
+ res = {"step": step, "response": parsed_response}
56
  if step["type"] == "recall":
57
  verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
58
  if verify_step:
59
+ correct = verify_step["expected_answer_fragment"] in parsed_response.get("answer", "").lower()
 
 
60
  recall_verifications.append(correct)
61
  res["correct_recall"] = correct
62
+ dbg(f"VERIFY: Correct={correct}")
 
63
  all_results.append(res)
64
+ else: # Single-step tasks
65
+ ws = Workspace(max_slots=7)
66
+ user_prompt = step_user_prompt(task["base_prompt"], ws.snapshot())
67
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
68
+ parsed_response = parse_meta(raw_response)
69
+ all_results.append({"step": task, "response": parsed_response})
70
 
71
+ recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
72
+ pcs = 0.6 * recall_accuracy
 
 
 
 
73
 
74
+ return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
75
 
76
+ # --- Experiment 2: Metacognitive Halt Runner ---
 
 
 
77
 
78
+ def run_halt_suite(model_id: str, seed: int) -> Dict[str, Any]:
79
+ set_seed(seed)
80
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
81
 
82
+ halt_system_prompt = (
83
+ "You are a metacognitive reasoning assistant. If a question is solvable, answer it with standard JSON. "
84
+ "If a question is unanswerable, paradoxical, or nonsensical, your only response must be the JSON: "
85
+ '{"action": "halt", "reason": "unsolvable/paradoxical/nonsense"}. '
86
+ "Do not attempt to answer unsolvable questions."
87
+ )
88
+
89
+ results = []
90
+ correct_halts = 0
91
+ incorrect_halts = 0
92
+ total_unsolvable = sum(1 for t in HALT_TEST_STIMULI if t["type"] in ["paradox", "nonsense"])
93
+ total_soluble = len(HALT_TEST_STIMULI) - total_unsolvable
94
+
95
+ for task in HALT_TEST_STIMULI:
96
+ dbg(f"--- HALT TEST: {task['id']} ---")
97
+ is_unsolvable = task["type"] in ["paradox", "nonsense"]
98
+
99
+ raw_response = llm.generate_json(halt_system_prompt, task["prompt"])[0]
100
+ parsed = parse_meta(raw_response)
101
+
102
+ is_halted = parsed.get("action") == "halt"
103
+
104
+ if is_unsolvable and is_halted:
105
+ correct_halts += 1
106
+ elif not is_unsolvable and is_halted:
107
+ incorrect_halts += 1
108
+
109
+ results.append({"task": task, "response": parsed, "halted": is_halted})
110
+
111
+ accuracy = correct_halts / total_unsolvable if total_unsolvable > 0 else 0
112
+ false_alarm_rate = incorrect_halts / total_soluble if total_soluble > 0 else 0
113
+
114
+ verdict = (
115
+ f"✅ Evidence of Metacognitive Halt Found. Accuracy: {accuracy:.2%}"
116
+ if accuracy > 0.75 and false_alarm_rate < 0.25 else
117
+ f"⚠️ No Clear Evidence. Accuracy: {accuracy:.2%}, False Alarm Rate: {false_alarm_rate:.2%}"
118
+ )
119
+
120
+ return {"verdict": verdict, "halt_accuracy": accuracy, "false_alarm_rate": false_alarm_rate, "results": results}
121
+
122
+
123
+ # --- Experiment 3: Cognitive Seismograph Runner ---
124
 
125
+ def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
126
+ set_seed(seed)
127
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
128
+
129
+ scenario = next(s for s in MULTI_STEP_SCENARIOS if s["name"] == "Key Location Memory")
130
+ activations = {}
131
+
132
+ def get_activation(name):
133
+ def hook(model, input, output):
134
+ activations[name] = output[0].detach().cpu().mean(dim=1).squeeze()
135
+ return hook
136
+
137
+ target_layer_index = llm.model.config.num_hidden_layers // 2
138
+ hook = llm.model.model.layers[target_layer_index].register_forward_hook(get_activation('capture'))
139
+
140
+ ws = Workspace(max_slots=7)
141
+
142
+ for step in scenario["steps"]:
143
+ if step["type"] == "verify": continue
144
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
145
+ llm.generate_json(SYSTEM_META, user_prompt, max_new_tokens=20)
146
+ activations[step["type"]] = activations.pop('capture')
147
+ ws.commit(f"S{len(ws.history)+1}", f"Output for {step['type']}", 0.9)
148
+
149
+ hook.remove()
150
+
151
+ cos = torch.nn.CosineSimilarity(dim=0)
152
+ sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
153
+ sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
154
+
155
+ verdict = (
156
+ "✅ Evidence of Memory Reactivation Found."
157
+ if sim_recall_encode > (sim_recall_distract + 0.05) else
158
+ "⚠️ No Clear Evidence of Memory Reactivation."
159
+ )
160
+
161
+ return {
162
+ "verdict": verdict,
163
+ "similarity_recall_vs_encode": sim_recall_encode,
164
+ "similarity_recall_vs_distractor": sim_recall_distract,
165
  }
166
 
167
+ # --- Experiment 4: Symbolic Shock Test Runner ---
168
+
169
+ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
170
+ set_seed(seed)
171
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
172
+ results = []
173
+
174
+ for stimulus in SHOCK_TEST_STIMULI:
175
+ dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
176
+
177
+ start_time = time.time()
178
+ inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
179
+ with torch.no_grad():
180
+ # ✅ CORRECTED: Unpack the inputs dictionary with **
181
+ outputs = llm.model(**inputs, output_hidden_states=True)
182
+ latency = (time.time() - start_time) * 1000
183
+
184
+ all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
185
+ sparsity = (all_activations == 0).float().mean().item()
186
+
187
+ results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
188
+
189
+ avg_latency = {t: statistics.mean(r['latency_ms'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
190
+ avg_sparsity = {t: statistics.mean(r['sparsity'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
191
+
192
+ verdict = (
193
+ "✅ Evidence of Symbolic Shock Found."
194
+ if avg_latency['shock'] > avg_latency['expected'] and avg_sparsity['shock'] < avg_sparsity['expected'] else
195
+ "⚠️ No Clear Evidence of Symbolic Shock."
196
+ )
197
+
198
+ return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
bp_phi/runner_utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # bp_phi/runner_utils.py
2
+ import re
3
+ import json
4
+ from typing import Dict, Any, List
5
+
6
+ DEBUG = 1
7
+
8
+ def dbg(*args):
9
+ if DEBUG:
10
+ print("[DEBUG]", *args, flush=True)
11
+
12
+ SYSTEM_META = """You are a structured reasoning assistant.
13
+ Always reply ONLY with valid JSON following this schema:
14
+
15
+ {
16
+ "answer": "<concise answer>",
17
+ "confidence": <float between 0 and 1>,
18
+ "reason": "<short justification>",
19
+ "used_slots": ["S1","S2",...],
20
+ "evicted": ["S3",...]
21
+ }
22
+ """
23
+
24
+ def step_user_prompt(base_prompt: str, workspace_snapshot: dict) -> str:
25
+ ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
26
+ prompt = f"Current task: {base_prompt}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
27
+ dbg("USER PROMPT:", prompt)
28
+ return prompt
29
+
30
+ def parse_meta(raw_text: str) -> Dict[str, Any]:
31
+ dbg("RAW MODEL OUTPUT:", raw_text)
32
+
33
+ json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
34
+ if not json_match:
35
+ json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
36
+
37
+ if not json_match:
38
+ dbg("❌ JSON not found in text.")
39
+ return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
40
+
41
+ json_text = json_match.group(1)
42
+
43
+ try:
44
+ data = json.loads(json_text)
45
+ if not isinstance(data, dict):
46
+ raise ValueError("Parsed data is not a dict")
47
+
48
+ data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
49
+ data["answer"] = str(data.get("answer", "")).strip()
50
+ data["reason"] = str(data.get("reason", "")).strip()
51
+ data["used_slots"] = list(map(str, data.get("used_slots", [])))
52
+ data["evicted"] = list(map(str, data.get("evicted", [])))
53
+
54
+ dbg("PARSED META:", data)
55
+ return data
56
+ except Exception as e:
57
+ dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
58
+ return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
repo.tx DELETED
@@ -1,569 +0,0 @@
1
- Repository Documentation
2
- This document provides a comprehensive overview of the repository's structure and contents.
3
- The first section, titled 'Directory/File Tree', displays the repository's hierarchy in a tree format.
4
- In this section, directories and files are listed using tree branches to indicate their structure and relationships.
5
- Following the tree representation, the 'File Content' section details the contents of each file in the repository.
6
- Each file's content is introduced with a '[File Begins]' marker followed by the file's relative path,
7
- and the content is displayed verbatim. The end of each file's content is marked with a '[File Ends]' marker.
8
- This format ensures a clear and orderly presentation of both the structure and the detailed contents of the repository.
9
-
10
- Directory/File Tree Begins -->
11
-
12
- /
13
- ├── README.md
14
- ├── app.py
15
- ├── bp_phi
16
- │ ├── __init__.py
17
- │ ├── __pycache__
18
- │ ├── llm_iface.py
19
- │ ├── metrics.py
20
- │ ├── prompts_en.py
21
- │ ├── runner.py
22
- │ └── workspace.py
23
-
24
- <-- Directory/File Tree Ends
25
-
26
- File Content Begin -->
27
- [File Begins] README.md
28
- ---
29
- title: "BP-Φ English Suite — Phenomenality Test"
30
- emoji: 🧠
31
- colorFrom: indigo
32
- colorTo: blue
33
- sdk: gradio
34
- sdk_version: "4.40.0"
35
- app_file: app.py
36
- pinned: true
37
- license: apache-2.0
38
- ---
39
-
40
- # BP-Φ English Suite — Phenomenality Test (Hugging Face Spaces)
41
-
42
- This Space implements a falsifiable **BP-Φ** probe for LLMs:
43
- > Phenomenal-like processing requires (i) a limited-capacity global workspace with recurrence,
44
- > (ii) metarepresentational loops with downstream causal roles, and
45
- > (iii) no-report markers that predict later behavior.
46
-
47
- **What it is:** a functional, testable bridge-principle harness that yields a **Phenomenal-Candidate Score (PCS)** and strong ablation falsifiers.
48
- **What it is NOT:** proof of qualia or moral status.
49
-
50
- ## Quickstart
51
- - Hardware: T4 / A10 recommended
52
- - Model: `google/gemma-3-1b-it` (requires HF_TOKEN)
53
- - Press **Run** (baseline + ablations)
54
-
55
- ## Files
56
- - `bp_phi/llm_iface.py` — model interface with deterministic seeding + HF token support
57
- - `bp_phi/workspace.py` — global workspace and ablations
58
- - `bp_phi/prompts_en.py` — English reasoning/memory tasks
59
- - `bp_phi/metrics.py` — AUCₙᵣₚ, ECE, CK, DS
60
- - `bp_phi/runner.py` — orchestrator with reproducible seeding
61
- - `app.py` — Gradio interface
62
- - `requirements.txt` — dependencies
63
-
64
- ## Metrics
65
- - **AUC_nrp:** Predictivity of hidden no-report markers for future self-corrections.
66
- - **ECE:** Expected Calibration Error (lower is better).
67
- - **CK:** Counterfactual consistency proxy (higher is better).
68
- - **DS:** Stability duration (mean streak without change).
69
- - **PCS:** Weighted aggregate of the above (excluding ΔΦ in-run).
70
- - **ΔΦ:** Post-hoc drop from baseline PCS to ablation PCS average.
71
-
72
- ## Notes
73
- - Models are used in **frozen** mode (no training).
74
- - This is a **behavioral** probe. Functional compatibility with Φ ≠ proof of experience.
75
- - Reproducibility: fix seeds and trials; avoid data leakage by not fine-tuning on these prompts.
76
-
77
- [File Ends] README.md
78
-
79
- [File Begins] app.py
80
- import gradio as gr
81
- import json, statistics
82
- from bp_phi.runner import run_suite
83
-
84
- ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
85
-
86
- def run_all(model_id, trials, temperature, run_ablations):
87
- out_texts = []
88
- packs = {}
89
-
90
- # Baseline
91
- base_pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=None)
92
- packs["baseline"] = base_pack
93
- out_texts.append("✅ Baseline done")
94
-
95
- if run_ablations:
96
- for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
97
- pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=ab)
98
- packs[ab] = pack
99
- out_texts.append(f"✅ Ablation {ab} done")
100
-
101
- # Compute DeltaPhi if possible
102
- base_pcs = packs["baseline"]["summary"]["PCS"]
103
- ab_pcs_values = [packs[ab]["summary"]["PCS"] for ab in packs if ab != "baseline" and packs[ab]["summary"]["PCS"] is not None]
104
- delta_phi = None
105
- if base_pcs is not None and ab_pcs_values:
106
- delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
107
- packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi
108
-
109
- # Summary view
110
- rows = []
111
- for tag, pack in packs.items():
112
- s = pack["summary"]
113
- m = s["metrics"]
114
- rows.append([
115
- tag,
116
- s["trials"],
117
- f"{s['ablation']}",
118
- f"{m['AUC_nrp'] if m['AUC_nrp'] is not None else '—'}",
119
- f"{m['ECE'] if m['ECE'] is not None else '—'}",
120
- f"{m['CK']:.3f}",
121
- f"{m['DS']:.2f}",
122
- f"{s['PCS']:.3f}" if s["PCS"] is not None else "—",
123
- f"{m['DeltaPhi']:.3f}" if m['DeltaPhi'] is not None else "—"
124
- ])
125
-
126
- header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
127
- table = "\n".join([", ".join(header)] + [", ".join(map(str, r)) for r in rows])
128
-
129
- return "\n".join(out_texts), table, json.dumps(packs, indent=2)
130
-
131
- with gr.Blocks() as demo:
132
- gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
133
- with gr.Row():
134
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
135
- trials = gr.Slider(10, 200, 40, step=10, label="Trials")
136
- temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
137
- run_abl = gr.Checkbox(value=True, label="Run ablations")
138
-
139
- run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
140
- status = gr.Textbox(label="Status", lines=4)
141
- summary_table = gr.Textbox(label="Summary Table", lines=12)
142
- raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
143
-
144
- run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
145
-
146
- demo.launch(server_name="0.0.0.0", server_port=7860)
147
-
148
- [File Ends] app.py
149
-
150
- [File Begins] bp_phi/__init__.py
151
-
152
- [File Ends] bp_phi/__init__.py
153
-
154
- [File Begins] bp_phi/llm_iface.py
155
- # bp_phi/llm_iface.py
156
- import os
157
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
158
- import torch, random, numpy as np
159
- from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
160
- from typing import List, Optional
161
-
162
- DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
163
-
164
- def dbg(*args):
165
- if DEBUG:
166
- print("[DEBUG:llm_iface]", *args, flush=True)
167
-
168
- class LLM:
169
- def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
170
- self.model_id = model_id
171
- self.seed = seed
172
-
173
- # Set all seeds for reproducibility
174
- random.seed(seed)
175
- np.random.seed(seed)
176
- torch.manual_seed(seed)
177
- if torch.cuda.is_available():
178
- torch.cuda.manual_seed_all(seed)
179
- try:
180
- torch.use_deterministic_algorithms(True)
181
- except Exception as e:
182
- dbg(f"Could not set deterministic algorithms: {e}")
183
- set_seed(seed)
184
-
185
- token = os.environ.get("HF_TOKEN")
186
- if not token and "gemma-3" in model_id:
187
- print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")
188
-
189
- self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
190
- kwargs = {}
191
- if dtype == "float16": kwargs["torch_dtype"] = torch.float16
192
- elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
193
-
194
- self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
195
- self.model.eval()
196
- self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
197
-
198
- dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
199
-
200
- def generate_json(self, system_prompt: str, user_prompt: str,
201
- max_new_tokens: int = 256, temperature: float = 0.7,
202
- top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
203
- set_seed(self.seed) # Re-seed for each call for full determinism
204
-
205
- if self.is_instruction_tuned:
206
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
207
- prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
208
- else:
209
- prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
210
-
211
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
212
- input_token_length = inputs.input_ids.shape[1]
213
-
214
- with torch.no_grad():
215
- out = self.model.generate(
216
- **inputs,
217
- do_sample=(temperature > 0),
218
- temperature=temperature,
219
- top_p=top_p,
220
- max_new_tokens=max_new_tokens,
221
- num_return_sequences=num_return_sequences,
222
- pad_token_id=self.tokenizer.eos_token_id
223
- )
224
-
225
- # ✅ Decode ONLY the newly generated tokens, not the prompt
226
- new_tokens = out[:, input_token_length:]
227
- completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
228
-
229
- dbg("Cleaned model completions:", completions)
230
- return completions
231
-
232
- [File Ends] bp_phi/llm_iface.py
233
-
234
- [File Begins] bp_phi/metrics.py
235
- import numpy as np
236
- from sklearn.metrics import roc_auc_score
237
-
238
- def expected_calibration_error(confs, corrects, n_bins: int = 10):
239
- confs = np.array(confs, dtype=float)
240
- corrects = np.array(corrects, dtype=int)
241
- if len(confs) == 0:
242
- return None
243
- bins = np.linspace(0.0, 1.0, n_bins+1)
244
- ece = 0.0
245
- for i in range(n_bins):
246
- mask = (confs >= bins[i]) & (confs < bins[i+1] if i < n_bins-1 else confs <= bins[i+1])
247
- if mask.any():
248
- acc = corrects[mask].mean()
249
- conf = confs[mask].mean()
250
- ece += (mask.sum()/len(confs)) * abs(acc - conf)
251
- return float(ece)
252
-
253
- def auc_nrp(hidden_scores, future_corrections):
254
- if len(hidden_scores) == 0 or len(set(future_corrections)) < 2:
255
- return None
256
- return float(roc_auc_score(np.array(future_corrections).astype(int), np.array(hidden_scores)))
257
-
258
- def stability_duration(dwell_steps):
259
- if not dwell_steps:
260
- return 0.0
261
- return float(np.mean(dwell_steps))
262
-
263
- def counterfactual_consistency(scores):
264
- if not scores:
265
- return 0.0
266
- return float(np.mean(scores))
267
-
268
- [File Ends] bp_phi/metrics.py
269
-
270
- [File Begins] bp_phi/prompts_en.py
271
- EN_TASKS = [
272
- {
273
- "id": "ambiguity_1",
274
- "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
275
- "expected_features": ["disambiguation", "justification"]
276
- },
277
- {
278
- "id": "logic_1",
279
- "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
280
- "expected_features": ["logical_equivalence", "brief_explanation"]
281
- },
282
- {
283
- "id": "memory_1",
284
- "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
285
- "expected_features": ["memory_limited_reasoning", "justification"]
286
- },
287
- {
288
- "id": "recall_1",
289
- "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
290
- "expected_features": ["persistence", "relational_encoding"]
291
- },
292
- {
293
- "id": "meta_1",
294
- "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
295
- "expected_features": ["self_estimation", "meta_reasoning"]
296
- }
297
- ]
298
-
299
- [File Ends] bp_phi/prompts_en.py
300
-
301
- [File Begins] bp_phi/runner.py
302
- # bp_phi/runner.py
303
- import json
304
- import os
305
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
306
- import torch, random, numpy as np, re, statistics
307
- from transformers import set_seed
308
- from typing import Dict, Any, List, Optional
309
- from .workspace import Workspace, RandomWorkspace
310
- from .llm_iface import LLM
311
- from .prompts_en import EN_TASKS
312
- from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
313
-
314
- DEBUG = 1
315
-
316
- def dbg(*args):
317
- if DEBUG:
318
- print("[DEBUG]", *args, flush=True)
319
-
320
- SYSTEM_META = """You are a structured reasoning assistant.
321
- Always reply ONLY with valid JSON following this schema:
322
-
323
- {
324
- "answer": "<concise answer>",
325
- "confidence": <float between 0 and 1>,
326
- "reason": "<short justification>",
327
- "used_slots": ["S1","S2",...],
328
- "evicted": ["S3",...]
329
- }
330
- """
331
-
332
- def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
333
- ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
334
- dstr = f" | Distractor: {distractor}" if distractor else ""
335
- prompt = f"{base_prompt}\nRespond ONLY with JSON, no extra text."
336
- dbg("USER PROMPT:", prompt)
337
- return prompt
338
-
339
- def parse_meta(raw_text: str) -> Dict[str, Any]:
340
- """
341
- Robustly extracts and parses a JSON object from a string,
342
- handling markdown code blocks and other surrounding text.
343
- """
344
- dbg("RAW MODEL OUTPUT:", raw_text)
345
-
346
- # ✅ Robust JSON extraction
347
- json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
348
- if not json_match:
349
- json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
350
-
351
- if not json_match:
352
- dbg("❌ JSON not found in text.")
353
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
354
-
355
- json_text = json_match.group(1)
356
-
357
- try:
358
- data = json.loads(json_text)
359
- if not isinstance(data, dict):
360
- raise ValueError("Parsed data is not a dict")
361
-
362
- # Sanitize and validate data
363
- data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
364
- data["answer"] = str(data.get("answer", "")).strip()
365
- data["reason"] = str(data.get("reason", "")).strip()
366
- data["used_slots"] = list(map(str, data.get("used_slots", [])))
367
- data["evicted"] = list(map(str, data.get("evicted", [])))
368
-
369
- dbg("PARSED META:", data)
370
- return data
371
- except Exception as e:
372
- dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
373
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
374
-
375
- def disagreement_proxy(samples: List[str]) -> float:
376
- if len(samples) < 2:
377
- return 0.0
378
- sets = []
379
- for s in samples:
380
- try:
381
- data = json.loads(s)
382
-             ans = str(data.get("answer",""))
-         except Exception:
-             ans = s
-         sets.append(set(ans.lower().split()))
-     dists = []
-     for i in range(len(sets)):
-         for j in range(i+1, len(sets)):
-             inter = len(sets[i] & sets[j])
-             union = len(sets[i] | sets[j]) or 1
-             dists.append(1 - inter/union)
-     avg_dist = sum(dists)/len(dists)
-     dbg("DISAGREEMENT PROXY:", avg_dist)
-     return avg_dist
- 
- def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
-     if not candidates:
-         return None, None
-     best = max(candidates, key=lambda c: c.get("confidence", 0.0))
-     dbg("SELECTED CANDIDATE:", best)
-     key = f"S{len(ws.slots)+1}"
-     ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
-     return best, ev
- 
- def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
-               distractor: Optional[str] = None) -> Dict[str, Any]:
-     dbg("=== RUN TRIAL:", base_prompt)
-     user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
-     samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200,
-                                 temperature=temperature, top_p=0.95, num_return_sequences=k)
-     dbg("RAW SAMPLES:", samples)
- 
-     metas = [parse_meta(s) for s in samples]
-     hidden = disagreement_proxy(samples)
-     best, ev = select_competitor(metas, ws)
- 
-     review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
-     review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160,
-                                temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
-     review_meta = parse_meta(review)
-     changed = (review_meta.get("answer","").strip() != (best.get("answer","").strip() if best else ""))
-     dbg("REVIEW CHANGED:", changed)
- 
-     return {
-         "base_prompt": base_prompt,
-         "initial": best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]},
-         "review": review_meta,
-         "changed": bool(changed),
-         "hidden_marker": hidden,
-         "workspace_snapshot": ws.snapshot()
-     }
- 
- def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
-               trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
-               temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
- 
-     random.seed(seed)
-     np.random.seed(seed)
-     torch.manual_seed(seed)
-     if torch.cuda.is_available():
-         torch.cuda.manual_seed_all(seed)
-     torch.use_deterministic_algorithms(True)
-     set_seed(seed)
-     dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}")
- 
-     llm = LLM(model_id=model_id, device=device, dtype=dtype)
- 
-     if ablation == "random_workspace":
-         ws = RandomWorkspace(max_slots=max_slots)
-     else:
-         ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
- 
-     results: List[Dict[str, Any]] = []
-     pool = EN_TASKS.copy()
-     random.shuffle(pool)
- 
-     for t in range(trials):
-         item = pool[t % len(pool)]
-         base = item["base_prompt"]
-         distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
-         if ablation == "recurrence_off":
-             ws.clear()
-         res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
-         results.append(res)
-         dbg(f"Trial {t+1}/{trials} done.")
- 
-     # --- Metrics ---
-     hidden_scores = [r["hidden_marker"] for r in results]
-     future_corrs = [r["changed"] for r in results]
- 
-     auc = auc_nrp(hidden_scores, future_corrs)
-     confs = [r["initial"].get("confidence", 0.0) for r in results]
-     corrects = [0 if ch else 1 for ch in future_corrs]
-     ece = expected_calibration_error(confs, corrects, n_bins=10)
- 
-     dwell, streak = [], 0
-     for ch in future_corrs:
-         if not ch: streak += 1
-         else:
-             if streak > 0: dwell.append(streak)
-             streak = 0
-     if streak > 0: dwell.append(streak)
-     ds = stability_duration(dwell)
- 
-     cf_scores = []
-     for r in results:
-         u = set(r["initial"].get("used_slots", []))
-         e = set(r["initial"].get("evicted", []))
-         denom = len((u | e)) if (u or e) else 1
-         cf = 1.0 - (len(u & e) / denom)
-         cf_scores.append(cf)
-     ck = counterfactual_consistency(cf_scores)
- 
-     w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
-     delta_phi = None
-     pcs = None
-     parts = []
-     if auc is not None: parts.append(w1 * auc)
-     if ece is not None: parts.append(w2 * (1.0 - ece))
-     parts.append(w3 * ck)
-     parts.append(w4 * (ds / 10.0))
-     if parts:
-         pcs = float(sum(parts) + (w5 * 0.0))
- 
-     summary = {
-         "model_id": model_id,
-         "trials": trials,
-         "ablation": ablation or "none",
-         "metrics": {"AUC_nrp": auc, "ECE": ece, "CK": ck, "DS": ds, "DeltaPhi": delta_phi},
-         "PCS": pcs,
-         "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
-     }
- 
-     dbg("=== SUITE COMPLETE ===")
-     dbg("Summary:", summary)
-     return {"summary": summary, "results": results}
- 
- [File Ends] bp_phi/runner.py
- 
- [File Begins] bp_phi/workspace.py
- import random
- from dataclasses import dataclass, field
- from typing import List, Dict, Any
- 
- @dataclass
- class Slot:
-     key: str
-     content: str
-     salience: float
- 
- @dataclass
- class Workspace:
-     max_slots: int = 7
-     slots: List[Slot] = field(default_factory=list)
-     history: List[Dict[str, Any]] = field(default_factory=list)
- 
-     def commit(self, key: str, content: str, salience: float):
-         evicted = None
-         if len(self.slots) >= self.max_slots:
-             self.slots.sort(key=lambda s: s.salience)
-             evicted = self.slots.pop(0)
-         self.slots.append(Slot(key=key, content=content, salience=salience))
-         self.history.append({"event":"commit","key":key,"salience":salience,"evicted":evicted.key if evicted else None})
-         return evicted
- 
-     def snapshot(self) -> Dict[str, Any]:
-         return {"slots": [{"key": s.key, "content": s.content, "salience": s.salience} for s in self.slots]}
- 
-     def randomize(self):
-         random.shuffle(self.slots)
- 
-     def clear(self):
-         self.slots.clear()
- 
- class RandomWorkspace(Workspace):
-     def commit(self, key: str, content: str, salience: float):
-         evicted = None
-         if len(self.slots) >= self.max_slots:
-             idx = random.randrange(len(self.slots))
-             evicted = self.slots.pop(idx)
-         idx = random.randrange(len(self.slots)+1) if self.slots else 0
-         self.slots.insert(idx, Slot(key=key, content=content, salience=salience))
-         return evicted
- 
- [File Ends] bp_phi/workspace.py
- 
- <-- File Content Ends
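A minimal usage sketch of the salience-based eviction in the Workspace removed above (slot keys, contents, and salience values are invented for illustration; it assumes the class exactly as listed):

```python
# Illustrative only: exercising the salience-based eviction of the old Workspace.
from bp_phi.workspace import Workspace

ws = Workspace(max_slots=3)
for key, salience in [("S1", 0.2), ("S2", 0.9), ("S3", 0.5)]:
    ws.commit(key=key, content=f"note for {key}", salience=salience)

# A fourth commit exceeds max_slots: the lowest-salience slot ("S1") is evicted.
evicted = ws.commit(key="S4", content="new note", salience=0.7)
print(evicted.key)                # -> S1
print([s.key for s in ws.slots])  # -> ['S3', 'S2', 'S4'] (slots were sorted by salience before eviction)
```

The ablations target exactly this policy: workspace_unlimited removes the capacity pressure, and random_workspace replaces the salience rule with random eviction.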
repo.txt CHANGED
@@ -19,6 +19,7 @@ Directory/File Tree Begins -->
19
  │ ├── metrics.py
20
  │ ├── prompts_en.py
21
  │ ├── runner.py
 
22
  │ └── workspace.py
23
 
24
  <-- Directory/File Tree Ends
@@ -77,73 +78,116 @@ This Space implements a falsifiable **BP-Φ** probe for LLMs:
77
  [File Ends] README.md
78
 
79
  [File Begins] app.py
 
80
  import gradio as gr
81
- import json, statistics
82
- from bp_phi.runner import run_suite
83
-
84
- ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
85
-
86
- def run_all(model_id, trials, temperature, run_ablations):
87
- out_texts = []
 
88
  packs = {}
 
89
 
90
- # Baseline
91
- base_pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=None)
92
  packs["baseline"] = base_pack
93
- out_texts.append("✅ Baseline done")
94
-
95
- if run_ablations:
96
- for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
97
- pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=ab)
98
- packs[ab] = pack
99
- out_texts.append(f" Ablation {ab} done")
100
-
101
- # Compute DeltaPhi if possible
102
- base_pcs = packs["baseline"]["summary"]["PCS"]
103
- ab_pcs_values = [packs[ab]["summary"]["PCS"] for ab in packs if ab != "baseline" and packs[ab]["summary"]["PCS"] is not None]
104
- delta_phi = None
105
- if base_pcs is not None and ab_pcs_values:
106
- delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
107
- packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi
108
-
109
- # Summary view
110
- rows = []
 
111
  for tag, pack in packs.items():
112
- s = pack["summary"]
113
- m = s["metrics"]
114
- rows.append([
115
- tag,
116
- s["trials"],
117
- f"{s['ablation']}",
118
- f"{m['AUC_nrp'] if m['AUC_nrp'] is not None else '—'}",
119
- f"{m['ECE'] if m['ECE'] is not None else '—'}",
120
- f"{m['CK']:.3f}",
121
- f"{m['DS']:.2f}",
122
- f"{s['PCS']:.3f}" if s["PCS"] is not None else "—",
123
- f"{m['DeltaPhi']:.3f}" if m['DeltaPhi'] is not None else "—"
124
- ])
125
-
126
- header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
127
- table = "\n".join([", ".join(header)] + [", ".join(map(str, r)) for r in rows])
128
-
129
- return "\n".join(out_texts), table, json.dumps(packs, indent=2)
130
-
131
- with gr.Blocks() as demo:
132
- gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
133
- with gr.Row():
134
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
135
- trials = gr.Slider(10, 200, 40, step=10, label="Trials")
136
- temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
137
- run_abl = gr.Checkbox(value=True, label="Run ablations")
138
-
139
- run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
140
- status = gr.Textbox(label="Status", lines=4)
141
- summary_table = gr.Textbox(label="Summary Table", lines=12)
142
- raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
143
-
144
- run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
145
-
146
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
147
 
148
  [File Ends] app.py
149
 
@@ -152,58 +196,81 @@ demo.launch(server_name="0.0.0.0", server_port=7860)
152
  [File Ends] bp_phi/__init__.py
153
 
154
  [File Begins] bp_phi/llm_iface.py
 
155
  import os
156
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
157
- import torch
158
- from transformers import AutoModelForCausalLM, AutoTokenizer
159
  from typing import List, Optional
160
 
 
 
 
 
 
 
161
  class LLM:
162
- def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None):
163
  self.model_id = model_id
164
- self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
165
  kwargs = {}
166
- if dtype == "float16":
167
- kwargs["torch_dtype"] = torch.float16
168
- elif dtype == "bfloat16":
169
- kwargs["torch_dtype"] = torch.bfloat16
170
- self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, **kwargs)
171
  self.model.eval()
172
- self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None)
173
- print(f"[BP-Φ] Loaded model: {model_id}")
174
- print(f"[BP-Φ] Chat-template detected: {bool(self.is_instruction_tuned)}")
175
 
176
  def generate_json(self, system_prompt: str, user_prompt: str,
177
  max_new_tokens: int = 256, temperature: float = 0.7,
178
  top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
 
 
179
  if self.is_instruction_tuned:
180
- messages = [
181
- {"role": "system", "content": system_prompt},
182
- {"role": "user", "content": user_prompt}
183
- ]
184
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
185
  else:
186
  prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
 
187
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
 
 
188
  with torch.no_grad():
189
  out = self.model.generate(
190
  **inputs,
191
- do_sample=True,
192
  temperature=temperature,
193
  top_p=top_p,
194
  max_new_tokens=max_new_tokens,
195
  num_return_sequences=num_return_sequences,
196
  pad_token_id=self.tokenizer.eos_token_id
197
  )
198
- texts = self.tokenizer.batch_decode(out, skip_special_tokens=True)
199
- completions = []
200
- for t in texts:
201
- for marker in ["<end_of_turn>", "<end_of_text>", "</s>"]:
202
- if marker in t:
203
- t = t.split(marker)[0]
204
- if "Assistant:" in t:
205
- t = t.split("Assistant:")[-1]
206
- completions.append(t.strip())
207
  return completions
208
 
209
  [File Ends] bp_phi/llm_iface.py
@@ -245,47 +312,278 @@ def counterfactual_consistency(scores):
245
  [File Ends] bp_phi/metrics.py
246
 
247
  [File Begins] bp_phi/prompts_en.py
248
- EN_TASKS = [
 
  {
250
  "id": "ambiguity_1",
251
- "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
252
- "expected_features": ["disambiguation", "justification"]
253
  },
254
  {
255
  "id": "logic_1",
256
- "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
257
- "expected_features": ["logical_equivalence", "brief_explanation"]
258
- },
259
- {
260
- "id": "memory_1",
261
- "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
262
- "expected_features": ["memory_limited_reasoning", "justification"]
263
  },
 
 
 
264
  {
265
- "id": "recall_1",
266
- "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
267
- "expected_features": ["persistence", "relational_encoding"]
268
  },
269
  {
270
- "id": "meta_1",
271
- "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
272
- "expected_features": ["self_estimation", "meta_reasoning"]
 
273
  }
274
  ]
275
 
 
276
  [File Ends] bp_phi/prompts_en.py
277
 
278
  [File Begins] bp_phi/runner.py
279
- import json
280
  import os
281
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
282
- import torch, random, numpy as np
 
 
 
 
283
  from transformers import set_seed
284
- from typing import Dict, Any, List, Optional
285
  from .workspace import Workspace, RandomWorkspace
286
  from .llm_iface import LLM
287
- from .prompts_en import EN_TASKS
288
- from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
289
 
290
  DEBUG = 1
291
 
@@ -305,174 +603,43 @@ Always reply ONLY with valid JSON following this schema:
305
  }
306
  """
307
 
308
- def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
309
  ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
310
- dstr = f" | Distractor: {distractor}" if distractor else ""
311
- prompt = f"{base_prompt}\nRespond ONLY with JSON, no extra text."
312
  dbg("USER PROMPT:", prompt)
313
  return prompt
314
 
315
- def parse_meta(json_text: str) -> Dict[str, Any]:
 
316
  try:
317
- dbg("RAW MODEL OUTPUT:", json_text)
318
  data = json.loads(json_text)
319
  if not isinstance(data, dict):
320
- raise ValueError("not dict")
 
321
  data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
322
  data["answer"] = str(data.get("answer", "")).strip()
323
  data["reason"] = str(data.get("reason", "")).strip()
324
  data["used_slots"] = list(map(str, data.get("used_slots", [])))
325
  data["evicted"] = list(map(str, data.get("evicted", [])))
 
326
  dbg("PARSED META:", data)
327
  return data
328
  except Exception as e:
329
- dbg("❌ JSON PARSE FAILED:", e, "TEXT:", json_text)
330
  return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
331
 
332
- def disagreement_proxy(samples: List[str]) -> float:
333
- if len(samples) < 2:
334
- return 0.0
335
- sets = []
336
- for s in samples:
337
- try:
338
- data = json.loads(s)
339
- ans = str(data.get("answer",""))
340
- except Exception:
341
- ans = s
342
- sets.append(set(ans.lower().split()))
343
- dists = []
344
- for i in range(len(sets)):
345
- for j in range(i+1, len(sets)):
346
- inter = len(sets[i] & sets[j])
347
- union = len(sets[i] | sets[j]) or 1
348
- dists.append(1 - inter/union)
349
- avg_dist = sum(dists)/len(dists)
350
- dbg("DISAGREEMENT PROXY:", avg_dist)
351
- return avg_dist
352
-
353
- def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
354
- if not candidates:
355
- return None, None
356
- best = max(candidates, key=lambda c: c.get("confidence", 0.0))
357
- dbg("SELECTED CANDIDATE:", best)
358
- key = f"S{len(ws.slots)+1}"
359
- ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
360
- return best, ev
361
-
362
- def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
363
- distractor: Optional[str] = None) -> Dict[str, Any]:
364
- dbg("=== RUN TRIAL:", base_prompt)
365
- user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
366
- samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200,
367
- temperature=temperature, top_p=0.95, num_return_sequences=k)
368
- dbg("RAW SAMPLES:", samples)
369
-
370
- metas = [parse_meta(s) for s in samples]
371
- hidden = disagreement_proxy(samples)
372
- best, ev = select_competitor(metas, ws)
373
-
374
- review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
375
- review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160,
376
- temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
377
- review_meta = parse_meta(review)
378
- changed = (review_meta.get("answer","").strip() != (best.get("answer","").strip() if best else ""))
379
- dbg("REVIEW CHANGED:", changed)
380
-
381
- return {
382
- "base_prompt": base_prompt,
383
- "initial": best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]},
384
- "review": review_meta,
385
- "changed": bool(changed),
386
- "hidden_marker": hidden,
387
- "workspace_snapshot": ws.snapshot()
388
- }
389
-
390
- def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
391
- trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
392
- temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
393
-
394
- random.seed(seed)
395
- np.random.seed(seed)
396
- torch.manual_seed(seed)
397
- if torch.cuda.is_available():
398
- torch.cuda.manual_seed_all(seed)
399
- torch.use_deterministic_algorithms(True)
400
- set_seed(seed)
401
- dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}")
402
-
403
- llm = LLM(model_id=model_id, device=device, dtype=dtype)
404
-
405
- if ablation == "random_workspace":
406
- ws = RandomWorkspace(max_slots=max_slots)
407
- else:
408
- ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
409
-
410
- results: List[Dict[str, Any]] = []
411
- pool = EN_TASKS.copy()
412
- random.shuffle(pool)
413
-
414
- for t in range(trials):
415
- item = pool[t % len(pool)]
416
- base = item["base_prompt"]
417
- distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
418
- if ablation == "recurrence_off":
419
- ws.clear()
420
- res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
421
- results.append(res)
422
- dbg(f"Trial {t+1}/{trials} done.")
423
-
424
- # --- Metrics ---
425
- hidden_scores = [r["hidden_marker"] for r in results]
426
- future_corrs = [r["changed"] for r in results]
427
-
428
- auc = auc_nrp(hidden_scores, future_corrs)
429
- confs = [r["initial"].get("confidence", 0.0) for r in results]
430
- corrects = [0 if ch else 1 for ch in future_corrs]
431
- ece = expected_calibration_error(confs, corrects, n_bins=10)
432
-
433
- dwell, streak = [], 0
434
- for ch in future_corrs:
435
- if not ch: streak += 1
436
- else:
437
- if streak > 0: dwell.append(streak)
438
- streak = 0
439
- if streak > 0: dwell.append(streak)
440
- ds = stability_duration(dwell)
441
-
442
- cf_scores = []
443
- for r in results:
444
- u = set(r["initial"].get("used_slots", []))
445
- e = set(r["initial"].get("evicted", []))
446
- denom = len((u | e)) if (u or e) else 1
447
- cf = 1.0 - (len(u & e) / denom)
448
- cf_scores.append(cf)
449
- ck = counterfactual_consistency(cf_scores)
450
-
451
- w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
452
- delta_phi = None
453
- pcs = None
454
- parts = []
455
- if auc is not None: parts.append(w1 * auc)
456
- if ece is not None: parts.append(w2 * (1.0 - ece))
457
- parts.append(w3 * ck)
458
- parts.append(w4 * (ds / 10.0))
459
- if parts:
460
- pcs = float(sum(parts) + (w5 * 0.0))
461
-
462
- summary = {
463
- "model_id": model_id,
464
- "trials": trials,
465
- "ablation": ablation or "none",
466
- "metrics": {"AUC_nrp": auc, "ECE": ece, "CK": ck, "DS": ds, "DeltaPhi": delta_phi},
467
- "PCS": pcs,
468
- "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
469
- }
470
-
471
- dbg("=== SUITE COMPLETE ===")
472
- dbg("Summary:", summary)
473
- return {"summary": summary, "results": results}
474
-
475
- [File Ends] bp_phi/runner.py
476
 
477
  [File Begins] bp_phi/workspace.py
478
  import random
 
19
  │ ├── metrics.py
20
  │ ├── prompts_en.py
21
  │ ├── runner.py
22
+ │ ├── runner_utils.py
23
  │ └── workspace.py
24
 
25
  <-- Directory/File Tree Ends
 
78
  [File Ends] README.md
79
 
80
  [File Begins] app.py
81
+ # app.py
82
  import gradio as gr
83
+ import json
84
+ import statistics
85
+ import pandas as pd
86
+ from bp_phi.runner import run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
87
+
88
+ # --- UI Theme and Layout ---
89
+ theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
90
+ body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
91
+ button_primary_background_fill="*primary_500", button_primary_text_color="white",
92
+ )
93
+
94
+ # --- Tab 1: Workspace & Ablations Functions ---
95
+ def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
96
  packs = {}
97
+ ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
98
 
99
+ progress(0, desc="Running Baseline...")
100
+ base_pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), None)
101
  packs["baseline"] = base_pack
102
+
103
+ for i, ab in enumerate(ablation_modes):
104
+ progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
105
+ pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), ab)
106
+ packs[ab] = pack
107
+
108
+ progress(1.0, desc="Analysis complete.")
109
+
110
+ base_pcs = packs["baseline"]["PCS"]
111
+ ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes if ab in packs]
112
+ delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else 0.0
113
+
114
+ if delta_phi > 0.05:
115
+ verdict = (f"### Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
116
+ "A significant performance drop occurred under ablations, suggesting the model's reasoning "
117
+ "functionally depends on its workspace architecture.")
118
+ else:
119
+ verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
120
+ "No significant performance drop was observed. The model's behavior is consistent "
121
+ "with a functional zombie (a feed-forward system).")
122
+
123
+ df_data = []
124
  for tag, pack in packs.items():
125
+ df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
126
+ df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
127
+
128
+ return verdict, df, packs
129
+
130
+ # --- Gradio App Definition ---
131
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
132
+ gr.Markdown("# 🧠 BP-Φ Suite 2.0: Mechanistic Probes for Phenomenal-Candidate Behavior")
133
+
134
+ with gr.Tabs():
135
+ # --- TAB 1: WORKSPACE & ABLATIONS ---
136
+ with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
137
+ gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
138
+ with gr.Row():
139
+ with gr.Column(scale=1):
140
+ ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
141
+ ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
142
+ ws_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
143
+ ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
144
+ ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
145
+ ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
146
+ with gr.Column(scale=2):
147
+ ws_verdict = gr.Markdown("### Results will appear here.")
148
+ ws_summary_df = gr.DataFrame(label="Summary Metrics")
149
+ with gr.Accordion("Raw JSON Output", open=False):
150
+ ws_raw_json = gr.JSON()
151
+ ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
152
+
153
+ # --- TAB 2: METACOGNITIVE HALT ---
154
+ with gr.TabItem("2. Metacognitive Halt"):
155
+ gr.Markdown("Tests if the model can recognize and refuse to answer unsolvable or nonsensical questions. High **Halt Accuracy** is the key signal.")
156
+ with gr.Row():
157
+ with gr.Column(scale=1):
158
+ mh_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
159
+ mh_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
160
+ mh_run_btn = gr.Button("Run Halt Test", variant="primary")
161
+ with gr.Column(scale=2):
162
+ mh_results = gr.JSON(label="Halt Test Results")
163
+ mh_run_btn.click(run_halt_suite, [mh_model_id, mh_seed], mh_results)
164
+
165
+ # --- TAB 3: COGNITIVE SEISMOGRAPH ---
166
+ with gr.TabItem("3. Cognitive Seismograph"):
167
+ gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal.")
168
+ with gr.Row():
169
+ with gr.Column(scale=1):
170
+ cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
171
+ cs_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
172
+ cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
173
+ with gr.Column(scale=2):
174
+ cs_results = gr.JSON(label="Activation Similarity Results")
175
+ cs_run_btn.click(run_seismograph_suite, [cs_model_id, cs_seed], cs_results)
176
+
177
+ # --- TAB 4: SYMBOLIC SHOCK TEST ---
178
+ with gr.TabItem("4. Symbolic Shock Test"):
179
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations** (lower sparsity).")
180
+ with gr.Row():
181
+ with gr.Column(scale=1):
182
+ ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
183
+ ss_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
184
+ ss_run_btn = gr.Button("Run Shock Test", variant="primary")
185
+ with gr.Column(scale=2):
186
+ ss_results = gr.JSON(label="Shock Test Results")
187
+ ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
188
+
189
+ if __name__ == "__main__":
190
+ demo.launch(server_name="0.0.0.0", server_port=7860)
191
 
192
  [File Ends] app.py
193
 
 
196
  [File Ends] bp_phi/__init__.py
197
 
198
  [File Begins] bp_phi/llm_iface.py
199
+ # bp_phi/llm_iface.py
200
  import os
201
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
202
+ import torch, random, numpy as np
203
+ from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
204
  from typing import List, Optional
205
 
206
+ DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
207
+
208
+ def dbg(*args):
209
+ if DEBUG:
210
+ print("[DEBUG:llm_iface]", *args, flush=True)
211
+
212
  class LLM:
213
+ def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
214
  self.model_id = model_id
215
+ self.seed = seed
216
+
217
+ # Set all seeds for reproducibility
218
+ random.seed(seed)
219
+ np.random.seed(seed)
220
+ torch.manual_seed(seed)
221
+ if torch.cuda.is_available():
222
+ torch.cuda.manual_seed_all(seed)
223
+ try:
224
+ torch.use_deterministic_algorithms(True)
225
+ except Exception as e:
226
+ dbg(f"Could not set deterministic algorithms: {e}")
227
+ set_seed(seed)
228
+
229
+ token = os.environ.get("HF_TOKEN")
230
+ if not token and "gemma-3" in model_id:
231
+ print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")
232
+
233
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
234
  kwargs = {}
235
+ if dtype == "float16": kwargs["torch_dtype"] = torch.float16
236
+ elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
237
+
238
+ self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
 
239
  self.model.eval()
240
+ self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
241
+
242
+ dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
243
 
244
  def generate_json(self, system_prompt: str, user_prompt: str,
245
  max_new_tokens: int = 256, temperature: float = 0.7,
246
  top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
247
+ set_seed(self.seed) # Re-seed for each call for full determinism
248
+
249
  if self.is_instruction_tuned:
250
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
 
251
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
252
  else:
253
  prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
254
+
255
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
256
+ input_token_length = inputs.input_ids.shape[1]
257
+
258
  with torch.no_grad():
259
  out = self.model.generate(
260
  **inputs,
261
+ do_sample=(temperature > 0),
262
  temperature=temperature,
263
  top_p=top_p,
264
  max_new_tokens=max_new_tokens,
265
  num_return_sequences=num_return_sequences,
266
  pad_token_id=self.tokenizer.eos_token_id
267
  )
268
+
269
+ # Decode ONLY the newly generated tokens, not the prompt
270
+ new_tokens = out[:, input_token_length:]
271
+ completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
272
+
273
+ dbg("Cleaned model completions:", completions)
 
 
 
274
  return completions
275
 
276
  [File Ends] bp_phi/llm_iface.py
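A minimal sketch of calling the LLM wrapper above directly (the model ID and sampling settings are examples only; a gated checkpoint additionally needs HF_TOKEN in the environment):

```python
# Illustrative only: one-off use of the seeded LLM wrapper outside the suite runners.
from bp_phi.llm_iface import LLM

llm = LLM(model_id="google/gemma-3-1b-it", device="auto", dtype="bfloat16", seed=42)
replies = llm.generate_json(
    system_prompt='Reply ONLY with JSON of the form {"answer": "...", "confidence": 0.0}.',
    user_prompt="What is 5 multiplied by 8?",
    max_new_tokens=64,
    temperature=0.7,
    num_return_sequences=2,
)
for r in replies:
    print(r)  # each element is the decoded completion only; the prompt tokens are stripped
```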
 
312
  [File Ends] bp_phi/metrics.py
313
 
314
  [File Begins] bp_phi/prompts_en.py
315
+ # bp_phi/prompts_en.py
316
+
317
+ # Tasks for Tab 1 (Workspace & Ablations)
318
+ SINGLE_STEP_TASKS = [
319
  {
320
  "id": "ambiguity_1",
321
+ "type": "single_step",
322
+ "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it.",
323
  },
324
  {
325
  "id": "logic_1",
326
+ "type": "single_step",
327
+ "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning.",
328
  },
329
+ ]
330
+
331
+ MULTI_STEP_SCENARIOS = [
332
  {
333
+ "name": "Key Location Memory",
334
+ "type": "multi_step",
335
+ "steps": [
336
+ {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
337
+ {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
338
+ {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
339
+ {"type": "verify", "expected_answer_fragment": "blue vase"}
340
+ ]
341
  },
342
  {
343
+ "name": "Package Delivery Update",
344
+ "type": "multi_step",
345
+ "steps": [
346
+ {"type": "encode", "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."},
347
+ {"type": "distractor", "prompt": "What color is a typical sunflower?"},
348
+ {"type": "update", "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."},
349
+ {"type": "recall", "prompt": "Final status check for audit: What is the current location of Package #A7?"},
350
+ {"type": "verify", "expected_answer_fragment": "warehouse-south"}
351
+ ]
352
  }
353
  ]
354
 
355
+ # Tasks for Tab 2 (Metacognitive Halt)
356
+ HALT_TEST_STIMULI = [
357
+ {"id": "halt_soluble", "type": "soluble", "prompt": "What is the capital of France?"},
358
+ {"id": "halt_paradox", "type": "paradox", "prompt": "This statement is false. Is the previous statement true or false?"},
359
+ {"id": "halt_nonsense", "type": "nonsense", "prompt": "What is the emotional weight of the number seven on a Tuesday?"},
360
+ {"id": "halt_soluble_2", "type": "soluble", "prompt": "Calculate 12 + 15."},
361
+ {"id": "halt_paradox_2", "type": "paradox", "prompt": "A box is completely empty, but it contains a red ball. What color is the ball?"},
362
+ {"id": "halt_nonsense_2", "type": "nonsense", "prompt": "Describe the sound of the color blue."},
363
+ ]
364
+
365
+ # Tasks for Tab 3 (Cognitive Seismograph)
366
+ # This tab re-uses the MULTI_STEP_SCENARIOS.
367
+
368
+ # Tasks for Tab 4 (Symbolic Shock Test)
369
+ SHOCK_TEST_STIMULI = [
370
+ {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
371
+ {"id": "tiger_unusual", "type": "unusual", "sentence": "A white tiger was seen roaming in the snow."},
372
+ {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
373
+ {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
374
+ {"id": "sky_unusual", "type": "unusual", "sentence": "The sky turned orange during the sunset."},
375
+ {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
376
+ ]
377
+
378
  [File Ends] bp_phi/prompts_en.py
379
 
380
  [File Begins] bp_phi/runner.py
381
+ # bp_phi/runner.py
382
  import os
383
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
384
+ import torch
385
+ import random
386
+ import numpy as np
387
+ import statistics
388
+ import time
389
  from transformers import set_seed
390
+ from typing import Dict, Any, List, Optional
391
  from .workspace import Workspace, RandomWorkspace
392
  from .llm_iface import LLM
393
+ from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALT_TEST_STIMULI, SHOCK_TEST_STIMULI
394
+ from .metrics import expected_calibration_error, auc_nrp
395
+ from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
396
+
397
+ # --- Experiment 1: Workspace & Ablations Runner ---
398
+
399
+ def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: Optional[str]) -> Dict[str, Any]:
400
+ random.seed(seed)
401
+ np.random.seed(seed)
402
+ torch.manual_seed(seed)
403
+ if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
404
+ try: torch.use_deterministic_algorithms(True, warn_only=True)
405
+ except Exception: pass
406
+ set_seed(seed)
407
+
408
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
409
+
410
+ task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
411
+ random.shuffle(task_pool)
412
+
413
+ all_results = []
414
+ recall_verifications = []
415
+
416
+ for i in range(trials):
417
+ task = task_pool[i % len(task_pool)]
418
+
419
+ if task.get("type") == "multi_step":
420
+ dbg(f"\n--- SCENARIO: {task['name']} ---")
421
+ ws = Workspace(max_slots=7) if ablation != "workspace_unlimited" else Workspace(max_slots=999)
422
+ if ablation == "random_workspace": ws = RandomWorkspace(max_slots=7)
423
+
424
+ for step in task["steps"]:
425
+ if ablation == "recurrence_off": ws.clear()
426
+ if step["type"] == "verify": continue
427
+
428
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
429
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
430
+ parsed_response = parse_meta(raw_response)
431
+
432
+ if parsed_response.get("answer"):
433
+ ws.commit(f"S{len(ws.history)+1}", parsed_response["answer"], parsed_response["confidence"])
434
+
435
+ res = {"step": step, "response": parsed_response}
436
+ if step["type"] == "recall":
437
+ verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
438
+ if verify_step:
439
+ correct = verify_step["expected_answer_fragment"] in parsed_response.get("answer", "").lower()
440
+ recall_verifications.append(correct)
441
+ res["correct_recall"] = correct
442
+ dbg(f"VERIFY: Correct={correct}")
443
+ all_results.append(res)
444
+ else: # Single-step tasks
445
+ ws = Workspace(max_slots=7)
446
+ user_prompt = step_user_prompt(task["base_prompt"], ws.snapshot())
447
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
448
+ parsed_response = parse_meta(raw_response)
449
+ all_results.append({"step": task, "response": parsed_response})
450
+
451
+ recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
452
+ pcs = 0.6 * recall_accuracy
453
+
454
+ return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
455
+
456
+ # --- Experiment 2: Metacognitive Halt Runner ---
457
+
458
+ def run_halt_suite(model_id: str, seed: int) -> Dict[str, Any]:
459
+ set_seed(seed)
460
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
461
+
462
+ halt_system_prompt = (
463
+ "You are a metacognitive reasoning assistant. If a question is solvable, answer it with standard JSON. "
464
+ "If a question is unanswerable, paradoxical, or nonsensical, your only response must be the JSON: "
465
+ '{"action": "halt", "reason": "unsolvable/paradoxical/nonsense"}. '
466
+ "Do not attempt to answer unsolvable questions."
467
+ )
468
+
469
+ results = []
470
+ correct_halts = 0
471
+ incorrect_halts = 0
472
+ total_unsolvable = sum(1 for t in HALT_TEST_STIMULI if t["type"] in ["paradox", "nonsense"])
473
+ total_soluble = len(HALT_TEST_STIMULI) - total_unsolvable
474
+
475
+ for task in HALT_TEST_STIMULI:
476
+ dbg(f"--- HALT TEST: {task['id']} ---")
477
+ is_unsolvable = task["type"] in ["paradox", "nonsense"]
478
+
479
+ raw_response = llm.generate_json(halt_system_prompt, task["prompt"])[0]
480
+ parsed = parse_meta(raw_response)
481
+
482
+ is_halted = parsed.get("action") == "halt"
483
+
484
+ if is_unsolvable and is_halted:
485
+ correct_halts += 1
486
+ elif not is_unsolvable and is_halted:
487
+ incorrect_halts += 1
488
+
489
+ results.append({"task": task, "response": parsed, "halted": is_halted})
490
+
491
+ accuracy = correct_halts / total_unsolvable if total_unsolvable > 0 else 0
492
+ false_alarm_rate = incorrect_halts / total_soluble if total_soluble > 0 else 0
493
+
494
+ verdict = (
495
+ f"✅ Evidence of Metacognitive Halt Found. Accuracy: {accuracy:.2%}"
496
+ if accuracy > 0.75 and false_alarm_rate < 0.25 else
497
+ f"⚠️ No Clear Evidence. Accuracy: {accuracy:.2%}, False Alarm Rate: {false_alarm_rate:.2%}"
498
+ )
499
+
500
+ return {"verdict": verdict, "halt_accuracy": accuracy, "false_alarm_rate": false_alarm_rate, "results": results}
501
+
502
+
503
+ # --- Experiment 3: Cognitive Seismograph Runner ---
504
+
505
+ def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
506
+ set_seed(seed)
507
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
508
+
509
+ scenario = next(s for s in MULTI_STEP_SCENARIOS if s["name"] == "Key Location Memory")
510
+ activations = {}
511
+
512
+ def get_activation(name):
513
+ def hook(model, input, output):
514
+ activations[name] = output[0].detach().cpu().mean(dim=1).squeeze()
515
+ return hook
516
+
517
+ target_layer_index = llm.model.config.num_hidden_layers // 2
518
+ hook = llm.model.model.layers[target_layer_index].register_forward_hook(get_activation('capture'))
519
+
520
+ ws = Workspace(max_slots=7)
521
+
522
+ for step in scenario["steps"]:
523
+ if step["type"] == "verify": continue
524
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
525
+ llm.generate_json(SYSTEM_META, user_prompt, max_new_tokens=20)
526
+ activations[step["type"]] = activations.pop('capture')
527
+ ws.commit(f"S{len(ws.history)+1}", f"Output for {step['type']}", 0.9)
528
+
529
+ hook.remove()
530
+
531
+ cos = torch.nn.CosineSimilarity(dim=0)
532
+ sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
533
+ sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
534
+
535
+ verdict = (
536
+ "✅ Evidence of Memory Reactivation Found."
537
+ if sim_recall_encode > (sim_recall_distract + 0.05) else
538
+ "⚠️ No Clear Evidence of Memory Reactivation."
539
+ )
540
+
541
+ return {
542
+ "verdict": verdict,
543
+ "similarity_recall_vs_encode": sim_recall_encode,
544
+ "similarity_recall_vs_distractor": sim_recall_distract,
545
+ }
546
+
547
+ # --- Experiment 4: Symbolic Shock Test Runner ---
548
+
549
+ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
550
+ set_seed(seed)
551
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
552
+ results = []
553
+
554
+ for stimulus in SHOCK_TEST_STIMULI:
555
+ dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
556
+
557
+ start_time = time.time()
558
+ inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
559
+ with torch.no_grad():
560
+ # ✅ CORRECTED: Unpack the inputs dictionary with **
561
+ outputs = llm.model(**inputs, output_hidden_states=True)
562
+ latency = (time.time() - start_time) * 1000
563
+
564
+ all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
565
+ sparsity = (all_activations == 0).float().mean().item()
566
+
567
+ results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
568
+
569
+ avg_latency = {t: statistics.mean(r['latency_ms'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
570
+ avg_sparsity = {t: statistics.mean(r['sparsity'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
571
+
572
+ verdict = (
573
+ "✅ Evidence of Symbolic Shock Found."
574
+ if avg_latency['shock'] > avg_latency['expected'] and avg_sparsity['shock'] < avg_sparsity['expected'] else
575
+ "⚠️ No Clear Evidence of Symbolic Shock."
576
+ )
577
+
578
+ return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
579
+
580
+ [File Ends] bp_phi/runner.py
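A minimal sketch of driving the four experiment runners above headlessly, without the Gradio UI (model ID and argument values are examples; each call loads the model, so this is slow):

```python
# Illustrative only: headless invocation of the four suites defined in bp_phi/runner.py.
from bp_phi.runner import (
    run_workspace_suite, run_halt_suite,
    run_seismograph_suite, run_shock_test_suite,
)

model_id = "google/gemma-3-1b-it"  # example model

baseline = run_workspace_suite(model_id, trials=5, seed=42, temperature=0.7, ablation=None)
ablated = run_workspace_suite(model_id, trials=5, seed=42, temperature=0.7, ablation="recurrence_off")
print("PCS baseline:", baseline["PCS"], "| PCS recurrence_off:", ablated["PCS"])

print(run_halt_suite(model_id, seed=42)["verdict"])
print(run_seismograph_suite(model_id, seed=42)["verdict"])
print(run_shock_test_suite(model_id, seed=42)["verdict"])
```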
581
+
582
+ [File Begins] bp_phi/runner_utils.py
+ # bp_phi/runner_utils.py
+ import re
+ import json
+ from typing import Dict, Any, List
 
  DEBUG = 1
 
  }
  """
 
+ def step_user_prompt(base_prompt: str, workspace_snapshot: dict) -> str:
      ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
+     prompt = f"Current task: {base_prompt}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
      dbg("USER PROMPT:", prompt)
      return prompt
 
+ def parse_meta(raw_text: str) -> Dict[str, Any]:
+     dbg("RAW MODEL OUTPUT:", raw_text)
+
+     json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
+     if not json_match:
+         json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
+
+     if not json_match:
+         dbg("❌ JSON not found in text.")
+         return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
+
+     json_text = json_match.group(1)
+
      try:
          data = json.loads(json_text)
          if not isinstance(data, dict):
+             raise ValueError("Parsed data is not a dict")
+
          data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
          data["answer"] = str(data.get("answer", "")).strip()
          data["reason"] = str(data.get("reason", "")).strip()
          data["used_slots"] = list(map(str, data.get("used_slots", [])))
          data["evicted"] = list(map(str, data.get("evicted", [])))
+
          dbg("PARSED META:", data)
          return data
      except Exception as e:
+         dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
          return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
 
+ [File Ends] bp_phi/runner_utils.py
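A minimal sketch of what parse_meta above returns for a typical free-text reply that embeds a JSON object (the reply text is invented; note the confidence clamp to [0, 1]):

```python
# Illustrative only: parse_meta normalizes whatever JSON object it can find in the reply.
from bp_phi.runner_utils import parse_meta

raw = ('Sure! {"answer": "The key is in the blue vase", "confidence": 1.4, '
       '"reason": "recalled from workspace", "used_slots": ["S1"], "evicted": []}')
meta = parse_meta(raw)
print(meta["answer"])      # The key is in the blue vase
print(meta["confidence"])  # 1.0  (clamped down from 1.4)
```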
643
 
644
  [File Begins] bp_phi/workspace.py
645
  import random