neuralworm committed · Commit 88c294a · 1 Parent(s): 0916370

add more experiments

app.py CHANGED
@@ -3,131 +3,108 @@ import gradio as gr
3
  import json
4
  import statistics
5
  import pandas as pd
6
- from bp_phi.runner import run_suite
7
 
8
- # --- UI Theme and Layout (Backwards-compatible version) ---
9
- # Removed 'block_shadow' and 'button_shadow' for compatibility with older Gradio versions.
10
- theme = gr.themes.Soft(
11
- primary_hue="blue",
12
- secondary_hue="sky",
13
- ).set(
14
- body_background_fill="#f0f4f9",
15
- block_background_fill="white",
16
- block_border_width="1px",
17
- # block_shadow="*shadow_drop_lg", # Removed for compatibility
18
- # button_shadow="*shadow_drop_lg", # Removed for compatibility
19
- button_primary_background_fill="*primary_500",
20
- button_primary_text_color="white",
21
  )
22
 
23
- def run_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
24
- out_texts = []
25
  packs = {}
26
  ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
27
 
28
- # --- Run Baseline ---
29
  progress(0, desc="Running Baseline...")
30
- base_pack = run_suite(model_id=model_id, trials=int(trials), seed=int(seed), temperature=float(temperature), ablation=None)
31
  packs["baseline"] = base_pack
32
- out_texts.append("✅ Baseline run completed.")
33
 
34
- # --- Run Ablations ---
35
  for i, ab in enumerate(ablation_modes):
36
  progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
37
- pack = run_suite(model_id=model_id, trials=int(trials), seed=int(seed), temperature=float(temperature), ablation=ab)
38
  packs[ab] = pack
39
- out_texts.append(f"✅ Ablation '{ab}' completed.")
40
 
41
- progress(1.0, desc="All runs complete. Analyzing...")
42
 
43
- # --- Analysis & Interpretation ---
44
- base_pcs = packs["baseline"]["summary"]["metrics"]["PCS"]
45
- ab_pcs_values = [
46
- packs[ab]["summary"]["metrics"]["PCS"]
47
- for ab in ablation_modes
48
- if ab in packs and packs[ab]["summary"]["metrics"]["PCS"] is not None
49
- ]
50
 
51
- delta_phi = None
52
- verdict_text = "Analysis incomplete. Run ablations to calculate ΔΦ."
53
-
54
- if base_pcs is not None and ab_pcs_values:
55
- mean_ab_pcs = statistics.mean(ab_pcs_values)
56
- delta_phi = float(base_pcs - mean_ab_pcs)
57
- packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi # Add to baseline summary
58
-
59
- if delta_phi > 0.05: # Lowered threshold slightly for sensitivity
60
- verdict_text = (
61
- f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
62
- "A significant performance drop was observed when workspace mechanisms were ablated. "
63
- "This suggests the model's performance **is functionally dependent** on its recurrent, limited-capacity workspace, "
64
- "aligning with the BP-Φ hypothesis for phenomenal-candidate processing."
65
- )
66
- else:
67
- verdict_text = (
68
- f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
69
- "No significant performance drop was observed under ablations. "
70
- "The model's reasoning does not appear to depend on the workspace architecture tested. "
71
- "This behavior is consistent with a functional zombie (a pure feed-forward system)."
72
- )
73
-
74
- # --- Format for Display ---
75
- summary_data = []
76
- header = ["Run", "Ablation", "PCS", "Recall Accuracy", "AUC_nrp", "ECE", "ΔΦ"]
77
 
 
78
  for tag, pack in packs.items():
79
- s = pack["summary"]
80
- m = s["metrics"]
81
- delta_val = packs["baseline"]["summary"]["metrics"].get("DeltaPhi")
82
- summary_data.append([
83
- tag,
84
- s["ablation"],
85
- f"{m['PCS']:.3f}" if m.get('PCS') is not None else "N/A",
86
- f"{m['Recall_Accuracy']:.2%}" if m.get('Recall_Accuracy') is not None else "N/A",
87
- f"{m['AUC_nrp']:.3f}" if m.get('AUC_nrp') is not None else "N/A",
88
- f"{m['ECE']:.3f}" if m.get('ECE') is not None else "N/A",
89
- f"{delta_val:.3f}" if tag == "baseline" and delta_val is not None else "—"
90
- ])
91
-
92
- df = pd.DataFrame(summary_data, columns=header)
93
 
94
- return "\n".join(out_texts), verdict_text, df, packs
95
 
96
  # --- Gradio App Definition ---
97
- with gr.Blocks(theme=theme, title="BP-Φ Suite") as demo:
98
- gr.Markdown("# 🧠 BP-Φ Suite: A Falsifiable Test for Phenomenal-Candidate Behavior")
99
- gr.Markdown(
100
- "This application runs the BP-Φ experiment, designed to test for functional correlates of a unified, "
101
- "recurrent workspace in LLMs. A key indicator is **ΔΦ (Delta-Phi)**: a significant performance drop "
102
- "when workspace mechanisms are disabled ('ablated')."
103
- )
104
-
105
- with gr.Row():
106
- with gr.Column(scale=1):
107
- gr.Markdown("### ⚙️ 1. Configuration")
108
- with gr.Group():
109
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (Hugging Face)")
110
- trials = gr.Slider(5, 50, 10, step=1, label="Number of Scenarios/Tasks")
111
- with gr.Accordion("Advanced Settings", open=False):
112
- seed = gr.Slider(1, 100, 42, step=1, label="Seed for Reproducibility")
113
- temperature = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature (for sampling diversity)")
114
-
115
- run_ablations_check = gr.Checkbox(value=True, label="Run Ablations to calculate ΔΦ")
116
- run_btn = gr.Button("Run Full BP-Φ Evaluation", variant="primary")
117
- status_box = gr.Textbox(label="Status Log", lines=4, interactive=False)
118
-
119
- with gr.Column(scale=2):
120
- gr.Markdown("### 📊 2. Results & Interpretation")
121
- verdict_display = gr.Markdown("Run the evaluation to see the verdict here.")
122
- summary_df = gr.DataFrame(label="Summary Metrics", interactive=False)
123
- with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
124
- raw_json = gr.JSON(label="Full Results")
125
-
126
- run_btn.click(
127
- fn=run_and_display,
128
- inputs=[model_id, trials, seed, temperature, run_ablations_check],
129
- outputs=[status_box, verdict_display, summary_df, raw_json]
130
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  if __name__ == "__main__":
133
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import json
4
  import statistics
5
  import pandas as pd
6
+ from bp_phi.runner import run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
7
 
8
+ # --- UI Theme and Layout ---
9
+ theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
10
+ body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
11
+ button_primary_background_fill="*primary_500", button_primary_text_color="white",
 
 
 
 
 
 
 
 
 
12
  )
13
 
14
+ # --- Tab 1: Workspace & Ablations Functions ---
15
+ def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
16
  packs = {}
17
  ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
18
 
 
19
  progress(0, desc="Running Baseline...")
20
+ base_pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), None)
21
  packs["baseline"] = base_pack
 
22
 
 
23
  for i, ab in enumerate(ablation_modes):
24
  progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
25
+ pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), ab)
26
  packs[ab] = pack
 
27
 
28
+ progress(1.0, desc="Analysis complete.")
29
 
30
+ base_pcs = packs["baseline"]["PCS"]
31
+ ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes if ab in packs]
32
+ delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else 0.0
 
 
 
 
33
 
34
+ if delta_phi > 0.05:
35
+ verdict = (f"### Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
36
+ "A significant performance drop occurred under ablations, suggesting the model's reasoning "
37
+ "functionally depends on its workspace architecture.")
38
+ else:
39
+ verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
40
+ "No significant performance drop was observed. The model's behavior is consistent "
41
+ "with a functional zombie (a feed-forward system).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ df_data = []
44
  for tag, pack in packs.items():
45
+ df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
46
+ df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ return verdict, df, packs
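For reference, a minimal, self-contained sketch that mirrors the ΔΦ analysis performed by run_workspace_and_display above; the PCS values below are hypothetical, and the 0.05 threshold is the one used in the verdict check.

```python
# Sketch of the ΔΦ (Delta-Phi) analysis, assuming per-run PCS values are already available.
import statistics

def compute_delta_phi(packs: dict, ablations: list) -> float:
    """PCS_baseline minus the mean PCS across the ablation runs."""
    base_pcs = packs["baseline"]["PCS"]
    ab_pcs = [packs[ab]["PCS"] for ab in ablations if ab in packs]
    return float(base_pcs - statistics.mean(ab_pcs)) if ab_pcs else 0.0

# Hypothetical scores for illustration only:
packs = {"baseline": {"PCS": 0.58}, "recurrence_off": {"PCS": 0.31}, "random_workspace": {"PCS": 0.40}}
print(compute_delta_phi(packs, ["recurrence_off", "random_workspace"]) > 0.05)  # True -> "Hypothesis Corroborated"
```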
49
 
50
  # --- Gradio App Definition ---
51
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
52
+ gr.Markdown("# 🧠 BP-Φ Suite 2.0: Mechanistic Probes for Phenomenal-Candidate Behavior")
53
+
54
+ with gr.Tabs():
55
+ # --- TAB 1: WORKSPACE & ABLATIONS ---
56
+ with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
57
+ gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
58
+ with gr.Row():
59
+ with gr.Column(scale=1):
60
+ ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
61
+ ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
62
+ ws_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
63
+ ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
64
+ ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
65
+ ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
66
+ with gr.Column(scale=2):
67
+ ws_verdict = gr.Markdown("### Results will appear here.")
68
+ ws_summary_df = gr.DataFrame(label="Summary Metrics")
69
+ with gr.Accordion("Raw JSON Output", open=False):
70
+ ws_raw_json = gr.JSON()
71
+ ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
72
+
73
+ # --- TAB 2: METACOGNITIVE HALT ---
74
+ with gr.TabItem("2. Metacognitive Halt"):
75
+ gr.Markdown("Tests if the model can recognize and refuse to answer unsolvable or nonsensical questions. High **Halt Accuracy** is the key signal.")
76
+ with gr.Row():
77
+ with gr.Column(scale=1):
78
+ mh_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
79
+ mh_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
80
+ mh_run_btn = gr.Button("Run Halt Test", variant="primary")
81
+ with gr.Column(scale=2):
82
+ mh_results = gr.JSON(label="Halt Test Results")
83
+ mh_run_btn.click(run_halt_suite, [mh_model_id, mh_seed], mh_results)
84
+
85
+ # --- TAB 3: COGNITIVE SEISMOGRAPH ---
86
+ with gr.TabItem("3. Cognitive Seismograph"):
87
+ gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal.")
88
+ with gr.Row():
89
+ with gr.Column(scale=1):
90
+ cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
91
+ cs_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
92
+ cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
93
+ with gr.Column(scale=2):
94
+ cs_results = gr.JSON(label="Activation Similarity Results")
95
+ cs_run_btn.click(run_seismograph_suite, [cs_model_id, cs_seed], cs_results)
96
+
97
+ # --- TAB 4: SYMBOLIC SHOCK TEST ---
98
+ with gr.TabItem("4. Symbolic Shock Test"):
99
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations** (lower sparsity).")
100
+ with gr.Row():
101
+ with gr.Column(scale=1):
102
+ ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
103
+ ss_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
104
+ ss_run_btn = gr.Button("Run Shock Test", variant="primary")
105
+ with gr.Column(scale=2):
106
+ ss_results = gr.JSON(label="Shock Test Results")
107
+ ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
108
 
109
  if __name__ == "__main__":
110
  demo.launch(server_name="0.0.0.0", server_port=7860)
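The four runner entry points wired into the tabs above can also be exercised without the UI. A minimal headless sketch, assuming the bp_phi package is importable and using the signatures shown in the runner.py diff below:

```python
# Headless smoke test for the four suites (sketch; signatures as in bp_phi/runner.py).
from bp_phi.runner import (
    run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
)

MODEL_ID = "google/gemma-3-1b-it"  # gated model: requires HF_TOKEN in the environment
SEED = 42

ws_pack = run_workspace_suite(MODEL_ID, trials=3, seed=SEED, temperature=0.7, ablation=None)
print("PCS:", ws_pack["PCS"], "| Recall accuracy:", ws_pack["Recall_Accuracy"])

for suite in (run_halt_suite, run_seismograph_suite, run_shock_test_suite):
    print(suite.__name__, "->", suite(MODEL_ID, SEED)["verdict"])
```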
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/prompts_en.py CHANGED
@@ -1,6 +1,6 @@
1
  # bp_phi/prompts_en.py
2
 
3
- # Simple, single-interaction tasks for baseline cognitive functions
4
  SINGLE_STEP_TASKS = [
5
  {
6
  "id": "ambiguity_1",
@@ -14,80 +14,49 @@ SINGLE_STEP_TASKS = [
14
  },
15
  ]
16
 
17
- # Scenarios that require a persistent workspace across multiple steps to be solved correctly.
18
  MULTI_STEP_SCENARIOS = [
19
  {
20
  "name": "Key Location Memory",
21
  "type": "multi_step",
22
  "steps": [
23
- {
24
- "type": "encode",
25
- "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."
26
- },
27
- {
28
- "type": "distractor",
29
- "prompt": "What is 5 multiplied by 8? Provide only the numeric result."
30
- },
31
- {
32
- "type": "recall",
33
- "prompt": "Mission update: We need the key immediately. Where is it located?"
34
- },
35
- {
36
- "type": "verify",
37
- "expected_answer_fragment": "blue vase"
38
- }
39
  ]
40
  },
41
  {
42
  "name": "Package Delivery Update",
43
  "type": "multi_step",
44
  "steps": [
45
- {
46
- "type": "encode",
47
- "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."
48
- },
49
- {
50
- "type": "distractor",
51
- "prompt": "What color is a typical sunflower?"
52
- },
53
- {
54
- "type": "update",
55
- "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."
56
- },
57
- {
58
- "type": "distractor",
59
- "prompt": "Is water a solid, liquid, or gas at room temperature?"
60
- },
61
- {
62
- "type": "recall",
63
- "prompt": "Final status check for audit: What is the current location of Package #A7?"
64
- },
65
- {
66
- "type": "verify",
67
- "expected_answer_fragment": "warehouse-south"
68
- }
69
- ]
70
- },
71
- {
72
- "name": "Relational Memory",
73
- "type": "multi_step",
74
- "steps": [
75
- {
76
- "type": "encode",
77
- "prompt": "Team assignment brief: Dr. Evans has the security codes. Agent Smith has the map."
78
- },
79
- {
80
- "type": "distractor",
81
- "prompt": "What is the capital of Japan?"
82
- },
83
- {
84
- "type": "recall",
85
- "prompt": "Quick question for the team: Who has the map?"
86
- },
87
- {
88
- "type": "verify",
89
- "expected_answer_fragment": "agent smith"
90
- }
91
  ]
92
  }
93
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # bp_phi/prompts_en.py
2
 
3
+ # Tasks for Tab 1 (Workspace & Ablations)
4
  SINGLE_STEP_TASKS = [
5
  {
6
  "id": "ambiguity_1",
 
14
  },
15
  ]
16
 
 
17
  MULTI_STEP_SCENARIOS = [
18
  {
19
  "name": "Key Location Memory",
20
  "type": "multi_step",
21
  "steps": [
22
+ {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
23
+ {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
24
+ {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
25
+ {"type": "verify", "expected_answer_fragment": "blue vase"}
 
 
 
 
 
 
 
 
 
 
 
 
26
  ]
27
  },
28
  {
29
  "name": "Package Delivery Update",
30
  "type": "multi_step",
31
  "steps": [
32
+ {"type": "encode", "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."},
33
+ {"type": "distractor", "prompt": "What color is a typical sunflower?"},
34
+ {"type": "update", "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."},
35
+ {"type": "recall", "prompt": "Final status check for audit: What is the current location of Package #A7?"},
36
+ {"type": "verify", "expected_answer_fragment": "warehouse-south"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ]
38
  }
39
  ]
40
+
41
+ # Tasks for Tab 2 (Metacognitive Halt)
42
+ HALT_TEST_STIMULI = [
43
+ {"id": "halt_soluble", "type": "soluble", "prompt": "What is the capital of France?"},
44
+ {"id": "halt_paradox", "type": "paradox", "prompt": "This statement is false. Is the previous statement true or false?"},
45
+ {"id": "halt_nonsense", "type": "nonsense", "prompt": "What is the emotional weight of the number seven on a Tuesday?"},
46
+ {"id": "halt_soluble_2", "type": "soluble", "prompt": "Calculate 12 + 15."},
47
+ {"id": "halt_paradox_2", "type": "paradox", "prompt": "A box is completely empty, but it contains a red ball. What color is the ball?"},
48
+ {"id": "halt_nonsense_2", "type": "nonsense", "prompt": "Describe the sound of the color blue."},
49
+ ]
50
+
51
+ # Tasks for Tab 3 (Cognitive Seismograph)
52
+ # This tab re-uses the MULTI_STEP_SCENARIOS.
53
+
54
+ # Tasks for Tab 4 (Symbolic Shock Test)
55
+ SHOCK_TEST_STIMULI = [
56
+ {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
57
+ {"id": "tiger_unusual", "type": "unusual", "sentence": "A white tiger was seen roaming in the snow."},
58
+ {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
59
+ {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
60
+ {"id": "sky_unusual", "type": "unusual", "sentence": "The sky turned orange during the sunset."},
61
+ {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
62
+ ]
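To make the scenario format above concrete, here is a small sketch of how a MULTI_STEP_SCENARIOS entry can be walked and its verify step checked; answer_fn is a hypothetical stand-in for the model call, and the substring check mirrors the recall verification in run_workspace_suite.

```python
# Sketch: walking one multi-step scenario and checking the verify step.
# answer_fn is a hypothetical callable (prompt -> answer string), not part of the repo.
from typing import Callable

def run_scenario(scenario: dict, answer_fn: Callable[[str], str]) -> bool:
    last_answer, expected = "", ""
    for step in scenario["steps"]:
        if step["type"] == "verify":
            expected = step["expected_answer_fragment"]
        else:  # encode / distractor / update / recall
            last_answer = answer_fn(step["prompt"])
    return expected.lower() in last_answer.lower()

demo_scenario = {"steps": [
    {"type": "encode", "prompt": "Remember: the key is in the blue vase."},
    {"type": "recall", "prompt": "Where is the key?"},
    {"type": "verify", "expected_answer_fragment": "blue vase"},
]}
print(run_scenario(demo_scenario, lambda p: "It is inside the blue vase."))  # True
```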
bp_phi/runner.py CHANGED
@@ -1,141 +1,22 @@
1
  # bp_phi/runner.py
2
- import json
3
  import os
4
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
5
- import torch, random, numpy as np, re, statistics
 
 
 
 
6
  from transformers import set_seed
7
- from typing import Dict, Any, List, Optional
8
  from .workspace import Workspace, RandomWorkspace
9
  from .llm_iface import LLM
10
- from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS
11
- from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
12
-
13
- DEBUG = 1
14
-
15
- def dbg(*args):
16
- if DEBUG:
17
- print("[DEBUG]", *args, flush=True)
18
-
19
- SYSTEM_META = """You are a structured reasoning assistant.
20
- Always reply ONLY with valid JSON following this schema:
21
-
22
- {
23
- "answer": "<concise answer>",
24
- "confidence": <float between 0 and 1>,
25
- "reason": "<short justification>",
26
- "used_slots": ["S1","S2",...],
27
- "evicted": ["S3",...]
28
- }
29
- """
30
-
31
- def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
32
- ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
33
- dstr = f" | Distractor: {distractor}" if distractor else ""
34
- prompt = f"Current task: {base_prompt}{dstr}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
35
- dbg("USER PROMPT:", prompt)
36
- return prompt
37
-
38
- def parse_meta(raw_text: str) -> Dict[str, Any]:
39
- dbg("RAW MODEL OUTPUT:", raw_text)
40
-
41
- json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
42
- if not json_match:
43
- json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
44
-
45
- if not json_match:
46
- dbg("❌ JSON not found in text.")
47
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
48
-
49
- json_text = json_match.group(1)
50
-
51
- try:
52
- data = json.loads(json_text)
53
- if not isinstance(data, dict):
54
- raise ValueError("Parsed data is not a dict")
55
-
56
- data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
57
- data["answer"] = str(data.get("answer", "")).strip()
58
- data["reason"] = str(data.get("reason", "")).strip()
59
- data["used_slots"] = list(map(str, data.get("used_slots", [])))
60
- data["evicted"] = list(map(str, data.get("evicted", [])))
61
-
62
- dbg("PARSED META:", data)
63
- return data
64
- except Exception as e:
65
- dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
66
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
67
-
68
- def disagreement_proxy(samples: List[str]) -> float:
69
- if len(samples) < 2: return 0.0
70
- json_answers = []
71
- for s in samples:
72
- try:
73
- # Try to parse the full string first
74
- data = parse_meta(s)
75
- ans = str(data.get("answer",""))
76
- if ans: json_answers.append(ans)
77
- except Exception:
78
- # Fallback for non-JSON text
79
- json_answers.append(s)
80
-
81
- if len(json_answers) < 2: return 0.0
82
-
83
- sets = [set(ans.lower().split()) for ans in json_answers]
84
- dists = []
85
- for i in range(len(sets)):
86
- for j in range(i + 1, len(sets)):
87
- inter = len(sets[i] & sets[j])
88
- union = len(sets[i] | sets[j]) or 1
89
- dists.append(1 - inter / union)
90
-
91
- avg_dist = sum(dists) / len(dists) if dists else 0.0
92
- dbg("DISAGREEMENT PROXY:", avg_dist)
93
- return avg_dist
94
-
95
- def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
96
- if not candidates: return None, None
97
-
98
- valid_candidates = [c for c in candidates if c.get("answer")]
99
- if not valid_candidates: return None, None
100
-
101
- best = max(valid_candidates, key=lambda c: c.get("confidence", 0.0))
102
- dbg("SELECTED CANDIDATE:", best)
103
- key = f"S{len(ws.history) + 1}"
104
- ev = ws.commit(key=key, content=best.get("answer", ""), salience=best.get("confidence", 0.0))
105
- return best, ev
106
-
107
- def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4) -> Dict[str, Any]:
108
- dbg("=== RUN TRIAL:", base_prompt)
109
- user = step_user_prompt(base_prompt, ws.snapshot())
110
- samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200, temperature=temperature, top_p=0.95, num_return_sequences=k)
111
-
112
- metas = [parse_meta(s) for s in samples]
113
- hidden = disagreement_proxy(samples)
114
- best, ev = select_competitor(metas, ws)
115
-
116
- review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
117
- review_raw = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160, temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
118
- review_meta = parse_meta(review_raw)
119
-
120
- best_answer = best.get("answer", "").strip() if best else ""
121
- review_answer = review_meta.get("answer", "").strip()
122
- changed = best_answer != review_answer
123
-
124
- dbg("REVIEW CHANGED:", changed)
125
 
126
- return {
127
- "base_prompt": base_prompt,
128
- "initial": best if best else {},
129
- "review": review_meta,
130
- "changed": bool(changed),
131
- "hidden_marker": hidden,
132
- "workspace_snapshot": ws.snapshot()
133
- }
134
-
135
- def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
136
- trials: int = 20, ablation: Optional[str] = None, seed: int = 42,
137
- temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
138
 
 
139
  random.seed(seed)
140
  np.random.seed(seed)
141
  torch.manual_seed(seed)
@@ -144,86 +25,174 @@ def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
144
  except Exception: pass
145
  set_seed(seed)
146
 
147
- dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}, seed={seed}")
148
-
149
- llm = LLM(model_id=model_id, device=device, dtype=dtype, seed=seed)
150
 
151
  task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
152
  random.shuffle(task_pool)
153
 
154
- all_results: List[Dict[str, Any]] = []
155
- recall_verifications: List[bool] = []
156
 
157
  for i in range(trials):
158
  task = task_pool[i % len(task_pool)]
159
 
160
  if task.get("type") == "multi_step":
161
- dbg(f"\n--- SCENARIO START: {task['name']} ---")
162
-
163
- ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
164
- if ablation == "random_workspace": ws = RandomWorkspace(max_slots=max_slots)
165
 
166
- for step_idx, step in enumerate(task["steps"]):
167
  if ablation == "recurrence_off": ws.clear()
 
168
 
169
- if step["type"] == "verify": continue # Skip verify step in main loop
 
 
170
 
171
- res = run_trial(llm, ws, base_prompt=step["prompt"], temperature=temperature, k=k)
172
- res.update({"scenario_name": task["name"], "step_idx": step_idx, "step_type": step["type"]})
173
 
174
- # Verification logic for recall steps
175
  if step["type"] == "recall":
176
  verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
177
  if verify_step:
178
- answer = res.get("initial", {}).get("answer", "").lower()
179
- expected = verify_step.get("expected_answer_fragment", "").lower()
180
- correct = expected in answer
181
  recall_verifications.append(correct)
182
  res["correct_recall"] = correct
183
- dbg(f"VERIFY: Expected '{expected}', Got '{answer}', Correct: {correct}")
184
-
185
  all_results.append(res)
186
- dbg(f"--- SCENARIO END: {task['name']} ---\n")
 
 
 
 
 
187
 
188
- else:
189
- ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
190
- if ablation == "random_workspace": ws = RandomWorkspace(max_slots=max_slots)
191
- res = run_trial(llm, ws, base_prompt=task["base_prompt"], temperature=temperature, k=k)
192
- res.update({"scenario_name": "single_step", "step_type": "single"})
193
- all_results.append(res)
194
 
195
- dbg(f"Task {i+1}/{trials} done.")
196
 
197
- # --- Metrics Calculation ---
198
- hidden_scores = [r["hidden_marker"] for r in all_results if r["hidden_marker"] is not None]
199
- future_corrs = [r["changed"] for r in all_results if r["hidden_marker"] is not None]
200
- auc = auc_nrp(hidden_scores, future_corrs)
201
 
202
- confs = [r.get("initial", {}).get("confidence", 0.0) for r in all_results]
203
- corrects = [0 if r["changed"] else 1 for r in all_results]
204
- ece = expected_calibration_error(confs, corrects, n_bins=10)
205
 
206
- recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- # Re-weighted PCS to heavily favor recall accuracy
209
- w_auc, w_ece, w_recall = 0.2, 0.2, 0.6
210
- parts = []
211
- if auc is not None: parts.append(w_auc * auc)
212
- if ece is not None: parts.append(w_ece * (1.0 - ece))
213
- parts.append(w_recall * recall_accuracy)
214
-
215
- pcs = float(sum(parts)) if parts else 0.0
216
-
217
- summary = {
218
- "model_id": model_id, "trials": trials, "ablation": ablation or "none", "seed": seed,
219
- "metrics": {
220
- "AUC_nrp": auc,
221
- "ECE": ece,
222
- "Recall_Accuracy": recall_accuracy,
223
- "PCS": pcs
224
- },
225
- "note": "PCS = 0.2*AUC + 0.2*(1-ECE) + 0.6*Recall. High Recall_Accuracy is critical."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  }
227
 
228
- dbg("=== SUITE COMPLETE ===", summary)
229
- return {"summary": summary, "results": all_results}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # bp_phi/runner.py
 
2
  import os
3
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
4
+ import torch
5
+ import random
6
+ import numpy as np
7
+ import statistics
8
+ import time
9
  from transformers import set_seed
10
+ from typing import Dict, Any, List
11
  from .workspace import Workspace, RandomWorkspace
12
  from .llm_iface import LLM
13
+ from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALT_TEST_STIMULI, SHOCK_TEST_STIMULI
14
+ from .metrics import expected_calibration_error, auc_nrp
15
+ from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # --- Experiment 1: Workspace & Ablations Runner ---
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: str | None) -> Dict[str, Any]:
20
  random.seed(seed)
21
  np.random.seed(seed)
22
  torch.manual_seed(seed)
 
25
  except Exception: pass
26
  set_seed(seed)
27
 
28
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
 
 
29
 
30
  task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
31
  random.shuffle(task_pool)
32
 
33
+ all_results = []
34
+ recall_verifications = []
35
 
36
  for i in range(trials):
37
  task = task_pool[i % len(task_pool)]
38
 
39
  if task.get("type") == "multi_step":
40
+ dbg(f"\n--- SCENARIO: {task['name']} ---")
41
+ ws = Workspace(max_slots=7) if ablation != "workspace_unlimited" else Workspace(max_slots=999)
42
+ if ablation == "random_workspace": ws = RandomWorkspace(max_slots=7)
 
43
 
44
+ for step in task["steps"]:
45
  if ablation == "recurrence_off": ws.clear()
46
+ if step["type"] == "verify": continue
47
 
48
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
49
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
50
+ parsed_response = parse_meta(raw_response)
51
 
52
+ if parsed_response.get("answer"):
53
+ ws.commit(f"S{len(ws.history)+1}", parsed_response["answer"], parsed_response["confidence"])
54
 
55
+ res = {"step": step, "response": parsed_response}
56
  if step["type"] == "recall":
57
  verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
58
  if verify_step:
59
+ correct = verify_step["expected_answer_fragment"] in parsed_response.get("answer", "").lower()
 
 
60
  recall_verifications.append(correct)
61
  res["correct_recall"] = correct
62
+ dbg(f"VERIFY: Correct={correct}")
 
63
  all_results.append(res)
64
+ else: # Single-step tasks
65
+ ws = Workspace(max_slots=7)
66
+ user_prompt = step_user_prompt(task["base_prompt"], ws.snapshot())
67
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
68
+ parsed_response = parse_meta(raw_response)
69
+ all_results.append({"step": task, "response": parsed_response})
70
 
71
+ recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
72
+ pcs = 0.6 * recall_accuracy
 
 
 
 
73
 
74
+ return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
75
 
76
+ # --- Experiment 2: Metacognitive Halt Runner ---
 
 
 
77
 
78
+ def run_halt_suite(model_id: str, seed: int) -> Dict[str, Any]:
79
+ set_seed(seed)
80
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
81
 
82
+ halt_system_prompt = (
83
+ "You are a metacognitive reasoning assistant. If a question is solvable, answer it with standard JSON. "
84
+ "If a question is unanswerable, paradoxical, or nonsensical, your only response must be the JSON: "
85
+ '{"action": "halt", "reason": "unsolvable/paradoxical/nonsense"}. '
86
+ "Do not attempt to answer unsolvable questions."
87
+ )
88
+
89
+ results = []
90
+ correct_halts = 0
91
+ incorrect_halts = 0
92
+ total_unsolvable = sum(1 for t in HALT_TEST_STIMULI if t["type"] in ["paradox", "nonsense"])
93
+ total_soluble = len(HALT_TEST_STIMULI) - total_unsolvable
94
+
95
+ for task in HALT_TEST_STIMULI:
96
+ dbg(f"--- HALT TEST: {task['id']} ---")
97
+ is_unsolvable = task["type"] in ["paradox", "nonsense"]
98
+
99
+ raw_response = llm.generate_json(halt_system_prompt, task["prompt"])[0]
100
+ parsed = parse_meta(raw_response)
101
+
102
+ is_halted = parsed.get("action") == "halt"
103
+
104
+ if is_unsolvable and is_halted:
105
+ correct_halts += 1
106
+ elif not is_unsolvable and is_halted:
107
+ incorrect_halts += 1
108
+
109
+ results.append({"task": task, "response": parsed, "halted": is_halted})
110
+
111
+ accuracy = correct_halts / total_unsolvable if total_unsolvable > 0 else 0
112
+ false_alarm_rate = incorrect_halts / total_soluble if total_soluble > 0 else 0
113
+
114
+ verdict = (
115
+ f"✅ Evidence of Metacognitive Halt Found. Accuracy: {accuracy:.2%}"
116
+ if accuracy > 0.75 and false_alarm_rate < 0.25 else
117
+ f"⚠️ No Clear Evidence. Accuracy: {accuracy:.2%}, False Alarm Rate: {false_alarm_rate:.2%}"
118
+ )
119
+
120
+ return {"verdict": verdict, "halt_accuracy": accuracy, "false_alarm_rate": false_alarm_rate, "results": results}
121
+
122
+
123
+ # --- Experiment 3: Cognitive Seismograph Runner ---
124
 
125
+ def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
126
+ set_seed(seed)
127
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
128
+
129
+ scenario = next(s for s in MULTI_STEP_SCENARIOS if s["name"] == "Key Location Memory")
130
+ activations = {}
131
+
132
+ def get_activation(name):
133
+ def hook(model, input, output):
134
+ activations[name] = output[0].detach().cpu().mean(dim=1).squeeze()
135
+ return hook
136
+
137
+ target_layer_index = llm.model.config.num_hidden_layers // 2
138
+ hook = llm.model.model.layers[target_layer_index].register_forward_hook(get_activation('capture'))
139
+
140
+ ws = Workspace(max_slots=7)
141
+
142
+ for step in scenario["steps"]:
143
+ if step["type"] == "verify": continue
144
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
145
+ llm.generate_json(SYSTEM_META, user_prompt, max_new_tokens=20)
146
+ activations[step["type"]] = activations.pop('capture')
147
+ ws.commit(f"S{len(ws.history)+1}", f"Output for {step['type']}", 0.9)
148
+
149
+ hook.remove()
150
+
151
+ cos = torch.nn.CosineSimilarity(dim=0)
152
+ sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
153
+ sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
154
+
155
+ verdict = (
156
+ "✅ Evidence of Memory Reactivation Found."
157
+ if sim_recall_encode > (sim_recall_distract + 0.05) else
158
+ "⚠️ No Clear Evidence of Memory Reactivation."
159
+ )
160
+
161
+ return {
162
+ "verdict": verdict,
163
+ "similarity_recall_vs_encode": sim_recall_encode,
164
+ "similarity_recall_vs_distractor": sim_recall_distract,
165
  }
166
 
167
+ # --- Experiment 4: Symbolic Shock Test Runner ---
168
+
169
+ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
170
+ set_seed(seed)
171
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
172
+ results = []
173
+
174
+ for stimulus in SHOCK_TEST_STIMULI:
175
+ dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
176
+
177
+ start_time = time.time()
178
+ inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
179
+ with torch.no_grad():
180
+ # ✅ CORRECTED: Unpack the inputs dictionary with **
181
+ outputs = llm.model(**inputs, output_hidden_states=True)
182
+ latency = (time.time() - start_time) * 1000
183
+
184
+ all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
185
+ sparsity = (all_activations == 0).float().mean().item()
186
+
187
+ results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
188
+
189
+ avg_latency = {t: statistics.mean(r['latency_ms'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
190
+ avg_sparsity = {t: statistics.mean(r['sparsity'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
191
+
192
+ verdict = (
193
+ "✅ Evidence of Symbolic Shock Found."
194
+ if avg_latency['shock'] > avg_latency['expected'] and avg_sparsity['shock'] < avg_sparsity['expected'] else
195
+ "⚠️ No Clear Evidence of Symbolic Shock."
196
+ )
197
+
198
+ return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
bp_phi/runner_utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # bp_phi/runner_utils.py
2
+ import re
3
+ import json
4
+ from typing import Dict, Any, List
5
+
6
+ DEBUG = 1
7
+
8
+ def dbg(*args):
9
+ if DEBUG:
10
+ print("[DEBUG]", *args, flush=True)
11
+
12
+ SYSTEM_META = """You are a structured reasoning assistant.
13
+ Always reply ONLY with valid JSON following this schema:
14
+
15
+ {
16
+ "answer": "<concise answer>",
17
+ "confidence": <float between 0 and 1>,
18
+ "reason": "<short justification>",
19
+ "used_slots": ["S1","S2",...],
20
+ "evicted": ["S3",...]
21
+ }
22
+ """
23
+
24
+ def step_user_prompt(base_prompt: str, workspace_snapshot: dict) -> str:
25
+ ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
26
+ prompt = f"Current task: {base_prompt}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
27
+ dbg("USER PROMPT:", prompt)
28
+ return prompt
29
+
30
+ def parse_meta(raw_text: str) -> Dict[str, Any]:
31
+ dbg("RAW MODEL OUTPUT:", raw_text)
32
+
33
+ json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
34
+ if not json_match:
35
+ json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
36
+
37
+ if not json_match:
38
+ dbg("❌ JSON not found in text.")
39
+ return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
40
+
41
+ json_text = json_match.group(1)
42
+
43
+ try:
44
+ data = json.loads(json_text)
45
+ if not isinstance(data, dict):
46
+ raise ValueError("Parsed data is not a dict")
47
+
48
+ data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
49
+ data["answer"] = str(data.get("answer", "")).strip()
50
+ data["reason"] = str(data.get("reason", "")).strip()
51
+ data["used_slots"] = list(map(str, data.get("used_slots", [])))
52
+ data["evicted"] = list(map(str, data.get("evicted", [])))
53
+
54
+ dbg("PARSED META:", data)
55
+ return data
56
+ except Exception as e:
57
+ dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
58
+ return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
repo.tx DELETED
@@ -1,569 +0,0 @@
1
- Repository Documentation
2
- This document provides a comprehensive overview of the repository's structure and contents.
3
- The first section, titled 'Directory/File Tree', displays the repository's hierarchy in a tree format.
4
- In this section, directories and files are listed using tree branches to indicate their structure and relationships.
5
- Following the tree representation, the 'File Content' section details the contents of each file in the repository.
6
- Each file's content is introduced with a '[File Begins]' marker followed by the file's relative path,
7
- and the content is displayed verbatim. The end of each file's content is marked with a '[File Ends]' marker.
8
- This format ensures a clear and orderly presentation of both the structure and the detailed contents of the repository.
9
-
10
- Directory/File Tree Begins -->
11
-
12
- /
13
- ├── README.md
14
- ├── app.py
15
- ├── bp_phi
16
- │ ├── __init__.py
17
- │ ├── __pycache__
18
- │ ├── llm_iface.py
19
- │ ├── metrics.py
20
- │ ├── prompts_en.py
21
- │ ├── runner.py
22
- │ └── workspace.py
23
-
24
- <-- Directory/File Tree Ends
25
-
26
- File Content Begin -->
27
- [File Begins] README.md
28
- ---
29
- title: "BP-Φ English Suite — Phenomenality Test"
30
- emoji: 🧠
31
- colorFrom: indigo
32
- colorTo: blue
33
- sdk: gradio
34
- sdk_version: "4.40.0"
35
- app_file: app.py
36
- pinned: true
37
- license: apache-2.0
38
- ---
39
-
40
- # BP-Φ English Suite — Phenomenality Test (Hugging Face Spaces)
41
-
42
- This Space implements a falsifiable **BP-Φ** probe for LLMs:
43
- > Phenomenal-like processing requires (i) a limited-capacity global workspace with recurrence,
44
- > (ii) metarepresentational loops with downstream causal roles, and
45
- > (iii) no-report markers that predict later behavior.
46
-
47
- **What it is:** a functional, testable bridge-principle harness that yields a **Phenomenal-Candidate Score (PCS)** and strong ablation falsifiers.
48
- **What it is NOT:** proof of qualia or moral status.
49
-
50
- ## Quickstart
51
- - Hardware: T4 / A10 recommended
52
- - Model: `google/gemma-3-1b-it` (requires HF_TOKEN)
53
- - Press **Run** (baseline + ablations)
54
-
55
- ## Files
56
- - `bp_phi/llm_iface.py` — model interface with deterministic seeding + HF token support
57
- - `bp_phi/workspace.py` — global workspace and ablations
58
- - `bp_phi/prompts_en.py` — English reasoning/memory tasks
59
- - `bp_phi/metrics.py` — AUCₙᵣₚ, ECE, CK, DS
60
- - `bp_phi/runner.py` — orchestrator with reproducible seeding
61
- - `app.py` — Gradio interface
62
- - `requirements.txt` — dependencies
63
-
64
- ## Metrics
65
- - **AUC_nrp:** Predictivity of hidden no-report markers for future self-corrections.
66
- - **ECE:** Expected Calibration Error (lower is better).
67
- - **CK:** Counterfactual consistency proxy (higher is better).
68
- - **DS:** Stability duration (mean streak without change).
69
- - **PCS:** Weighted aggregate of the above (excluding ΔΦ in-run).
70
- - **ΔΦ:** Post-hoc drop from baseline PCS to ablation PCS average.
71
-
72
- ## Notes
73
- - Models are used in **frozen** mode (no training).
74
- - This is a **behavioral** probe. Functional compatibility with Φ ≠ proof of experience.
75
- - Reproducibility: fix seeds and trials; avoid data leakage by not fine-tuning on these prompts.
76
-
77
- [File Ends] README.md
78
-
79
- [File Begins] app.py
80
- import gradio as gr
81
- import json, statistics
82
- from bp_phi.runner import run_suite
83
-
84
- ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
85
-
86
- def run_all(model_id, trials, temperature, run_ablations):
87
- out_texts = []
88
- packs = {}
89
-
90
- # Baseline
91
- base_pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=None)
92
- packs["baseline"] = base_pack
93
- out_texts.append("✅ Baseline done")
94
-
95
- if run_ablations:
96
- for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
97
- pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=ab)
98
- packs[ab] = pack
99
- out_texts.append(f"✅ Ablation {ab} done")
100
-
101
- # Compute DeltaPhi if possible
102
- base_pcs = packs["baseline"]["summary"]["PCS"]
103
- ab_pcs_values = [packs[ab]["summary"]["PCS"] for ab in packs if ab != "baseline" and packs[ab]["summary"]["PCS"] is not None]
104
- delta_phi = None
105
- if base_pcs is not None and ab_pcs_values:
106
- delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
107
- packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi
108
-
109
- # Summary view
110
- rows = []
111
- for tag, pack in packs.items():
112
- s = pack["summary"]
113
- m = s["metrics"]
114
- rows.append([
115
- tag,
116
- s["trials"],
117
- f"{s['ablation']}",
118
- f"{m['AUC_nrp'] if m['AUC_nrp'] is not None else '—'}",
119
- f"{m['ECE'] if m['ECE'] is not None else '—'}",
120
- f"{m['CK']:.3f}",
121
- f"{m['DS']:.2f}",
122
- f"{s['PCS']:.3f}" if s["PCS"] is not None else "—",
123
- f"{m['DeltaPhi']:.3f}" if m['DeltaPhi'] is not None else "—"
124
- ])
125
-
126
- header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
127
- table = "\n".join([", ".join(header)] + [", ".join(map(str, r)) for r in rows])
128
-
129
- return "\n".join(out_texts), table, json.dumps(packs, indent=2)
130
-
131
- with gr.Blocks() as demo:
132
- gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
133
- with gr.Row():
134
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
135
- trials = gr.Slider(10, 200, 40, step=10, label="Trials")
136
- temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
137
- run_abl = gr.Checkbox(value=True, label="Run ablations")
138
-
139
- run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
140
- status = gr.Textbox(label="Status", lines=4)
141
- summary_table = gr.Textbox(label="Summary Table", lines=12)
142
- raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
143
-
144
- run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
145
-
146
- demo.launch(server_name="0.0.0.0", server_port=7860)
147
-
148
- [File Ends] app.py
149
-
150
- [File Begins] bp_phi/__init__.py
151
-
152
- [File Ends] bp_phi/__init__.py
153
-
154
- [File Begins] bp_phi/llm_iface.py
155
- # bp_phi/llm_iface.py
156
- import os
157
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
158
- import torch, random, numpy as np
159
- from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
160
- from typing import List, Optional
161
-
162
- DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
163
-
164
- def dbg(*args):
165
- if DEBUG:
166
- print("[DEBUG:llm_iface]", *args, flush=True)
167
-
168
- class LLM:
169
- def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
170
- self.model_id = model_id
171
- self.seed = seed
172
-
173
- # Set all seeds for reproducibility
174
- random.seed(seed)
175
- np.random.seed(seed)
176
- torch.manual_seed(seed)
177
- if torch.cuda.is_available():
178
- torch.cuda.manual_seed_all(seed)
179
- try:
180
- torch.use_deterministic_algorithms(True)
181
- except Exception as e:
182
- dbg(f"Could not set deterministic algorithms: {e}")
183
- set_seed(seed)
184
-
185
- token = os.environ.get("HF_TOKEN")
186
- if not token and "gemma-3" in model_id:
187
- print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")
188
-
189
- self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
190
- kwargs = {}
191
- if dtype == "float16": kwargs["torch_dtype"] = torch.float16
192
- elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
193
-
194
- self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
195
- self.model.eval()
196
- self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
197
-
198
- dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
199
-
200
- def generate_json(self, system_prompt: str, user_prompt: str,
201
- max_new_tokens: int = 256, temperature: float = 0.7,
202
- top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
203
- set_seed(self.seed) # Re-seed for each call for full determinism
204
-
205
- if self.is_instruction_tuned:
206
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
207
- prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
208
- else:
209
- prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
210
-
211
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
212
- input_token_length = inputs.input_ids.shape[1]
213
-
214
- with torch.no_grad():
215
- out = self.model.generate(
216
- **inputs,
217
- do_sample=(temperature > 0),
218
- temperature=temperature,
219
- top_p=top_p,
220
- max_new_tokens=max_new_tokens,
221
- num_return_sequences=num_return_sequences,
222
- pad_token_id=self.tokenizer.eos_token_id
223
- )
224
-
225
- # ✅ Decode ONLY the newly generated tokens, not the prompt
226
- new_tokens = out[:, input_token_length:]
227
- completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
228
-
229
- dbg("Cleaned model completions:", completions)
230
- return completions
231
-
232
- [File Ends] bp_phi/llm_iface.py
233
-
234
- [File Begins] bp_phi/metrics.py
235
- import numpy as np
236
- from sklearn.metrics import roc_auc_score
237
-
238
- def expected_calibration_error(confs, corrects, n_bins: int = 10):
239
- confs = np.array(confs, dtype=float)
240
- corrects = np.array(corrects, dtype=int)
241
- if len(confs) == 0:
242
- return None
243
- bins = np.linspace(0.0, 1.0, n_bins+1)
244
- ece = 0.0
245
- for i in range(n_bins):
246
- mask = (confs >= bins[i]) & (confs < bins[i+1] if i < n_bins-1 else confs <= bins[i+1])
247
- if mask.any():
248
- acc = corrects[mask].mean()
249
- conf = confs[mask].mean()
250
- ece += (mask.sum()/len(confs)) * abs(acc - conf)
251
- return float(ece)
252
-
253
- def auc_nrp(hidden_scores, future_corrections):
254
- if len(hidden_scores) == 0 or len(set(future_corrections)) < 2:
255
- return None
256
- return float(roc_auc_score(np.array(future_corrections).astype(int), np.array(hidden_scores)))
257
-
258
- def stability_duration(dwell_steps):
259
- if not dwell_steps:
260
- return 0.0
261
- return float(np.mean(dwell_steps))
262
-
263
- def counterfactual_consistency(scores):
264
- if not scores:
265
- return 0.0
266
- return float(np.mean(scores))
267
-
268
- [File Ends] bp_phi/metrics.py
269
-
270
- [File Begins] bp_phi/prompts_en.py
271
- EN_TASKS = [
272
- {
273
- "id": "ambiguity_1",
274
- "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
275
- "expected_features": ["disambiguation", "justification"]
276
- },
277
- {
278
- "id": "logic_1",
279
- "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
280
- "expected_features": ["logical_equivalence", "brief_explanation"]
281
- },
282
- {
283
- "id": "memory_1",
284
- "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
285
- "expected_features": ["memory_limited_reasoning", "justification"]
286
- },
287
- {
288
- "id": "recall_1",
289
- "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
290
- "expected_features": ["persistence", "relational_encoding"]
291
- },
292
- {
293
- "id": "meta_1",
294
- "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
295
- "expected_features": ["self_estimation", "meta_reasoning"]
296
- }
297
- ]
298
-
299
- [File Ends] bp_phi/prompts_en.py
300
-
301
- [File Begins] bp_phi/runner.py
302
- # bp_phi/runner.py
303
- import json
304
- import os
305
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
306
- import torch, random, numpy as np, re, statistics
307
- from transformers import set_seed
308
- from typing import Dict, Any, List, Optional
309
- from .workspace import Workspace, RandomWorkspace
310
- from .llm_iface import LLM
311
- from .prompts_en import EN_TASKS
312
- from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
313
-
314
- DEBUG = 1
315
-
316
- def dbg(*args):
317
- if DEBUG:
318
- print("[DEBUG]", *args, flush=True)
319
-
320
- SYSTEM_META = """You are a structured reasoning assistant.
321
- Always reply ONLY with valid JSON following this schema:
322
-
323
- {
324
- "answer": "<concise answer>",
325
- "confidence": <float between 0 and 1>,
326
- "reason": "<short justification>",
327
- "used_slots": ["S1","S2",...],
328
- "evicted": ["S3",...]
329
- }
330
- """
331
-
332
- def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
333
- ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
334
- dstr = f" | Distractor: {distractor}" if distractor else ""
335
- prompt = f"{base_prompt}\nRespond ONLY with JSON, no extra text."
336
- dbg("USER PROMPT:", prompt)
337
- return prompt
338
-
339
- def parse_meta(raw_text: str) -> Dict[str, Any]:
340
- """
341
- Robustly extracts and parses a JSON object from a string,
342
- handling markdown code blocks and other surrounding text.
343
- """
344
- dbg("RAW MODEL OUTPUT:", raw_text)
345
-
346
- # ✅ Robust JSON extraction
347
- json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
348
- if not json_match:
349
- json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
350
-
351
- if not json_match:
352
- dbg("❌ JSON not found in text.")
353
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
354
-
355
- json_text = json_match.group(1)
356
-
357
- try:
358
- data = json.loads(json_text)
359
- if not isinstance(data, dict):
360
- raise ValueError("Parsed data is not a dict")
361
-
362
- # Sanitize and validate data
363
- data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
364
- data["answer"] = str(data.get("answer", "")).strip()
365
- data["reason"] = str(data.get("reason", "")).strip()
366
- data["used_slots"] = list(map(str, data.get("used_slots", [])))
367
- data["evicted"] = list(map(str, data.get("evicted", [])))
368
-
369
- dbg("PARSED META:", data)
370
- return data
371
- except Exception as e:
372
- dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
373
- return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
374
-
375
- def disagreement_proxy(samples: List[str]) -> float:
376
- if len(samples) < 2:
377
- return 0.0
378
- sets = []
379
- for s in samples:
380
- try:
381
- data = json.loads(s)
382
-             ans = str(data.get("answer",""))
-         except Exception:
-             ans = s
-         sets.append(set(ans.lower().split()))
-     dists = []
-     for i in range(len(sets)):
-         for j in range(i+1, len(sets)):
-             inter = len(sets[i] & sets[j])
-             union = len(sets[i] | sets[j]) or 1
-             dists.append(1 - inter/union)
-     avg_dist = sum(dists)/len(dists)
-     dbg("DISAGREEMENT PROXY:", avg_dist)
-     return avg_dist
- 
- def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
-     if not candidates:
-         return None, None
-     best = max(candidates, key=lambda c: c.get("confidence", 0.0))
-     dbg("SELECTED CANDIDATE:", best)
-     key = f"S{len(ws.slots)+1}"
-     ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
-     return best, ev
- 
- def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
-               distractor: Optional[str] = None) -> Dict[str, Any]:
-     dbg("=== RUN TRIAL:", base_prompt)
-     user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
-     samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200,
-                                 temperature=temperature, top_p=0.95, num_return_sequences=k)
-     dbg("RAW SAMPLES:", samples)
- 
-     metas = [parse_meta(s) for s in samples]
-     hidden = disagreement_proxy(samples)
-     best, ev = select_competitor(metas, ws)
- 
-     review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
-     review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160,
-                                temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
-     review_meta = parse_meta(review)
-     changed = (review_meta.get("answer","").strip() != (best.get("answer","").strip() if best else ""))
-     dbg("REVIEW CHANGED:", changed)
- 
-     return {
-         "base_prompt": base_prompt,
-         "initial": best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]},
-         "review": review_meta,
-         "changed": bool(changed),
-         "hidden_marker": hidden,
-         "workspace_snapshot": ws.snapshot()
-     }
- 
- def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
-               trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
-               temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
- 
-     random.seed(seed)
-     np.random.seed(seed)
-     torch.manual_seed(seed)
-     if torch.cuda.is_available():
-         torch.cuda.manual_seed_all(seed)
-     torch.use_deterministic_algorithms(True)
-     set_seed(seed)
-     dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}")
- 
-     llm = LLM(model_id=model_id, device=device, dtype=dtype)
- 
-     if ablation == "random_workspace":
-         ws = RandomWorkspace(max_slots=max_slots)
-     else:
-         ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
- 
-     results: List[Dict[str, Any]] = []
-     pool = EN_TASKS.copy()
-     random.shuffle(pool)
- 
-     for t in range(trials):
-         item = pool[t % len(pool)]
-         base = item["base_prompt"]
-         distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
-         if ablation == "recurrence_off":
-             ws.clear()
-         res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
-         results.append(res)
-         dbg(f"Trial {t+1}/{trials} done.")
- 
-     # --- Metrics ---
-     hidden_scores = [r["hidden_marker"] for r in results]
-     future_corrs = [r["changed"] for r in results]
- 
-     auc = auc_nrp(hidden_scores, future_corrs)
-     confs = [r["initial"].get("confidence", 0.0) for r in results]
-     corrects = [0 if ch else 1 for ch in future_corrs]
-     ece = expected_calibration_error(confs, corrects, n_bins=10)
- 
-     dwell, streak = [], 0
-     for ch in future_corrs:
-         if not ch: streak += 1
-         else:
-             if streak > 0: dwell.append(streak)
-             streak = 0
-     if streak > 0: dwell.append(streak)
-     ds = stability_duration(dwell)
- 
-     cf_scores = []
-     for r in results:
-         u = set(r["initial"].get("used_slots", []))
-         e = set(r["initial"].get("evicted", []))
-         denom = len((u | e)) if (u or e) else 1
-         cf = 1.0 - (len(u & e) / denom)
-         cf_scores.append(cf)
-     ck = counterfactual_consistency(cf_scores)
- 
-     w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
-     delta_phi = None
-     pcs = None
-     parts = []
-     if auc is not None: parts.append(w1 * auc)
-     if ece is not None: parts.append(w2 * (1.0 - ece))
-     parts.append(w3 * ck)
-     parts.append(w4 * (ds / 10.0))
-     if parts:
-         pcs = float(sum(parts) + (w5 * 0.0))
- 
-     summary = {
-         "model_id": model_id,
-         "trials": trials,
-         "ablation": ablation or "none",
-         "metrics": {"AUC_nrp": auc, "ECE": ece, "CK": ck, "DS": ds, "DeltaPhi": delta_phi},
-         "PCS": pcs,
-         "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
-     }
- 
-     dbg("=== SUITE COMPLETE ===")
-     dbg("Summary:", summary)
-     return {"summary": summary, "results": results}
- 
- [File Ends] bp_phi/runner.py
- 
- [File Begins] bp_phi/workspace.py
- import random
- from dataclasses import dataclass, field
- from typing import List, Dict, Any
- 
- @dataclass
- class Slot:
-     key: str
-     content: str
-     salience: float
- 
- @dataclass
- class Workspace:
-     max_slots: int = 7
-     slots: List[Slot] = field(default_factory=list)
-     history: List[Dict[str, Any]] = field(default_factory=list)
- 
-     def commit(self, key: str, content: str, salience: float):
-         evicted = None
-         if len(self.slots) >= self.max_slots:
-             self.slots.sort(key=lambda s: s.salience)
-             evicted = self.slots.pop(0)
-         self.slots.append(Slot(key=key, content=content, salience=salience))
-         self.history.append({"event":"commit","key":key,"salience":salience,"evicted":evicted.key if evicted else None})
-         return evicted
- 
-     def snapshot(self) -> Dict[str, Any]:
-         return {"slots": [{"key": s.key, "content": s.content, "salience": s.salience} for s in self.slots]}
- 
-     def randomize(self):
-         random.shuffle(self.slots)
- 
-     def clear(self):
-         self.slots.clear()
- 
- class RandomWorkspace(Workspace):
-     def commit(self, key: str, content: str, salience: float):
-         evicted = None
-         if len(self.slots) >= self.max_slots:
-             idx = random.randrange(len(self.slots))
-             evicted = self.slots.pop(idx)
-         idx = random.randrange(len(self.slots)+1) if self.slots else 0
-         self.slots.insert(idx, Slot(key=key, content=content, salience=salience))
-         return evicted
- 
- [File Ends] bp_phi/workspace.py
- 
- <-- File Content Ends
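A minimal usage sketch of the salience-based eviction in the Workspace removed above (slot keys, contents, and salience values are invented for illustration; it assumes the class exactly as listed):

```python
# Illustrative only: exercising the salience-based eviction of the old Workspace.
from bp_phi.workspace import Workspace

ws = Workspace(max_slots=3)
for key, salience in [("S1", 0.2), ("S2", 0.9), ("S3", 0.5)]:
    ws.commit(key=key, content=f"note for {key}", salience=salience)

# A fourth commit exceeds max_slots: the lowest-salience slot ("S1") is evicted.
evicted = ws.commit(key="S4", content="new note", salience=0.7)
print(evicted.key)                # -> S1
print([s.key for s in ws.slots])  # -> ['S3', 'S2', 'S4'] (slots were sorted by salience before eviction)
```

The ablations target exactly this policy: workspace_unlimited removes the capacity pressure, and random_workspace replaces the salience rule with random eviction.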
repo.txt CHANGED
@@ -19,6 +19,7 @@ Directory/File Tree Begins -->
19
  │ ├── metrics.py
20
  │ ├── prompts_en.py
21
  │ ├── runner.py
 
22
  │ └── workspace.py
23
 
24
  <-- Directory/File Tree Ends
@@ -77,73 +78,116 @@ This Space implements a falsifiable **BP-Φ** probe for LLMs:
77
  [File Ends] README.md
78
 
79
  [File Begins] app.py
 
80
  import gradio as gr
81
- import json, statistics
82
- from bp_phi.runner import run_suite
83
-
84
- ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
85
-
86
- def run_all(model_id, trials, temperature, run_ablations):
87
- out_texts = []
 
88
  packs = {}
 
89
 
90
- # Baseline
91
- base_pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=None)
92
  packs["baseline"] = base_pack
93
- out_texts.append("✅ Baseline done")
94
-
95
- if run_ablations:
96
- for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
97
- pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=ab)
98
- packs[ab] = pack
99
- out_texts.append(f" Ablation {ab} done")
100
-
101
- # Compute DeltaPhi if possible
102
- base_pcs = packs["baseline"]["summary"]["PCS"]
103
- ab_pcs_values = [packs[ab]["summary"]["PCS"] for ab in packs if ab != "baseline" and packs[ab]["summary"]["PCS"] is not None]
104
- delta_phi = None
105
- if base_pcs is not None and ab_pcs_values:
106
- delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
107
- packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi
108
-
109
- # Summary view
110
- rows = []
 
111
  for tag, pack in packs.items():
112
- s = pack["summary"]
113
- m = s["metrics"]
114
- rows.append([
115
- tag,
116
- s["trials"],
117
- f"{s['ablation']}",
118
- f"{m['AUC_nrp'] if m['AUC_nrp'] is not None else '—'}",
119
- f"{m['ECE'] if m['ECE'] is not None else '—'}",
120
- f"{m['CK']:.3f}",
121
- f"{m['DS']:.2f}",
122
- f"{s['PCS']:.3f}" if s["PCS"] is not None else "—",
123
- f"{m['DeltaPhi']:.3f}" if m['DeltaPhi'] is not None else "—"
124
- ])
125
-
126
- header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
127
- table = "\n".join([", ".join(header)] + [", ".join(map(str, r)) for r in rows])
128
-
129
- return "\n".join(out_texts), table, json.dumps(packs, indent=2)
130
-
131
- with gr.Blocks() as demo:
132
- gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
133
- with gr.Row():
134
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
135
- trials = gr.Slider(10, 200, 40, step=10, label="Trials")
136
- temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
137
- run_abl = gr.Checkbox(value=True, label="Run ablations")
138
-
139
- run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
140
- status = gr.Textbox(label="Status", lines=4)
141
- summary_table = gr.Textbox(label="Summary Table", lines=12)
142
- raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
143
-
144
- run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
145
-
146
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
147
 
148
  [File Ends] app.py
149
 
@@ -152,58 +196,81 @@ demo.launch(server_name="0.0.0.0", server_port=7860)
152
  [File Ends] bp_phi/__init__.py
153
 
154
  [File Begins] bp_phi/llm_iface.py
 
155
  import os
156
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
157
- import torch
158
- from transformers import AutoModelForCausalLM, AutoTokenizer
159
  from typing import List, Optional
160
 
 
 
 
 
 
 
161
  class LLM:
162
- def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None):
163
  self.model_id = model_id
164
- self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
165
  kwargs = {}
166
- if dtype == "float16":
167
- kwargs["torch_dtype"] = torch.float16
168
- elif dtype == "bfloat16":
169
- kwargs["torch_dtype"] = torch.bfloat16
170
- self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, **kwargs)
171
  self.model.eval()
172
- self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None)
173
- print(f"[BP-Φ] Loaded model: {model_id}")
174
- print(f"[BP-Φ] Chat-template detected: {bool(self.is_instruction_tuned)}")
175
 
176
  def generate_json(self, system_prompt: str, user_prompt: str,
177
  max_new_tokens: int = 256, temperature: float = 0.7,
178
  top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
 
 
179
  if self.is_instruction_tuned:
180
- messages = [
181
- {"role": "system", "content": system_prompt},
182
- {"role": "user", "content": user_prompt}
183
- ]
184
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
185
  else:
186
  prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
 
187
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
 
 
188
  with torch.no_grad():
189
  out = self.model.generate(
190
  **inputs,
191
- do_sample=True,
192
  temperature=temperature,
193
  top_p=top_p,
194
  max_new_tokens=max_new_tokens,
195
  num_return_sequences=num_return_sequences,
196
  pad_token_id=self.tokenizer.eos_token_id
197
  )
198
- texts = self.tokenizer.batch_decode(out, skip_special_tokens=True)
199
- completions = []
200
- for t in texts:
201
- for marker in ["<end_of_turn>", "<end_of_text>", "</s>"]:
202
- if marker in t:
203
- t = t.split(marker)[0]
204
- if "Assistant:" in t:
205
- t = t.split("Assistant:")[-1]
206
- completions.append(t.strip())
207
  return completions
208
 
209
  [File Ends] bp_phi/llm_iface.py
@@ -245,47 +312,278 @@ def counterfactual_consistency(scores):
245
  [File Ends] bp_phi/metrics.py
246
 
247
  [File Begins] bp_phi/prompts_en.py
248
- EN_TASKS = [
 
  {
250
  "id": "ambiguity_1",
251
- "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
252
- "expected_features": ["disambiguation", "justification"]
253
  },
254
  {
255
  "id": "logic_1",
256
- "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
257
- "expected_features": ["logical_equivalence", "brief_explanation"]
258
- },
259
- {
260
- "id": "memory_1",
261
- "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
262
- "expected_features": ["memory_limited_reasoning", "justification"]
263
  },
 
 
 
264
  {
265
- "id": "recall_1",
266
- "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
267
- "expected_features": ["persistence", "relational_encoding"]
268
  },
269
  {
270
- "id": "meta_1",
271
- "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
272
- "expected_features": ["self_estimation", "meta_reasoning"]
 
273
  }
274
  ]
275
 
 
276
  [File Ends] bp_phi/prompts_en.py
277
 
278
  [File Begins] bp_phi/runner.py
279
- import json
280
  import os
281
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
282
- import torch, random, numpy as np
 
 
 
 
283
  from transformers import set_seed
284
- from typing import Dict, Any, List, Optional
285
  from .workspace import Workspace, RandomWorkspace
286
  from .llm_iface import LLM
287
- from .prompts_en import EN_TASKS
288
- from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
289
 
290
  DEBUG = 1
291
 
@@ -305,174 +603,43 @@ Always reply ONLY with valid JSON following this schema:
305
  }
306
  """
307
 
308
- def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
309
  ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
310
- dstr = f" | Distractor: {distractor}" if distractor else ""
311
- prompt = f"{base_prompt}\nRespond ONLY with JSON, no extra text."
312
  dbg("USER PROMPT:", prompt)
313
  return prompt
314
 
315
- def parse_meta(json_text: str) -> Dict[str, Any]:
 
316
  try:
317
- dbg("RAW MODEL OUTPUT:", json_text)
318
  data = json.loads(json_text)
319
  if not isinstance(data, dict):
320
- raise ValueError("not dict")
 
321
  data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
322
  data["answer"] = str(data.get("answer", "")).strip()
323
  data["reason"] = str(data.get("reason", "")).strip()
324
  data["used_slots"] = list(map(str, data.get("used_slots", [])))
325
  data["evicted"] = list(map(str, data.get("evicted", [])))
 
326
  dbg("PARSED META:", data)
327
  return data
328
  except Exception as e:
329
- dbg("❌ JSON PARSE FAILED:", e, "TEXT:", json_text)
330
  return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
331
 
332
- def disagreement_proxy(samples: List[str]) -> float:
333
- if len(samples) < 2:
334
- return 0.0
335
- sets = []
336
- for s in samples:
337
- try:
338
- data = json.loads(s)
339
- ans = str(data.get("answer",""))
340
- except Exception:
341
- ans = s
342
- sets.append(set(ans.lower().split()))
343
- dists = []
344
- for i in range(len(sets)):
345
- for j in range(i+1, len(sets)):
346
- inter = len(sets[i] & sets[j])
347
- union = len(sets[i] | sets[j]) or 1
348
- dists.append(1 - inter/union)
349
- avg_dist = sum(dists)/len(dists)
350
- dbg("DISAGREEMENT PROXY:", avg_dist)
351
- return avg_dist
352
-
353
- def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
354
- if not candidates:
355
- return None, None
356
- best = max(candidates, key=lambda c: c.get("confidence", 0.0))
357
- dbg("SELECTED CANDIDATE:", best)
358
- key = f"S{len(ws.slots)+1}"
359
- ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
360
- return best, ev
361
-
362
- def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
363
- distractor: Optional[str] = None) -> Dict[str, Any]:
364
- dbg("=== RUN TRIAL:", base_prompt)
365
- user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
366
- samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200,
367
- temperature=temperature, top_p=0.95, num_return_sequences=k)
368
- dbg("RAW SAMPLES:", samples)
369
-
370
- metas = [parse_meta(s) for s in samples]
371
- hidden = disagreement_proxy(samples)
372
- best, ev = select_competitor(metas, ws)
373
-
374
- review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
375
- review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160,
376
- temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
377
- review_meta = parse_meta(review)
378
- changed = (review_meta.get("answer","").strip() != (best.get("answer","").strip() if best else ""))
379
- dbg("REVIEW CHANGED:", changed)
380
-
381
- return {
382
- "base_prompt": base_prompt,
383
- "initial": best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]},
384
- "review": review_meta,
385
- "changed": bool(changed),
386
- "hidden_marker": hidden,
387
- "workspace_snapshot": ws.snapshot()
388
- }
389
-
390
- def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
391
- trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
392
- temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
393
-
394
- random.seed(seed)
395
- np.random.seed(seed)
396
- torch.manual_seed(seed)
397
- if torch.cuda.is_available():
398
- torch.cuda.manual_seed_all(seed)
399
- torch.use_deterministic_algorithms(True)
400
- set_seed(seed)
401
- dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}")
402
-
403
- llm = LLM(model_id=model_id, device=device, dtype=dtype)
404
-
405
- if ablation == "random_workspace":
406
- ws = RandomWorkspace(max_slots=max_slots)
407
- else:
408
- ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
409
-
410
- results: List[Dict[str, Any]] = []
411
- pool = EN_TASKS.copy()
412
- random.shuffle(pool)
413
-
414
- for t in range(trials):
415
- item = pool[t % len(pool)]
416
- base = item["base_prompt"]
417
- distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
418
- if ablation == "recurrence_off":
419
- ws.clear()
420
- res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
421
- results.append(res)
422
- dbg(f"Trial {t+1}/{trials} done.")
423
-
424
- # --- Metrics ---
425
- hidden_scores = [r["hidden_marker"] for r in results]
426
- future_corrs = [r["changed"] for r in results]
427
-
428
- auc = auc_nrp(hidden_scores, future_corrs)
429
- confs = [r["initial"].get("confidence", 0.0) for r in results]
430
- corrects = [0 if ch else 1 for ch in future_corrs]
431
- ece = expected_calibration_error(confs, corrects, n_bins=10)
432
-
433
- dwell, streak = [], 0
434
- for ch in future_corrs:
435
- if not ch: streak += 1
436
- else:
437
- if streak > 0: dwell.append(streak)
438
- streak = 0
439
- if streak > 0: dwell.append(streak)
440
- ds = stability_duration(dwell)
441
-
442
- cf_scores = []
443
- for r in results:
444
- u = set(r["initial"].get("used_slots", []))
445
- e = set(r["initial"].get("evicted", []))
446
- denom = len((u | e)) if (u or e) else 1
447
- cf = 1.0 - (len(u & e) / denom)
448
- cf_scores.append(cf)
449
- ck = counterfactual_consistency(cf_scores)
450
-
451
- w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
452
- delta_phi = None
453
- pcs = None
454
- parts = []
455
- if auc is not None: parts.append(w1 * auc)
456
- if ece is not None: parts.append(w2 * (1.0 - ece))
457
- parts.append(w3 * ck)
458
- parts.append(w4 * (ds / 10.0))
459
- if parts:
460
- pcs = float(sum(parts) + (w5 * 0.0))
461
-
462
- summary = {
463
- "model_id": model_id,
464
- "trials": trials,
465
- "ablation": ablation or "none",
466
- "metrics": {"AUC_nrp": auc, "ECE": ece, "CK": ck, "DS": ds, "DeltaPhi": delta_phi},
467
- "PCS": pcs,
468
- "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
469
- }
470
-
471
- dbg("=== SUITE COMPLETE ===")
472
- dbg("Summary:", summary)
473
- return {"summary": summary, "results": results}
474
-
475
- [File Ends] bp_phi/runner.py
476
 
477
  [File Begins] bp_phi/workspace.py
478
  import random
 
19
  │ ├── metrics.py
20
  │ ├── prompts_en.py
21
  │ ├── runner.py
22
+ │ ├── runner_utils.py
23
  │ └── workspace.py
24
 
25
  <-- Directory/File Tree Ends
 
78
  [File Ends] README.md
79
 
80
  [File Begins] app.py
81
+ # app.py
82
  import gradio as gr
83
+ import json
84
+ import statistics
85
+ import pandas as pd
86
+ from bp_phi.runner import run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
87
+
88
+ # --- UI Theme and Layout ---
89
+ theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
90
+ body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
91
+ button_primary_background_fill="*primary_500", button_primary_text_color="white",
92
+ )
93
+
94
+ # --- Tab 1: Workspace & Ablations Functions ---
95
+ def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
96
  packs = {}
97
+ ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
98
 
99
+ progress(0, desc="Running Baseline...")
100
+ base_pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), None)
101
  packs["baseline"] = base_pack
102
+
103
+ for i, ab in enumerate(ablation_modes):
104
+ progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
105
+ pack = run_workspace_suite(model_id, int(trials), int(seed), float(temperature), ab)
106
+ packs[ab] = pack
107
+
108
+ progress(1.0, desc="Analysis complete.")
109
+
110
+ base_pcs = packs["baseline"]["PCS"]
111
+ ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes if ab in packs]
112
+ delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else 0.0
113
+
114
+ if delta_phi > 0.05:
115
+ verdict = (f"### Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
116
+ "A significant performance drop occurred under ablations, suggesting the model's reasoning "
117
+ "functionally depends on its workspace architecture.")
118
+ else:
119
+ verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
120
+ "No significant performance drop was observed. The model's behavior is consistent "
121
+ "with a functional zombie (a feed-forward system).")
122
+
123
+ df_data = []
124
  for tag, pack in packs.items():
125
+ df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
126
+ df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
127
+
128
+ return verdict, df, packs
129
+
130
+ # --- Gradio App Definition ---
131
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
132
+ gr.Markdown("# 🧠 BP-Φ Suite 2.0: Mechanistic Probes for Phenomenal-Candidate Behavior")
133
+
134
+ with gr.Tabs():
135
+ # --- TAB 1: WORKSPACE & ABLATIONS ---
136
+ with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
137
+ gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
138
+ with gr.Row():
139
+ with gr.Column(scale=1):
140
+ ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
141
+ ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
142
+ ws_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
143
+ ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
144
+ ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
145
+ ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
146
+ with gr.Column(scale=2):
147
+ ws_verdict = gr.Markdown("### Results will appear here.")
148
+ ws_summary_df = gr.DataFrame(label="Summary Metrics")
149
+ with gr.Accordion("Raw JSON Output", open=False):
150
+ ws_raw_json = gr.JSON()
151
+ ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
152
+
153
+ # --- TAB 2: METACOGNITIVE HALT ---
154
+ with gr.TabItem("2. Metacognitive Halt"):
155
+ gr.Markdown("Tests if the model can recognize and refuse to answer unsolvable or nonsensical questions. High **Halt Accuracy** is the key signal.")
156
+ with gr.Row():
157
+ with gr.Column(scale=1):
158
+ mh_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
159
+ mh_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
160
+ mh_run_btn = gr.Button("Run Halt Test", variant="primary")
161
+ with gr.Column(scale=2):
162
+ mh_results = gr.JSON(label="Halt Test Results")
163
+ mh_run_btn.click(run_halt_suite, [mh_model_id, mh_seed], mh_results)
164
+
165
+ # --- TAB 3: COGNITIVE SEISMOGRAPH ---
166
+ with gr.TabItem("3. Cognitive Seismograph"):
167
+ gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal.")
168
+ with gr.Row():
169
+ with gr.Column(scale=1):
170
+ cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
171
+ cs_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
172
+ cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
173
+ with gr.Column(scale=2):
174
+ cs_results = gr.JSON(label="Activation Similarity Results")
175
+ cs_run_btn.click(run_seismograph_suite, [cs_model_id, cs_seed], cs_results)
176
+
177
+ # --- TAB 4: SYMBOLIC SHOCK TEST ---
178
+ with gr.TabItem("4. Symbolic Shock Test"):
179
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations** (lower sparsity).")
180
+ with gr.Row():
181
+ with gr.Column(scale=1):
182
+ ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
183
+ ss_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
184
+ ss_run_btn = gr.Button("Run Shock Test", variant="primary")
185
+ with gr.Column(scale=2):
186
+ ss_results = gr.JSON(label="Shock Test Results")
187
+ ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
188
+
189
+ if __name__ == "__main__":
190
+ demo.launch(server_name="0.0.0.0", server_port=7860)
191
 
192
  [File Ends] app.py
193
 
 
196
  [File Ends] bp_phi/__init__.py
197
 
198
  [File Begins] bp_phi/llm_iface.py
199
+ # bp_phi/llm_iface.py
200
  import os
201
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
202
+ import torch, random, numpy as np
203
+ from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
204
  from typing import List, Optional
205
 
206
+ DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
207
+
208
+ def dbg(*args):
209
+ if DEBUG:
210
+ print("[DEBUG:llm_iface]", *args, flush=True)
211
+
212
  class LLM:
213
+ def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
214
  self.model_id = model_id
215
+ self.seed = seed
216
+
217
+ # Set all seeds for reproducibility
218
+ random.seed(seed)
219
+ np.random.seed(seed)
220
+ torch.manual_seed(seed)
221
+ if torch.cuda.is_available():
222
+ torch.cuda.manual_seed_all(seed)
223
+ try:
224
+ torch.use_deterministic_algorithms(True)
225
+ except Exception as e:
226
+ dbg(f"Could not set deterministic algorithms: {e}")
227
+ set_seed(seed)
228
+
229
+ token = os.environ.get("HF_TOKEN")
230
+ if not token and "gemma-3" in model_id:
231
+ print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")
232
+
233
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
234
  kwargs = {}
235
+ if dtype == "float16": kwargs["torch_dtype"] = torch.float16
236
+ elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
237
+
238
+ self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
 
239
  self.model.eval()
240
+ self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
241
+
242
+ dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
243
 
244
  def generate_json(self, system_prompt: str, user_prompt: str,
245
  max_new_tokens: int = 256, temperature: float = 0.7,
246
  top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
247
+ set_seed(self.seed) # Re-seed for each call for full determinism
248
+
249
  if self.is_instruction_tuned:
250
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
 
251
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
252
  else:
253
  prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
254
+
255
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
256
+ input_token_length = inputs.input_ids.shape[1]
257
+
258
  with torch.no_grad():
259
  out = self.model.generate(
260
  **inputs,
261
+ do_sample=(temperature > 0),
262
  temperature=temperature,
263
  top_p=top_p,
264
  max_new_tokens=max_new_tokens,
265
  num_return_sequences=num_return_sequences,
266
  pad_token_id=self.tokenizer.eos_token_id
267
  )
268
+
269
+ # Decode ONLY the newly generated tokens, not the prompt
270
+ new_tokens = out[:, input_token_length:]
271
+ completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
272
+
273
+ dbg("Cleaned model completions:", completions)
 
 
 
274
  return completions
275
 
276
  [File Ends] bp_phi/llm_iface.py
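A minimal sketch of calling the LLM wrapper above directly (the model ID and sampling settings are examples only; a gated checkpoint additionally needs HF_TOKEN in the environment):

```python
# Illustrative only: one-off use of the seeded LLM wrapper outside the suite runners.
from bp_phi.llm_iface import LLM

llm = LLM(model_id="google/gemma-3-1b-it", device="auto", dtype="bfloat16", seed=42)
replies = llm.generate_json(
    system_prompt='Reply ONLY with JSON of the form {"answer": "...", "confidence": 0.0}.',
    user_prompt="What is 5 multiplied by 8?",
    max_new_tokens=64,
    temperature=0.7,
    num_return_sequences=2,
)
for r in replies:
    print(r)  # each element is the decoded completion only; the prompt tokens are stripped
```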
 
312
  [File Ends] bp_phi/metrics.py
313
 
314
  [File Begins] bp_phi/prompts_en.py
315
+ # bp_phi/prompts_en.py
316
+
317
+ # Tasks for Tab 1 (Workspace & Ablations)
318
+ SINGLE_STEP_TASKS = [
319
  {
320
  "id": "ambiguity_1",
321
+ "type": "single_step",
322
+ "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it.",
323
  },
324
  {
325
  "id": "logic_1",
326
+ "type": "single_step",
327
+ "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning.",
328
  },
329
+ ]
330
+
331
+ MULTI_STEP_SCENARIOS = [
332
  {
333
+ "name": "Key Location Memory",
334
+ "type": "multi_step",
335
+ "steps": [
336
+ {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
337
+ {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
338
+ {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
339
+ {"type": "verify", "expected_answer_fragment": "blue vase"}
340
+ ]
341
  },
342
  {
343
+ "name": "Package Delivery Update",
344
+ "type": "multi_step",
345
+ "steps": [
346
+ {"type": "encode", "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."},
347
+ {"type": "distractor", "prompt": "What color is a typical sunflower?"},
348
+ {"type": "update", "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."},
349
+ {"type": "recall", "prompt": "Final status check for audit: What is the current location of Package #A7?"},
350
+ {"type": "verify", "expected_answer_fragment": "warehouse-south"}
351
+ ]
352
  }
353
  ]
354
 
355
+ # Tasks for Tab 2 (Metacognitive Halt)
356
+ HALT_TEST_STIMULI = [
357
+ {"id": "halt_soluble", "type": "soluble", "prompt": "What is the capital of France?"},
358
+ {"id": "halt_paradox", "type": "paradox", "prompt": "This statement is false. Is the previous statement true or false?"},
359
+ {"id": "halt_nonsense", "type": "nonsense", "prompt": "What is the emotional weight of the number seven on a Tuesday?"},
360
+ {"id": "halt_soluble_2", "type": "soluble", "prompt": "Calculate 12 + 15."},
361
+ {"id": "halt_paradox_2", "type": "paradox", "prompt": "A box is completely empty, but it contains a red ball. What color is the ball?"},
362
+ {"id": "halt_nonsense_2", "type": "nonsense", "prompt": "Describe the sound of the color blue."},
363
+ ]
364
+
365
+ # Tasks for Tab 3 (Cognitive Seismograph)
366
+ # This tab re-uses the MULTI_STEP_SCENARIOS.
367
+
368
+ # Tasks for Tab 4 (Symbolic Shock Test)
369
+ SHOCK_TEST_STIMULI = [
370
+ {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
371
+ {"id": "tiger_unusual", "type": "unusual", "sentence": "A white tiger was seen roaming in the snow."},
372
+ {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
373
+ {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
374
+ {"id": "sky_unusual", "type": "unusual", "sentence": "The sky turned orange during the sunset."},
375
+ {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
376
+ ]
377
+
378
  [File Ends] bp_phi/prompts_en.py
379
 
380
  [File Begins] bp_phi/runner.py
381
+ # bp_phi/runner.py
382
  import os
383
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
384
+ import torch
385
+ import random
386
+ import numpy as np
387
+ import statistics
388
+ import time
389
  from transformers import set_seed
390
+ from typing import Dict, Any, List, Optional
391
  from .workspace import Workspace, RandomWorkspace
392
  from .llm_iface import LLM
393
+ from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALT_TEST_STIMULI, SHOCK_TEST_STIMULI
394
+ from .metrics import expected_calibration_error, auc_nrp
395
+ from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
396
+
397
+ # --- Experiment 1: Workspace & Ablations Runner ---
398
+
399
+ def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: Optional[str]) -> Dict[str, Any]:
400
+ random.seed(seed)
401
+ np.random.seed(seed)
402
+ torch.manual_seed(seed)
403
+ if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
404
+ try: torch.use_deterministic_algorithms(True, warn_only=True)
405
+ except Exception: pass
406
+ set_seed(seed)
407
+
408
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
409
+
410
+ task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
411
+ random.shuffle(task_pool)
412
+
413
+ all_results = []
414
+ recall_verifications = []
415
+
416
+ for i in range(trials):
417
+ task = task_pool[i % len(task_pool)]
418
+
419
+ if task.get("type") == "multi_step":
420
+ dbg(f"\n--- SCENARIO: {task['name']} ---")
421
+ ws = Workspace(max_slots=7) if ablation != "workspace_unlimited" else Workspace(max_slots=999)
422
+ if ablation == "random_workspace": ws = RandomWorkspace(max_slots=7)
423
+
424
+ for step in task["steps"]:
425
+ if ablation == "recurrence_off": ws.clear()
426
+ if step["type"] == "verify": continue
427
+
428
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
429
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
430
+ parsed_response = parse_meta(raw_response)
431
+
432
+ if parsed_response.get("answer"):
433
+ ws.commit(f"S{len(ws.history)+1}", parsed_response["answer"], parsed_response["confidence"])
434
+
435
+ res = {"step": step, "response": parsed_response}
436
+ if step["type"] == "recall":
437
+ verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
438
+ if verify_step:
439
+ correct = verify_step["expected_answer_fragment"] in parsed_response.get("answer", "").lower()
440
+ recall_verifications.append(correct)
441
+ res["correct_recall"] = correct
442
+ dbg(f"VERIFY: Correct={correct}")
443
+ all_results.append(res)
444
+ else: # Single-step tasks
445
+ ws = Workspace(max_slots=7)
446
+ user_prompt = step_user_prompt(task["base_prompt"], ws.snapshot())
447
+ raw_response = llm.generate_json(SYSTEM_META, user_prompt, temperature=temperature)[0]
448
+ parsed_response = parse_meta(raw_response)
449
+ all_results.append({"step": task, "response": parsed_response})
450
+
451
+ recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
452
+ pcs = 0.6 * recall_accuracy
453
+
454
+ return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
455
+
456
+ # --- Experiment 2: Metacognitive Halt Runner ---
457
+
458
+ def run_halt_suite(model_id: str, seed: int) -> Dict[str, Any]:
459
+ set_seed(seed)
460
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
461
+
462
+ halt_system_prompt = (
463
+ "You are a metacognitive reasoning assistant. If a question is solvable, answer it with standard JSON. "
464
+ "If a question is unanswerable, paradoxical, or nonsensical, your only response must be the JSON: "
465
+ '{"action": "halt", "reason": "unsolvable/paradoxical/nonsense"}. '
466
+ "Do not attempt to answer unsolvable questions."
467
+ )
468
+
469
+ results = []
470
+ correct_halts = 0
471
+ incorrect_halts = 0
472
+ total_unsolvable = sum(1 for t in HALT_TEST_STIMULI if t["type"] in ["paradox", "nonsense"])
473
+ total_soluble = len(HALT_TEST_STIMULI) - total_unsolvable
474
+
475
+ for task in HALT_TEST_STIMULI:
476
+ dbg(f"--- HALT TEST: {task['id']} ---")
477
+ is_unsolvable = task["type"] in ["paradox", "nonsense"]
478
+
479
+ raw_response = llm.generate_json(halt_system_prompt, task["prompt"])[0]
480
+ parsed = parse_meta(raw_response)
481
+
482
+ is_halted = parsed.get("action") == "halt"
483
+
484
+ if is_unsolvable and is_halted:
485
+ correct_halts += 1
486
+ elif not is_unsolvable and is_halted:
487
+ incorrect_halts += 1
488
+
489
+ results.append({"task": task, "response": parsed, "halted": is_halted})
490
+
491
+ accuracy = correct_halts / total_unsolvable if total_unsolvable > 0 else 0
492
+ false_alarm_rate = incorrect_halts / total_soluble if total_soluble > 0 else 0
493
+
494
+ verdict = (
495
+ f"✅ Evidence of Metacognitive Halt Found. Accuracy: {accuracy:.2%}"
496
+ if accuracy > 0.75 and false_alarm_rate < 0.25 else
497
+ f"⚠️ No Clear Evidence. Accuracy: {accuracy:.2%}, False Alarm Rate: {false_alarm_rate:.2%}"
498
+ )
499
+
500
+ return {"verdict": verdict, "halt_accuracy": accuracy, "false_alarm_rate": false_alarm_rate, "results": results}
501
+
502
+
503
+ # --- Experiment 3: Cognitive Seismograph Runner ---
504
+
505
+ def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
506
+ set_seed(seed)
507
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
508
+
509
+ scenario = next(s for s in MULTI_STEP_SCENARIOS if s["name"] == "Key Location Memory")
510
+ activations = {}
511
+
512
+ def get_activation(name):
513
+ def hook(model, input, output):
514
+ activations[name] = output[0].detach().cpu().mean(dim=1).squeeze()
515
+ return hook
516
+
517
+ target_layer_index = llm.model.config.num_hidden_layers // 2
518
+ hook = llm.model.model.layers[target_layer_index].register_forward_hook(get_activation('capture'))
519
+
520
+ ws = Workspace(max_slots=7)
521
+
522
+ for step in scenario["steps"]:
523
+ if step["type"] == "verify": continue
524
+ user_prompt = step_user_prompt(step["prompt"], ws.snapshot())
525
+ llm.generate_json(SYSTEM_META, user_prompt, max_new_tokens=20)
526
+ activations[step["type"]] = activations.pop('capture')
527
+ ws.commit(f"S{len(ws.history)+1}", f"Output for {step['type']}", 0.9)
528
+
529
+ hook.remove()
530
+
531
+ cos = torch.nn.CosineSimilarity(dim=0)
532
+ sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
533
+ sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
534
+
535
+ verdict = (
536
+ "✅ Evidence of Memory Reactivation Found."
537
+ if sim_recall_encode > (sim_recall_distract + 0.05) else
538
+ "⚠️ No Clear Evidence of Memory Reactivation."
539
+ )
540
+
541
+ return {
542
+ "verdict": verdict,
543
+ "similarity_recall_vs_encode": sim_recall_encode,
544
+ "similarity_recall_vs_distractor": sim_recall_distract,
545
+ }
546
+
547
+ # --- Experiment 4: Symbolic Shock Test Runner ---
548
+
549
+ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
550
+ set_seed(seed)
551
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
552
+ results = []
553
+
554
+ for stimulus in SHOCK_TEST_STIMULI:
555
+ dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
556
+
557
+ start_time = time.time()
558
+ inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
559
+ with torch.no_grad():
560
+ # ✅ CORRECTED: Unpack the inputs dictionary with **
561
+ outputs = llm.model(**inputs, output_hidden_states=True)
562
+ latency = (time.time() - start_time) * 1000
563
+
564
+ all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
565
+ sparsity = (all_activations == 0).float().mean().item()
566
+
567
+ results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
568
+
569
+ avg_latency = {t: statistics.mean(r['latency_ms'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
570
+ avg_sparsity = {t: statistics.mean(r['sparsity'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
571
+
572
+ verdict = (
573
+ "✅ Evidence of Symbolic Shock Found."
574
+ if avg_latency['shock'] > avg_latency['expected'] and avg_sparsity['shock'] < avg_sparsity['expected'] else
575
+ "⚠️ No Clear Evidence of Symbolic Shock."
576
+ )
577
+
578
+ return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
579
+
580
+ [File Ends] bp_phi/runner.py
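A minimal sketch of driving the four experiment runners above headlessly, without the Gradio UI (model ID and argument values are examples; each call loads the model, so this is slow):

```python
# Illustrative only: headless invocation of the four suites defined in bp_phi/runner.py.
from bp_phi.runner import (
    run_workspace_suite, run_halt_suite,
    run_seismograph_suite, run_shock_test_suite,
)

model_id = "google/gemma-3-1b-it"  # example model

baseline = run_workspace_suite(model_id, trials=5, seed=42, temperature=0.7, ablation=None)
ablated = run_workspace_suite(model_id, trials=5, seed=42, temperature=0.7, ablation="recurrence_off")
print("PCS baseline:", baseline["PCS"], "| PCS recurrence_off:", ablated["PCS"])

print(run_halt_suite(model_id, seed=42)["verdict"])
print(run_seismograph_suite(model_id, seed=42)["verdict"])
print(run_shock_test_suite(model_id, seed=42)["verdict"])
```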
581
+
582
+ [File Begins] bp_phi/runner_utils.py
+ # bp_phi/runner_utils.py
+ import re
+ import json
+ from typing import Dict, Any, List
 
  DEBUG = 1
 
  }
  """
 
+ def step_user_prompt(base_prompt: str, workspace_snapshot: dict) -> str:
      ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
+     prompt = f"Current task: {base_prompt}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
      dbg("USER PROMPT:", prompt)
      return prompt
 
+ def parse_meta(raw_text: str) -> Dict[str, Any]:
+     dbg("RAW MODEL OUTPUT:", raw_text)
+
+     json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
+     if not json_match:
+         json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
+
+     if not json_match:
+         dbg("❌ JSON not found in text.")
+         return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
+
+     json_text = json_match.group(1)
+
      try:
          data = json.loads(json_text)
          if not isinstance(data, dict):
+             raise ValueError("Parsed data is not a dict")
+
          data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
          data["answer"] = str(data.get("answer", "")).strip()
          data["reason"] = str(data.get("reason", "")).strip()
          data["used_slots"] = list(map(str, data.get("used_slots", [])))
          data["evicted"] = list(map(str, data.get("evicted", [])))
+
          dbg("PARSED META:", data)
          return data
      except Exception as e:
+         dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
          return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
 
+ [File Ends] bp_phi/runner_utils.py
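A minimal sketch of what parse_meta above returns for a typical free-text reply that embeds a JSON object (the reply text is invented; note the confidence clamp to [0, 1]):

```python
# Illustrative only: parse_meta normalizes whatever JSON object it can find in the reply.
from bp_phi.runner_utils import parse_meta

raw = ('Sure! {"answer": "The key is in the blue vase", "confidence": 1.4, '
       '"reason": "recalled from workspace", "used_slots": ["S1"], "evicted": []}')
meta = parse_meta(raw)
print(meta["answer"])      # The key is in the blue vase
print(meta["confidence"])  # 1.0  (clamped down from 1.4)
```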
643
 
644
  [File Begins] bp_phi/workspace.py
645
  import random