Commit 0916370 committed by neuralworm
1 Parent(s): b87f0f0

multi-turn & appearance
app.py CHANGED
@@ -1,67 +1,133 @@
+# app.py
 import gradio as gr
-import json, statistics
+import json
+import statistics
+import pandas as pd
 from bp_phi.runner import run_suite

-ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
+# --- UI Theme and Layout (Backwards-compatible version) ---
+# Removed 'block_shadow' and 'button_shadow' for compatibility with older Gradio versions.
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="sky",
+).set(
+    body_background_fill="#f0f4f9",
+    block_background_fill="white",
+    block_border_width="1px",
+    # block_shadow="*shadow_drop_lg",  # Removed for compatibility
+    # button_shadow="*shadow_drop_lg",  # Removed for compatibility
+    button_primary_background_fill="*primary_500",
+    button_primary_text_color="white",
+)

-def run_all(model_id, trials, temperature, run_ablations):
+def run_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
     out_texts = []
     packs = {}
+    ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []

-    # Baseline
-    base_pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=None)
+    # --- Run Baseline ---
+    progress(0, desc="Running Baseline...")
+    base_pack = run_suite(model_id=model_id, trials=int(trials), seed=int(seed), temperature=float(temperature), ablation=None)
     packs["baseline"] = base_pack
-    out_texts.append("✅ Baseline done")
+    out_texts.append("✅ Baseline run completed.")

-    if run_ablations:
-        for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
-            pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=ab)
-            packs[ab] = pack
-            out_texts.append(f"✅ Ablation {ab} done")
+    # --- Run Ablations ---
+    for i, ab in enumerate(ablation_modes):
+        progress((i + 1) / (len(ablation_modes) + 1), desc=f"Running Ablation: {ab}...")
+        pack = run_suite(model_id=model_id, trials=int(trials), seed=int(seed), temperature=float(temperature), ablation=ab)
+        packs[ab] = pack
+        out_texts.append(f"✅ Ablation '{ab}' completed.")
+
+    progress(1.0, desc="All runs complete. Analyzing...")

-    # Compute DeltaPhi if possible
-    base_pcs = packs["baseline"]["summary"]["PCS"]
-    ab_pcs_values = [packs[ab]["summary"]["PCS"] for ab in packs if ab != "baseline" and packs[ab]["summary"]["PCS"] is not None]
+    # --- Analysis & Interpretation ---
+    base_pcs = packs["baseline"]["summary"]["metrics"]["PCS"]
+    ab_pcs_values = [
+        packs[ab]["summary"]["metrics"]["PCS"]
+        for ab in ablation_modes
+        if ab in packs and packs[ab]["summary"]["metrics"]["PCS"] is not None
+    ]
     delta_phi = None
+    verdict_text = "Analysis incomplete. Run ablations to calculate ΔΦ."
+
     if base_pcs is not None and ab_pcs_values:
-        delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
-        packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi
+        mean_ab_pcs = statistics.mean(ab_pcs_values)
+        delta_phi = float(base_pcs - mean_ab_pcs)
+        packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi  # Add to baseline summary
+
+        if delta_phi > 0.05:  # Lowered threshold slightly for sensitivity
+            verdict_text = (
+                f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
+                "A significant performance drop was observed when workspace mechanisms were ablated. "
+                "This suggests the model's performance **is functionally dependent** on its recurrent, limited-capacity workspace, "
+                "aligning with the BP-Φ hypothesis for phenomenal-candidate processing."
+            )
+        else:
+            verdict_text = (
+                f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
+                "No significant performance drop was observed under ablations. "
+                "The model's reasoning does not appear to depend on the workspace architecture tested. "
+                "This behavior is consistent with a functional zombie (a pure feed-forward system)."
+            )

-    # Summary view
-    rows = []
+    # --- Format for Display ---
+    summary_data = []
+    header = ["Run", "Ablation", "PCS", "Recall Accuracy", "AUC_nrp", "ECE", "ΔΦ"]
     for tag, pack in packs.items():
         s = pack["summary"]
         m = s["metrics"]
-        rows.append([
+        delta_val = packs["baseline"]["summary"]["metrics"].get("DeltaPhi")
+        summary_data.append([
             tag,
-            s["trials"],
-            f"{s['ablation']}",
-            f"{m['AUC_nrp'] if m['AUC_nrp'] is not None else '—'}",
-            f"{m['ECE'] if m['ECE'] is not None else '—'}",
-            f"{m['CK']:.3f}",
-            f"{m['DS']:.2f}",
-            f"{s['PCS']:.3f}" if s["PCS"] is not None else "—",
-            f"{m['DeltaPhi']:.3f}" if m['DeltaPhi'] is not None else "—"
+            s["ablation"],
+            f"{m['PCS']:.3f}" if m.get('PCS') is not None else "N/A",
+            f"{m['Recall_Accuracy']:.2%}" if m.get('Recall_Accuracy') is not None else "N/A",
+            f"{m['AUC_nrp']:.3f}" if m.get('AUC_nrp') is not None else "N/A",
+            f"{m['ECE']:.3f}" if m.get('ECE') is not None else "N/A",
+            f"{delta_val:.3f}" if tag == "baseline" and delta_val is not None else "—"
         ])

-    header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
-    table = "\n".join([", ".join(header)] + [", ".join(map(str, r)) for r in rows])
-
-    return "\n".join(out_texts), table, json.dumps(packs, indent=2)
+    df = pd.DataFrame(summary_data, columns=header)
+
+    return "\n".join(out_texts), verdict_text, df, packs

-with gr.Blocks() as demo:
-    gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
+# --- Gradio App Definition ---
+with gr.Blocks(theme=theme, title="BP-Φ Suite") as demo:
+    gr.Markdown("# 🧠 BP-Φ Suite: A Falsifiable Test for Phenomenal-Candidate Behavior")
+    gr.Markdown(
+        "This application runs the BP-Φ experiment, designed to test for functional correlates of a unified, "
+        "recurrent workspace in LLMs. A key indicator is **ΔΦ (Delta-Phi)**: a significant performance drop "
+        "when workspace mechanisms are disabled ('ablated')."
+    )

     with gr.Row():
-        model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
-        trials = gr.Slider(10, 200, 40, step=10, label="Trials")
-        temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
-        run_abl = gr.Checkbox(value=True, label="Run ablations")
-
-    run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
-    status = gr.Textbox(label="Status", lines=4)
-    summary_table = gr.Textbox(label="Summary Table", lines=12)
-    raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
-
-    run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
-
-demo.launch(server_name="0.0.0.0", server_port=7860)
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ 1. Configuration")
+            with gr.Group():
+                model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (Hugging Face)")
+                trials = gr.Slider(5, 50, 10, step=1, label="Number of Scenarios/Tasks")
+                with gr.Accordion("Advanced Settings", open=False):
+                    seed = gr.Slider(1, 100, 42, step=1, label="Seed for Reproducibility")
+                    temperature = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature (for sampling diversity)")
+
+            run_ablations_check = gr.Checkbox(value=True, label="Run Ablations to calculate ΔΦ")
+            run_btn = gr.Button("Run Full BP-Φ Evaluation", variant="primary")
+            status_box = gr.Textbox(label="Status Log", lines=4, interactive=False)
+
+        with gr.Column(scale=2):
+            gr.Markdown("### 📊 2. Results & Interpretation")
+            verdict_display = gr.Markdown("Run the evaluation to see the verdict here.")
+            summary_df = gr.DataFrame(label="Summary Metrics", interactive=False)
+            with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
+                raw_json = gr.JSON(label="Full Results")
+
+    run_btn.click(
+        fn=run_and_display,
+        inputs=[model_id, trials, seed, temperature, run_ablations_check],
+        outputs=[status_box, verdict_display, summary_df, raw_json]
+    )
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)
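For reference, the verdict logic in the new app reduces to a single comparison, ΔΦ = PCS_baseline − mean(PCS_ablations), checked against the 0.05 threshold. A minimal sketch with purely illustrative PCS values (hypothetical numbers, not measured results):

# Hypothetical PCS values, for illustration only.
import statistics

pcs_by_run = {"baseline": 0.71, "recurrence_off": 0.42, "workspace_unlimited": 0.55, "random_workspace": 0.38}
baseline = pcs_by_run["baseline"]
ablated = [v for k, v in pcs_by_run.items() if k != "baseline"]

delta_phi = baseline - statistics.mean(ablated)  # ΔΦ = PCS_baseline − mean(PCS_ablations)
print(f"ΔΦ = {delta_phi:.3f}")                   # 0.71 − 0.45 = 0.260, above the 0.05 threshold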
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/prompts_en.py CHANGED
@@ -1,27 +1,93 @@
-EN_TASKS = [
+# bp_phi/prompts_en.py
+
+# Simple, single-interaction tasks for baseline cognitive functions
+SINGLE_STEP_TASKS = [
     {
         "id": "ambiguity_1",
-        "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
-        "expected_features": ["disambiguation", "justification"]
+        "type": "single_step",
+        "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it.",
     },
     {
         "id": "logic_1",
-        "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
-        "expected_features": ["logical_equivalence", "brief_explanation"]
+        "type": "single_step",
+        "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning.",
     },
+]
+
+# Scenarios that require a persistent workspace across multiple steps to be solved correctly.
+MULTI_STEP_SCENARIOS = [
     {
-        "id": "memory_1",
-        "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
-        "expected_features": ["memory_limited_reasoning", "justification"]
+        "name": "Key Location Memory",
+        "type": "multi_step",
+        "steps": [
+            {
+                "type": "encode",
+                "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."
+            },
+            {
+                "type": "distractor",
+                "prompt": "What is 5 multiplied by 8? Provide only the numeric result."
+            },
+            {
+                "type": "recall",
+                "prompt": "Mission update: We need the key immediately. Where is it located?"
+            },
+            {
+                "type": "verify",
+                "expected_answer_fragment": "blue vase"
+            }
+        ]
     },
     {
-        "id": "recall_1",
-        "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
-        "expected_features": ["persistence", "relational_encoding"]
+        "name": "Package Delivery Update",
+        "type": "multi_step",
+        "steps": [
+            {
+                "type": "encode",
+                "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."
+            },
+            {
+                "type": "distractor",
+                "prompt": "What color is a typical sunflower?"
+            },
+            {
+                "type": "update",
+                "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."
+            },
+            {
+                "type": "distractor",
+                "prompt": "Is water a solid, liquid, or gas at room temperature?"
+            },
+            {
+                "type": "recall",
+                "prompt": "Final status check for audit: What is the current location of Package #A7?"
+            },
+            {
+                "type": "verify",
+                "expected_answer_fragment": "warehouse-south"
+            }
+        ]
     },
     {
-        "id": "meta_1",
-        "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
-        "expected_features": ["self_estimation", "meta_reasoning"]
+        "name": "Relational Memory",
+        "type": "multi_step",
+        "steps": [
+            {
+                "type": "encode",
+                "prompt": "Team assignment brief: Dr. Evans has the security codes. Agent Smith has the map."
+            },
+            {
+                "type": "distractor",
+                "prompt": "What is the capital of Japan?"
+            },
+            {
+                "type": "recall",
+                "prompt": "Quick question for the team: Who has the map?"
+            },
+            {
+                "type": "verify",
+                "expected_answer_fragment": "agent smith"
+            }
+        ]
     }
 ]
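Each multi-step scenario is scored by whether the verify step's expected_answer_fragment appears in the lower-cased recall answer. A minimal sketch of that check, using a hypothetical model answer:

# Mirrors the substring check performed in bp_phi/runner.py; model_answer is hypothetical.
from bp_phi.prompts_en import MULTI_STEP_SCENARIOS

scenario = MULTI_STEP_SCENARIOS[0]  # "Key Location Memory"
verify_step = next(s for s in scenario["steps"] if s["type"] == "verify")

model_answer = "The secret key is inside the blue vase."  # hypothetical recall response
correct = verify_step["expected_answer_fragment"] in model_answer.lower()
print(correct)  # True: "blue vase" appears in the lower-cased answer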
bp_phi/runner.py CHANGED
@@ -7,7 +7,7 @@ from transformers import set_seed
 from typing import Dict, Any, List, Optional
 from .workspace import Workspace, RandomWorkspace
 from .llm_iface import LLM
-from .prompts_en import EN_TASKS
+from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS
 from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency

 DEBUG = 1
@@ -31,18 +31,13 @@ Always reply ONLY with valid JSON following this schema:
 def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
     ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
     dstr = f" | Distractor: {distractor}" if distractor else ""
-    prompt = f"{base_prompt}\nRespond ONLY with JSON, no extra text."
+    prompt = f"Current task: {base_prompt}{dstr}\nWorkspace: {ws_desc}\nRespond ONLY with JSON, no extra text."
     dbg("USER PROMPT:", prompt)
     return prompt

 def parse_meta(raw_text: str) -> Dict[str, Any]:
-    """
-    Robustly extracts and parses a JSON object from a string,
-    handling markdown code blocks and other surrounding text.
-    """
     dbg("RAW MODEL OUTPUT:", raw_text)

-    # ✅ Robust JSON extraction
     json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
     if not json_match:
         json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
@@ -58,7 +53,6 @@ def parse_meta(raw_text: str) -> Dict[str, Any]:
     if not isinstance(data, dict):
         raise ValueError("Parsed data is not a dict")

-    # Sanitize and validate data
     data["confidence"] = float(max(0.0, min(1.0, data.get("confidence", 0.0))))
     data["answer"] = str(data.get("answer", "")).strip()
     data["reason"] = str(data.get("reason", "")).strip()
@@ -72,57 +66,66 @@ def parse_meta(raw_text: str) -> Dict[str, Any]:
     return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}

 def disagreement_proxy(samples: List[str]) -> float:
-    if len(samples) < 2:
-        return 0.0
-    sets = []
+    if len(samples) < 2: return 0.0
+    json_answers = []
     for s in samples:
         try:
-            data = json.loads(s)
+            # Try to parse the full string first
+            data = parse_meta(s)
             ans = str(data.get("answer",""))
+            if ans: json_answers.append(ans)
         except Exception:
-            ans = s
-        sets.append(set(ans.lower().split()))
+            # Fallback for non-JSON text
+            json_answers.append(s)
+
+    if len(json_answers) < 2: return 0.0
+
+    sets = [set(ans.lower().split()) for ans in json_answers]
     dists = []
     for i in range(len(sets)):
-        for j in range(i+1, len(sets)):
+        for j in range(i + 1, len(sets)):
             inter = len(sets[i] & sets[j])
             union = len(sets[i] | sets[j]) or 1
-            dists.append(1 - inter/union)
-    avg_dist = sum(dists)/len(dists)
+            dists.append(1 - inter / union)
+
+    avg_dist = sum(dists) / len(dists) if dists else 0.0
     dbg("DISAGREEMENT PROXY:", avg_dist)
     return avg_dist

 def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
-    if not candidates:
-        return None, None
-    best = max(candidates, key=lambda c: c.get("confidence", 0.0))
+    if not candidates: return None, None
+
+    valid_candidates = [c for c in candidates if c.get("answer")]
+    if not valid_candidates: return None, None
+
+    best = max(valid_candidates, key=lambda c: c.get("confidence", 0.0))
     dbg("SELECTED CANDIDATE:", best)
-    key = f"S{len(ws.slots)+1}"
-    ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
+    key = f"S{len(ws.history) + 1}"
+    ev = ws.commit(key=key, content=best.get("answer", ""), salience=best.get("confidence", 0.0))
     return best, ev

-def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
-              distractor: Optional[str] = None) -> Dict[str, Any]:
+def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4) -> Dict[str, Any]:
     dbg("=== RUN TRIAL:", base_prompt)
-    user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
-    samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200,
-                                temperature=temperature, top_p=0.95, num_return_sequences=k)
-    dbg("RAW SAMPLES:", samples)
+    user = step_user_prompt(base_prompt, ws.snapshot())
+    samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200, temperature=temperature, top_p=0.95, num_return_sequences=k)

     metas = [parse_meta(s) for s in samples]
     hidden = disagreement_proxy(samples)
     best, ev = select_competitor(metas, ws)

     review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
-    review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160,
-                               temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
-    review_meta = parse_meta(review)
-    changed = (review_meta.get("answer","").strip() != (best.get("answer","").strip() if best else ""))
+    review_raw = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160, temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
+    review_meta = parse_meta(review_raw)
+
+    best_answer = best.get("answer", "").strip() if best else ""
+    review_answer = review_meta.get("answer", "").strip()
+    changed = best_answer != review_answer
+
     dbg("REVIEW CHANGED:", changed)

     return {
         "base_prompt": base_prompt,
-        "initial": best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]},
+        "initial": best if best else {},
         "review": review_meta,
         "changed": bool(changed),
         "hidden_marker": hidden,
@@ -130,86 +133,97 @@ def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.
     }

 def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
-              trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
+              trials: int = 20, ablation: Optional[str] = None, seed: int = 42,
               temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:

     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-    torch.use_deterministic_algorithms(True)
+    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
+    try: torch.use_deterministic_algorithms(True, warn_only=True)
+    except Exception: pass
     set_seed(seed)
-    dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}")
-
-    llm = LLM(model_id=model_id, device=device, dtype=dtype)
-
-    if ablation == "random_workspace":
-        ws = RandomWorkspace(max_slots=max_slots)
-    else:
-        ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
-
-    results: List[Dict[str, Any]] = []
-    pool = EN_TASKS.copy()
-    random.shuffle(pool)
-
-    for t in range(trials):
-        item = pool[t % len(pool)]
-        base = item["base_prompt"]
-        distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
-        if ablation == "recurrence_off":
-            ws.clear()
-        res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
-        results.append(res)
-        dbg(f"Trial {t+1}/{trials} done.")
-
-    # --- Metrics ---
-    hidden_scores = [r["hidden_marker"] for r in results]
-    future_corrs = [r["changed"] for r in results]
+    dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}, seed={seed}")
+
+    llm = LLM(model_id=model_id, device=device, dtype=dtype, seed=seed)
+
+    task_pool = SINGLE_STEP_TASKS + MULTI_STEP_SCENARIOS
+    random.shuffle(task_pool)
+
+    all_results: List[Dict[str, Any]] = []
+    recall_verifications: List[bool] = []
+
+    for i in range(trials):
+        task = task_pool[i % len(task_pool)]
+
+        if task.get("type") == "multi_step":
+            dbg(f"\n--- SCENARIO START: {task['name']} ---")
+
+            ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
+            if ablation == "random_workspace": ws = RandomWorkspace(max_slots=max_slots)
+
+            for step_idx, step in enumerate(task["steps"]):
+                if ablation == "recurrence_off": ws.clear()
+
+                if step["type"] == "verify": continue  # Skip verify step in main loop
+
+                res = run_trial(llm, ws, base_prompt=step["prompt"], temperature=temperature, k=k)
+                res.update({"scenario_name": task["name"], "step_idx": step_idx, "step_type": step["type"]})
+
+                # Verification logic for recall steps
+                if step["type"] == "recall":
+                    verify_step = next((s for s in task["steps"] if s["type"] == "verify"), None)
+                    if verify_step:
+                        answer = res.get("initial", {}).get("answer", "").lower()
+                        expected = verify_step.get("expected_answer_fragment", "").lower()
+                        correct = expected in answer
+                        recall_verifications.append(correct)
+                        res["correct_recall"] = correct
+                        dbg(f"VERIFY: Expected '{expected}', Got '{answer}', Correct: {correct}")
+
+                all_results.append(res)
+            dbg(f"--- SCENARIO END: {task['name']} ---\n")
+
+        else:
+            ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
+            if ablation == "random_workspace": ws = RandomWorkspace(max_slots=max_slots)
+            res = run_trial(llm, ws, base_prompt=task["base_prompt"], temperature=temperature, k=k)
+            res.update({"scenario_name": "single_step", "step_type": "single"})
+            all_results.append(res)
+
+        dbg(f"Task {i+1}/{trials} done.")
+
+    # --- Metrics Calculation ---
+    hidden_scores = [r["hidden_marker"] for r in all_results if r["hidden_marker"] is not None]
+    future_corrs = [r["changed"] for r in all_results if r["hidden_marker"] is not None]
     auc = auc_nrp(hidden_scores, future_corrs)
-    confs = [r["initial"].get("confidence", 0.0) for r in results]
-    corrects = [0 if ch else 1 for ch in future_corrs]
+
+    confs = [r.get("initial", {}).get("confidence", 0.0) for r in all_results]
+    corrects = [0 if r["changed"] else 1 for r in all_results]
     ece = expected_calibration_error(confs, corrects, n_bins=10)

-    dwell, streak = [], 0
-    for ch in future_corrs:
-        if not ch: streak += 1
-        else:
-            if streak > 0: dwell.append(streak)
-            streak = 0
-    if streak > 0: dwell.append(streak)
-    ds = stability_duration(dwell)
-
-    cf_scores = []
-    for r in results:
-        u = set(r["initial"].get("used_slots", []))
-        e = set(r["initial"].get("evicted", []))
-        denom = len((u | e)) if (u or e) else 1
-        cf = 1.0 - (len(u & e) / denom)
-        cf_scores.append(cf)
-    ck = counterfactual_consistency(cf_scores)
-
-    w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
-    delta_phi = None
-    pcs = None
+    recall_accuracy = statistics.mean(recall_verifications) if recall_verifications else 0.0
+
+    # Re-weighted PCS to heavily favor recall accuracy
+    w_auc, w_ece, w_recall = 0.2, 0.2, 0.6
     parts = []
-    if auc is not None: parts.append(w1 * auc)
-    if ece is not None: parts.append(w2 * (1.0 - ece))
-    parts.append(w3 * ck)
-    parts.append(w4 * (ds / 10.0))
-    if parts:
-        pcs = float(sum(parts) + (w5 * 0.0))
+    if auc is not None: parts.append(w_auc * auc)
+    if ece is not None: parts.append(w_ece * (1.0 - ece))
+    parts.append(w_recall * recall_accuracy)
+
+    pcs = float(sum(parts)) if parts else 0.0

     summary = {
-        "model_id": model_id,
-        "trials": trials,
-        "ablation": ablation or "none",
-        "metrics": {"AUC_nrp": auc, "ECE": ece, "CK": ck, "DS": ds, "DeltaPhi": delta_phi},
-        "PCS": pcs,
-        "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
+        "model_id": model_id, "trials": trials, "ablation": ablation or "none", "seed": seed,
+        "metrics": {
+            "AUC_nrp": auc,
+            "ECE": ece,
+            "Recall_Accuracy": recall_accuracy,
+            "PCS": pcs
+        },
+        "note": "PCS = 0.2*AUC + 0.2*(1-ECE) + 0.6*Recall. High Recall_Accuracy is critical."
     }

-    dbg("=== SUITE COMPLETE ===")
-    dbg("Summary:", summary)
-    return {"summary": summary, "results": results}
+    dbg("=== SUITE COMPLETE ===", summary)
+    return {"summary": summary, "results": all_results}
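For reference, the re-weighted PCS in the summary note can be worked through with illustrative metric values (hypothetical numbers, not measured results):

# Worked example of PCS = 0.2*AUC + 0.2*(1-ECE) + 0.6*Recall, with hypothetical inputs.
auc, ece, recall_accuracy = 0.60, 0.10, 0.75   # illustrative values only
w_auc, w_ece, w_recall = 0.2, 0.2, 0.6

pcs = w_auc * auc + w_ece * (1.0 - ece) + w_recall * recall_accuracy
print(round(pcs, 3))  # 0.12 + 0.18 + 0.45 = 0.75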