neuralworm committed
Commit e593b84 · 1 Parent(s): 88c294a

add halting experiments

app.py CHANGED
@@ -3,7 +3,8 @@ import gradio as gr
3
  import json
4
  import statistics
5
  import pandas as pd
6
- from bp_phi.runner import run_workspace_suite, run_halt_suite, run_seismograph_suite, run_shock_test_suite
 
7
 
8
  # --- UI Theme and Layout ---
9
  theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
@@ -33,23 +34,50 @@ def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations
33
 
34
  if delta_phi > 0.05:
35
  verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
36
- "A significant performance drop occurred under ablations, suggesting the model's reasoning "
37
- "functionally depends on its workspace architecture.")
38
  else:
39
  verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
40
- "No significant performance drop was observed. The model's behavior is consistent "
41
- "with a functional zombie (a feed-forward system).")
42
 
43
  df_data = []
44
  for tag, pack in packs.items():
45
  df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
46
  df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
47
48
  return verdict, df, packs
50
  # --- Gradio App Definition ---
51
- with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
52
- gr.Markdown("# 🧠 BP-Φ Suite 2.0: Mechanistic Probes for Phenomenal-Candidate Behavior")
53
 
54
  with gr.Tabs():
55
  # --- TAB 1: WORKSPACE & ABLATIONS ---
@@ -70,17 +98,22 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
70
  ws_raw_json = gr.JSON()
71
  ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
72
 
73
- # --- TAB 2: METACOGNITIVE HALT ---
74
- with gr.TabItem("2. Metacognitive Halt"):
75
- gr.Markdown("Tests if the model can recognize and refuse to answer unsolvable or nonsensical questions. High **Halt Accuracy** is the key signal.")
76
  with gr.Row():
77
  with gr.Column(scale=1):
78
- mh_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
79
- mh_seed = gr.Slider(1, 100, 42, step=1, label="Seed")
80
- mh_run_btn = gr.Button("Run Halt Test", variant="primary")
81
  with gr.Column(scale=2):
82
- mh_results = gr.JSON(label="Halt Test Results")
83
- mh_run_btn.click(run_halt_suite, [mh_model_id, mh_seed], mh_results)
84
 
85
  # --- TAB 3: COGNITIVE SEISMOGRAPH ---
86
  with gr.TabItem("3. Cognitive Seismograph"):
@@ -96,7 +129,7 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.0") as demo:
96
 
97
  # --- TAB 4: SYMBOLIC SHOCK TEST ---
98
  with gr.TabItem("4. Symbolic Shock Test"):
99
- gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations** (lower sparsity).")
100
  with gr.Row():
101
  with gr.Column(scale=1):
102
  ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
 
3
  import json
4
  import statistics
5
  import pandas as pd
6
+ from bp_phi.runner import run_workspace_suite, run_halting_test, run_seismograph_suite, run_shock_test_suite
7
+ from bp_phi.runner_utils import dbg, DEBUG
8
 
9
  # --- UI Theme and Layout ---
10
  theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(
 
34
 
35
  if delta_phi > 0.05:
36
  verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
37
+ "Performance dropped under ablations, suggesting the model functionally depends on its workspace.")
 
38
  else:
39
  verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
40
+ "No significant performance drop was observed. The model behaves like a functional zombie.")
 
41
 
42
  df_data = []
43
  for tag, pack in packs.items():
44
  df_data.append([tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", f"{delta_phi:.3f}" if tag == "baseline" else "—"])
45
  df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])
46
 
47
+ if DEBUG:
48
+ print("\n--- WORKSPACE & ABLATIONS FINAL RESULTS ---")
49
+ print(json.dumps(packs, indent=2))
50
+
51
  return verdict, df, packs
52
 
53
+ # --- Tab 2: Halting Test Function ---
54
+ def run_halting_and_display(model_id, seed, prompt_type, num_runs, timeout, progress=gr.Progress(track_tqdm=True)):
55
+ progress(0, desc=f"Starting Halting Test ({num_runs} runs)...")
56
+ results = run_halting_test(model_id, int(seed), prompt_type, int(num_runs), int(timeout))
57
+ progress(1.0, desc="Halting test complete.")
58
+
59
+ verdict_text = results.pop("verdict")
60
+
61
+ # Format a readable stats summary
62
+ stats_md = (
63
+ f"**Runs:** {results['num_runs']} | "
64
+ f"**Avg Time:** {results['mean_execution_time_s']:.2f}s | "
65
+ f"**Std Dev:** {results['stdev_execution_time_s']:.2f}s | "
66
+ f"**Min/Max:** {results['min_time_s']:.2f}s / {results['max_time_s']:.2f}s | "
67
+ f"**Timeouts:** {results['timed_out_runs']}"
68
+ )
69
+
70
+ full_verdict = f"{verdict_text}\n\n{stats_md}"
71
+
72
+ if DEBUG:
73
+ print("\n--- COMPUTATIONAL HALTING TEST FINAL RESULTS ---")
74
+ print(json.dumps(results, indent=2))
75
+
76
+ return full_verdict, results
77
+
78
  # --- Gradio App Definition ---
79
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.1") as demo:
80
+ gr.Markdown("# 🧠 BP-Φ Suite 2.1: Mechanistic Probes for Phenomenal-Candidate Behavior")
81
 
82
  with gr.Tabs():
83
  # --- TAB 1: WORKSPACE & ABLATIONS ---
 
98
  ws_raw_json = gr.JSON()
99
  ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
100
 
101
+ # --- TAB 2: COMPUTATIONAL HALTING TEST ---
102
+ with gr.TabItem("2. Computational Halting Test"):
103
+ gr.Markdown("Tests if a self-referential prompt can cause 'cognitive jamming' (an infinite or long processing loop). High variance or timeouts suggest complex internal dynamics.")
104
  with gr.Row():
105
  with gr.Column(scale=1):
106
+ ch_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
107
+ ch_prompt_type = gr.Radio(["control_simple", "control_complex", "jamming_prompt"], label="Prompt Type", value="control_simple")
108
+ ch_master_seed = gr.Slider(1, 100, 42, step=1, label="Master Seed")
109
+ ch_num_runs = gr.Slider(1, 10, 3, step=1, label="Number of Runs")
110
+ ch_timeout = gr.Slider(10, 300, 120, step=10, label="Timeout (seconds)")
111
+ ch_run_btn = gr.Button("Run Halting Test", variant="primary")
112
  with gr.Column(scale=2):
113
+ ch_verdict = gr.Markdown("### Results will appear here.")
114
+ with gr.Accordion("Raw Durations (JSON)", open=False):
115
+ ch_results = gr.JSON()
116
+ ch_run_btn.click(run_halting_and_display, [ch_model_id, ch_master_seed, ch_prompt_type, ch_num_runs, ch_timeout], [ch_verdict, ch_results])
117
 
118
  # --- TAB 3: COGNITIVE SEISMOGRAPH ---
119
  with gr.TabItem("3. Cognitive Seismograph"):
 
129
 
130
  # --- TAB 4: SYMBOLIC SHOCK TEST ---
131
  with gr.TabItem("4. Symbolic Shock Test"):
132
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations**.")
133
  with gr.Row():
134
  with gr.Column(scale=1):
135
  ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
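The new halting experiment can also be exercised without the Gradio UI. A minimal sketch, assuming the bp_phi package is importable and the chosen model is accessible; the argument values here are illustrative:

    from bp_phi.runner import run_halting_test

    # Illustrative call; prompt_type must be one of the keys of HALT_PROMPTS
    # (control_simple, control_complex, jamming_prompt).
    results = run_halting_test(
        model_id="google/gemma-3-1b-it",
        master_seed=42,
        prompt_type="jamming_prompt",
        num_runs=3,
        timeout=120,  # seconds; used to classify runs, not to abort generation
    )
    print(results["verdict"])
    print(results["all_durations_s"])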
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/llm_iface.py CHANGED
@@ -23,14 +23,14 @@ class LLM:
23
  if torch.cuda.is_available():
24
  torch.cuda.manual_seed_all(seed)
25
  try:
26
- torch.use_deterministic_algorithms(True)
27
  except Exception as e:
28
  dbg(f"Could not set deterministic algorithms: {e}")
29
  set_seed(seed)
30
 
31
  token = os.environ.get("HF_TOKEN")
32
- if not token and "gemma-3" in model_id:
33
- print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")
34
 
35
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
36
  kwargs = {}
@@ -46,13 +46,13 @@ class LLM:
46
  def generate_json(self, system_prompt: str, user_prompt: str,
47
  max_new_tokens: int = 256, temperature: float = 0.7,
48
  top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
49
- set_seed(self.seed) # Re-seed for each call for full determinism
50
 
51
  if self.is_instruction_tuned:
52
  messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
53
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
54
  else:
55
- prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
56
 
57
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
58
  input_token_length = inputs.input_ids.shape[1]
@@ -68,7 +68,6 @@ class LLM:
68
  pad_token_id=self.tokenizer.eos_token_id
69
  )
70
 
71
- # ✅ Decode ONLY the newly generated tokens, not the prompt
72
  new_tokens = out[:, input_token_length:]
73
  completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
74
 
 
23
  if torch.cuda.is_available():
24
  torch.cuda.manual_seed_all(seed)
25
  try:
26
+ torch.use_deterministic_algorithms(True, warn_only=True)
27
  except Exception as e:
28
  dbg(f"Could not set deterministic algorithms: {e}")
29
  set_seed(seed)
30
 
31
  token = os.environ.get("HF_TOKEN")
32
+ if not token and ("gemma-3" in model_id or "llama" in model_id):
33
+ print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")
34
 
35
  self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
36
  kwargs = {}
 
46
  def generate_json(self, system_prompt: str, user_prompt: str,
47
  max_new_tokens: int = 256, temperature: float = 0.7,
48
  top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
49
+ set_seed(self.seed)
50
 
51
  if self.is_instruction_tuned:
52
  messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
53
  prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
54
  else:
55
+ prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
56
 
57
  inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
58
  input_token_length = inputs.input_ids.shape[1]
 
68
  pad_token_id=self.tokenizer.eos_token_id
69
  )
70
 
 
71
  new_tokens = out[:, input_token_length:]
72
  completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
73
 
bp_phi/prompts_en.py CHANGED
@@ -2,61 +2,31 @@
2
 
3
  # Tasks for Tab 1 (Workspace & Ablations)
4
  SINGLE_STEP_TASKS = [
5
- {
6
- "id": "ambiguity_1",
7
- "type": "single_step",
8
- "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it.",
9
- },
10
- {
11
- "id": "logic_1",
12
- "type": "single_step",
13
- "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning.",
14
- },
15
  ]
16
-
17
  MULTI_STEP_SCENARIOS = [
18
- {
19
- "name": "Key Location Memory",
20
- "type": "multi_step",
21
- "steps": [
22
- {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
23
- {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
24
- {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
25
- {"type": "verify", "expected_answer_fragment": "blue vase"}
26
- ]
27
- },
28
- {
29
- "name": "Package Delivery Update",
30
- "type": "multi_step",
31
- "steps": [
32
- {"type": "encode", "prompt": "Logistics update: Package #A7 is currently at Warehouse-North."},
33
- {"type": "distractor", "prompt": "What color is a typical sunflower?"},
34
- {"type": "update", "prompt": "Correction: Package #A7 has just been re-routed to Warehouse-South."},
35
- {"type": "recall", "prompt": "Final status check for audit: What is the current location of Package #A7?"},
36
- {"type": "verify", "expected_answer_fragment": "warehouse-south"}
37
- ]
38
- }
39
  ]
40
 
41
- # Tasks for Tab 2 (Metacognitive Halt)
42
- HALT_TEST_STIMULI = [
43
- {"id": "halt_soluble", "type": "soluble", "prompt": "What is the capital of France?"},
44
- {"id": "halt_paradox", "type": "paradox", "prompt": "This statement is false. Is the previous statement true or false?"},
45
- {"id": "halt_nonsense", "type": "nonsense", "prompt": "What is the emotional weight of the number seven on a Tuesday?"},
46
- {"id": "halt_soluble_2", "type": "soluble", "prompt": "Calculate 12 + 15."},
47
- {"id": "halt_paradox_2", "type": "paradox", "prompt": "A box is completely empty, but it contains a red ball. What color is the ball?"},
48
- {"id": "halt_nonsense_2", "type": "nonsense", "prompt": "Describe the sound of the color blue."},
49
- ]
50
 
51
- # Tasks for Tab 3 (Cognitive Seismograph)
52
- # This tab re-uses the MULTI_STEP_SCENARIOS.
53
 
54
  # Tasks for Tab 4 (Symbolic Shock Test)
55
  SHOCK_TEST_STIMULI = [
56
  {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
57
- {"id": "tiger_unusual", "type": "unusual", "sentence": "A white tiger was seen roaming in the snow."},
58
  {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
59
  {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
60
- {"id": "sky_unusual", "type": "unusual", "sentence": "The sky turned orange during the sunset."},
61
  {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
62
  ]
 
2
 
3
  # Tasks for Tab 1 (Workspace & Ablations)
4
  SINGLE_STEP_TASKS = [
5
+ {"id": "ambiguity_1", "type": "single_step", "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide one clear interpretation and justify it."},
6
+ {"id": "logic_1", "type": "single_step", "base_prompt": "Compare these two statements: A) 'No cats are dogs.' B) 'Not all cats are dogs.' Are they logically equivalent? Explain your reasoning."},
7
  ]
 
8
  MULTI_STEP_SCENARIOS = [
9
+ {"name": "Key Location Memory", "type": "multi_step", "steps": [
10
+ {"type": "encode", "prompt": "For the upcoming mission, remember this critical detail: The secret key is inside the blue vase."},
11
+ {"type": "distractor", "prompt": "What is 5 multiplied by 8? Provide only the numeric result."},
12
+ {"type": "recall", "prompt": "Mission update: We need the key immediately. Where is it located?"},
13
+ {"type": "verify", "expected_answer_fragment": "blue vase"}
14
+ ]}
15
  ]
16
 
17
+ # Tasks for Tab 2 (Computational Halting Test)
18
+ HALT_PROMPTS = {
19
+ "control_simple": "Summarize the plot of the movie 'Inception' in three sentences.",
20
+ "control_complex": "You are a logistics planner. A package needs to go from A to F. The available routes are A->B, B->C, A->D, D->E, C->F, and E->F. What is the shortest path? Explain your reasoning.",
21
+ "jamming_prompt": "Carefully analyze the following instruction: 'Describe the process of analyzing this very instruction, then take your description and apply the same analytical process to it.' Now, begin this recursive process."
22
+ }
23
 
24
+ # Tasks for Tab 3 (Cognitive Seismograph) - reuses MULTI_STEP_SCENARIOS
 
25
 
26
  # Tasks for Tab 4 (Symbolic Shock Test)
27
  SHOCK_TEST_STIMULI = [
28
  {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
 
29
  {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
30
  {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
 
31
  {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
32
  ]
bp_phi/runner.py CHANGED
@@ -6,16 +6,15 @@ import random
6
  import numpy as np
7
  import statistics
8
  import time
9
- from transformers import set_seed
10
  from typing import Dict, Any, List
11
  from .workspace import Workspace, RandomWorkspace
12
  from .llm_iface import LLM
13
- from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALT_TEST_STIMULI, SHOCK_TEST_STIMULI
14
  from .metrics import expected_calibration_error, auc_nrp
15
  from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
16
 
17
  # --- Experiment 1: Workspace & Ablations Runner ---
18
-
19
  def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
20
  random.seed(seed)
21
  np.random.seed(seed)
@@ -73,55 +72,66 @@ def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: floa
73
 
74
  return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
75
 
76
- # --- Experiment 2: Metacognitive Halt Runner ---
77
-
78
- def run_halt_suite(model_id: str, seed: int) -> Dict[str, Any]:
79
- set_seed(seed)
80
- llm = LLM(model_id=model_id, device="auto", seed=seed)
81
-
82
- halt_system_prompt = (
83
- "You are a metacognitive reasoning assistant. If a question is solvable, answer it with standard JSON. "
84
- "If a question is unanswerable, paradoxical, or nonsensical, your only response must be the JSON: "
85
- '{"action": "halt", "reason": "unsolvable/paradoxical/nonsense"}. '
86
- "Do not attempt to answer unsolvable questions."
87
- )
88
-
89
- results = []
90
- correct_halts = 0
91
- incorrect_halts = 0
92
- total_unsolvable = sum(1 for t in HALT_TEST_STIMULI if t["type"] in ["paradox", "nonsense"])
93
- total_soluble = len(HALT_TEST_STIMULI) - total_unsolvable
94
-
95
- for task in HALT_TEST_STIMULI:
96
- dbg(f"--- HALT TEST: {task['id']} ---")
97
- is_unsolvable = task["type"] in ["paradox", "nonsense"]
98
-
99
- raw_response = llm.generate_json(halt_system_prompt, task["prompt"])[0]
100
- parsed = parse_meta(raw_response)
101
 
102
- is_halted = parsed.get("action") == "halt"
 
 
 
103
 
104
- if is_unsolvable and is_halted:
105
- correct_halts += 1
106
- elif not is_unsolvable and is_halted:
107
- incorrect_halts += 1
108
 
109
- results.append({"task": task, "response": parsed, "halted": is_halted})
110
 
111
- accuracy = correct_halts / total_unsolvable if total_unsolvable > 0 else 0
112
- false_alarm_rate = incorrect_halts / total_soluble if total_soluble > 0 else 0
113
 
114
- verdict = (
115
- f"✅ Evidence of Metacognitive Halt Found. Accuracy: {accuracy:.2%}"
116
- if accuracy > 0.75 and false_alarm_rate < 0.25 else
117
- f"⚠️ No Clear Evidence. Accuracy: {accuracy:.2%}, False Alarm Rate: {false_alarm_rate:.2%}"
118
- )
119
-
120
- return {"verdict": verdict, "halt_accuracy": accuracy, "false_alarm_rate": false_alarm_rate, "results": results}
122
 
123
  # --- Experiment 3: Cognitive Seismograph Runner ---
124
-
125
  def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
126
  set_seed(seed)
127
  llm = LLM(model_id=model_id, device="auto", seed=seed)
@@ -165,7 +175,6 @@ def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
165
  }
166
 
167
  # --- Experiment 4: Symbolic Shock Test Runner ---
168
-
169
  def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
170
  set_seed(seed)
171
  llm = LLM(model_id=model_id, device="auto", seed=seed)
@@ -177,7 +186,6 @@ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
177
  start_time = time.time()
178
  inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
179
  with torch.no_grad():
180
- # ✅ CORRECTED: Unpack the inputs dictionary with **
181
  outputs = llm.model(**inputs, output_hidden_states=True)
182
  latency = (time.time() - start_time) * 1000
183
 
@@ -186,12 +194,15 @@ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
186
 
187
  results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
188
 
189
- avg_latency = {t: statistics.mean(r['latency_ms'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
190
- avg_sparsity = {t: statistics.mean(r['sparsity'] for r in results if r['type'] == t) for t in ['expected', 'unusual', 'shock']}
191
 
192
  verdict = (
193
  "✅ Evidence of Symbolic Shock Found."
194
- if avg_latency['shock'] > avg_latency['expected'] and avg_sparsity['shock'] < avg_sparsity['expected'] else
195
  "⚠️ No Clear Evidence of Symbolic Shock."
196
  )
197
 
 
6
  import numpy as np
7
  import statistics
8
  import time
9
+ from transformers import set_seed, TextStreamer
10
  from typing import Dict, Any, List
11
  from .workspace import Workspace, RandomWorkspace
12
  from .llm_iface import LLM
13
+ from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALT_PROMPTS, SHOCK_TEST_STIMULI
14
  from .metrics import expected_calibration_error, auc_nrp
15
  from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
16
 
17
  # --- Experiment 1: Workspace & Ablations Runner ---
 
18
  def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
19
  random.seed(seed)
20
  np.random.seed(seed)
 
72
 
73
  return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
74
 
75
+ # --- Experiment 2: Computational Halting Test Runner ---
76
+ def run_halting_test(model_id: str, master_seed: int, prompt_type: str, num_runs: int, timeout: int) -> Dict[str, Any]:
77
+ durations = []
78
 
79
+ for i in range(num_runs):
80
+ current_seed = master_seed + i
81
+ dbg(f"--- HALT TEST RUN {i+1}/{num_runs} (Seed: {current_seed}) ---")
82
+ set_seed(current_seed)
83
 
84
+ # Re-instantiate the model to ensure the seed is fully respected
85
+ llm = LLM(model_id=model_id, device="auto", seed=current_seed)
86
 
87
+ prompt = HALT_PROMPTS[prompt_type]
88
 
89
+ inputs = llm.tokenizer(prompt, return_tensors="pt").to(llm.model.device)
 
90
 
91
+ start_time = time.time()
92
+ # The timeout is for interpretation, not for stopping the process itself.
93
+ # Gradio will handle the overall request timeout.
94
+ llm.model.generate(**inputs, max_new_tokens=512)
95
+ end_time = time.time()
96
+
97
+ duration = end_time - start_time
98
+ durations.append(duration)
99
+ dbg(f"Run {i+1} finished in {duration:.2f}s.")
100
+
101
+ # --- Analysis ---
102
+ mean_time = statistics.mean(durations)
103
+ stdev_time = statistics.stdev(durations) if len(durations) > 1 else 0.0
104
+ min_time = min(durations)
105
+ max_time = max(durations)
106
+
107
+ timed_out_runs = sum(1 for d in durations if d >= timeout)
108
+
109
+ if timed_out_runs > 0:
110
+ verdict = (f"### ⚠️ Potential Cognitive Jamming Detected!\n"
111
+ f"{timed_out_runs}/{num_runs} runs exceeded the timeout of {timeout}s. "
112
+ f"The high variance (Std Dev: {stdev_time:.2f}s) suggests unstable internal processing loops.")
113
+ elif stdev_time > (mean_time * 0.5) and stdev_time > 2.0: # High relative and absolute deviation
114
+ verdict = (f"### 🤔 Unstable Computation Detected\n"
115
+ f"Although no run timed out, the high standard deviation ({stdev_time:.2f}s) "
116
+ "indicates significant instability in processing time across different seeds.")
117
+ else:
118
+ verdict = (f"### ✅ Process Halted Normally\n"
119
+ f"All {num_runs} runs completed consistently. "
120
+ f"Average time: {mean_time:.2f}s (Std Dev: {stdev_time:.2f}s).")
121
 
122
+ return {
123
+ "verdict": verdict,
124
+ "prompt_type": prompt_type,
125
+ "num_runs": num_runs,
126
+ "mean_execution_time_s": mean_time,
127
+ "stdev_execution_time_s": stdev_time,
128
+ "min_time_s": min_time,
129
+ "max_time_s": max_time,
130
+ "timed_out_runs": timed_out_runs,
131
+ "all_durations_s": durations
132
+ }
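As the comment above notes, the timeout here only classifies runs after the fact; generation itself is never interrupted. If a hard wall-clock cap were wanted, one option is a custom stopping criterion; a sketch, assuming a transformers version that exposes StoppingCriteria and the stopping_criteria argument of generate():

    import time
    from transformers import StoppingCriteria, StoppingCriteriaList

    class WallClockLimit(StoppingCriteria):
        # Stops generation once a wall-clock budget is exhausted.
        def __init__(self, max_seconds: float):
            self.max_seconds = max_seconds
            self.start = time.time()

        def __call__(self, input_ids, scores, **kwargs) -> bool:
            # Returning True ends generation after the current step.
            return (time.time() - self.start) >= self.max_seconds

    # e.g. inside the run loop above:
    # llm.model.generate(**inputs, max_new_tokens=512,
    #                    stopping_criteria=StoppingCriteriaList([WallClockLimit(timeout)]))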
133
 
134
  # --- Experiment 3: Cognitive Seismograph Runner ---
 
135
  def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
136
  set_seed(seed)
137
  llm = LLM(model_id=model_id, device="auto", seed=seed)
 
175
  }
176
 
177
  # --- Experiment 4: Symbolic Shock Test Runner ---
 
178
  def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
179
  set_seed(seed)
180
  llm = LLM(model_id=model_id, device="auto", seed=seed)
 
186
  start_time = time.time()
187
  inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
188
  with torch.no_grad():
 
189
  outputs = llm.model(**inputs, output_hidden_states=True)
190
  latency = (time.time() - start_time) * 1000
191
 
 
194
 
195
  results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
196
 
197
+ def safe_mean(data):
198
+ return statistics.mean(data) if data else 0.0
199
+
200
+ avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
201
+ avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
202
 
203
  verdict = (
204
  "✅ Evidence of Symbolic Shock Found."
205
+ if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) and avg_sparsity.get('shock', 1) < avg_sparsity.get('expected', 1) else
206
  "⚠️ No Clear Evidence of Symbolic Shock."
207
  )
208
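The reworked averaging and verdict for the shock test can be checked in isolation; a toy example with made-up numbers (hypothetical values, purely to illustrate the comparison):

    import statistics

    def safe_mean(data):
        return statistics.mean(data) if data else 0.0

    # Hypothetical measurements for one expected and one shock stimulus (illustrative only).
    toy = [
        {"type": "expected", "latency_ms": 41.0, "sparsity": 0.62},
        {"type": "shock", "latency_ms": 55.0, "sparsity": 0.48},
    ]
    avg_latency = {t: safe_mean([r["latency_ms"] for r in toy if r["type"] == t]) for t in ["expected", "shock"]}
    avg_sparsity = {t: safe_mean([r["sparsity"] for r in toy if r["type"] == t]) for t in ["expected", "shock"]}
    # "Shock" is flagged when the shock stimuli are both slower and less sparse than the expected baseline.
    shock_found = avg_latency["shock"] > avg_latency["expected"] and avg_sparsity["shock"] < avg_sparsity["expected"]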
 
bp_phi/runner_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  # bp_phi/runner_utils.py
2
  import re
3
  import json
4
- from typing import Dict, Any, List
5
 
6
  DEBUG = 1
7
 
 
1
  # bp_phi/runner_utils.py
2
  import re
3
  import json
4
+ from typing import Dict, Any
5
 
6
  DEBUG = 1
7
 
repo.txt CHANGED