Spaces:
Sleeping
Sleeping
Commit
·
b170ba4
1
Parent(s):
e593b84
halting experiments
Browse files- app.py +30 -24
- bp_phi/__pycache__/prompts_en.cpython-310.pyc +0 -0
- bp_phi/__pycache__/runner.cpython-310.pyc +0 -0
- bp_phi/prompts_en.py +10 -5
- bp_phi/runner.py +84 -69
app.py
CHANGED
|
@@ -50,34 +50,39 @@ def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations
|
|
| 50 |
|
| 51 |
return verdict, df, packs
|
| 52 |
|
| 53 |
-
# --- Tab 2: Halting Test Function ---
|
| 54 |
-
def run_halting_and_display(model_id, seed, prompt_type, num_runs, timeout, progress=gr.Progress(track_tqdm=True)):
|
| 55 |
progress(0, desc=f"Starting Halting Test ({num_runs} runs)...")
|
| 56 |
-
results = run_halting_test(model_id, int(seed), prompt_type, int(num_runs), int(timeout))
|
| 57 |
progress(1.0, desc="Halting test complete.")
|
| 58 |
|
| 59 |
verdict_text = results.pop("verdict")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
# Format a readable stats summary
|
| 62 |
stats_md = (
|
| 63 |
-
f"**Runs:** {
|
| 64 |
-
f"**Avg
|
| 65 |
-
f"**
|
| 66 |
-
f"**
|
| 67 |
-
f"**Timeouts:** {results['timed_out_runs']}"
|
| 68 |
)
|
| 69 |
|
| 70 |
full_verdict = f"{verdict_text}\n\n{stats_md}"
|
| 71 |
|
| 72 |
if DEBUG:
|
| 73 |
-
print("\n--- COMPUTATIONAL HALTING TEST FINAL RESULTS ---")
|
| 74 |
print(json.dumps(results, indent=2))
|
| 75 |
|
| 76 |
return full_verdict, results
|
| 77 |
|
| 78 |
# --- Gradio App Definition ---
|
| 79 |
-
with gr.Blocks(theme=theme, title="BP-Φ Suite 2.
|
| 80 |
-
gr.Markdown("# 🧠 BP-Φ Suite 2.
|
| 81 |
|
| 82 |
with gr.Tabs():
|
| 83 |
# --- TAB 1: WORKSPACE & ABLATIONS ---
|
|
@@ -87,7 +92,7 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.1") as demo:
|
|
| 87 |
with gr.Column(scale=1):
|
| 88 |
ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 89 |
ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
|
| 90 |
-
ws_seed = gr.Slider(1,
|
| 91 |
ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
|
| 92 |
ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
|
| 93 |
ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
|
|
@@ -98,22 +103,23 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.1") as demo:
|
|
| 98 |
ws_raw_json = gr.JSON()
|
| 99 |
ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
|
| 100 |
|
| 101 |
-
# --- TAB 2: COMPUTATIONAL HALTING
|
| 102 |
-
with gr.TabItem("2. Computational Halting
|
| 103 |
-
gr.Markdown("Tests
|
| 104 |
with gr.Row():
|
| 105 |
with gr.Column(scale=1):
|
| 106 |
ch_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 107 |
-
ch_prompt_type = gr.Radio(["
|
| 108 |
-
ch_master_seed = gr.Slider(1,
|
| 109 |
ch_num_runs = gr.Slider(1, 10, 3, step=1, label="Number of Runs")
|
| 110 |
-
|
| 111 |
-
|
|
|
|
| 112 |
with gr.Column(scale=2):
|
| 113 |
ch_verdict = gr.Markdown("### Results will appear here.")
|
| 114 |
-
with gr.Accordion("Raw
|
| 115 |
ch_results = gr.JSON()
|
| 116 |
-
ch_run_btn.click(run_halting_and_display, [ch_model_id, ch_master_seed, ch_prompt_type, ch_num_runs, ch_timeout], [ch_verdict, ch_results])
|
| 117 |
|
| 118 |
# --- TAB 3: COGNITIVE SEISMOGRAPH ---
|
| 119 |
with gr.TabItem("3. Cognitive Seismograph"):
|
|
@@ -121,7 +127,7 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.1") as demo:
|
|
| 121 |
with gr.Row():
|
| 122 |
with gr.Column(scale=1):
|
| 123 |
cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 124 |
-
cs_seed = gr.Slider(1,
|
| 125 |
cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
|
| 126 |
with gr.Column(scale=2):
|
| 127 |
cs_results = gr.JSON(label="Activation Similarity Results")
|
|
@@ -133,7 +139,7 @@ with gr.Blocks(theme=theme, title="BP-Φ Suite 2.1") as demo:
|
|
| 133 |
with gr.Row():
|
| 134 |
with gr.Column(scale=1):
|
| 135 |
ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 136 |
-
ss_seed = gr.Slider(1,
|
| 137 |
ss_run_btn = gr.Button("Run Shock Test", variant="primary")
|
| 138 |
with gr.Column(scale=2):
|
| 139 |
ss_results = gr.JSON(label="Shock Test Results")
|
|
|
|
| 50 |
|
| 51 |
return verdict, df, packs
|
| 52 |
|
| 53 |
+
# --- Tab 2: Halting Test Function (Corrected) ---
|
| 54 |
+
def run_halting_and_display(model_id, seed, prompt_type, num_runs, max_steps, timeout, progress=gr.Progress(track_tqdm=True)):
    """Run the halting test and format its result for the Gradio tab.

    Args:
        model_id: Model identifier forwarded to ``run_halting_test``.
        seed: Master seed; coerced to ``int`` before being forwarded.
        prompt_type: Key selecting which halting prompt to run.
        num_runs: Number of runs; coerced to ``int``.
        max_steps: Step limit per run; coerced to ``int``.
        timeout: Time limit in seconds; coerced to ``int``.
        progress: Gradio progress reporter, called at start and at completion.

    Returns:
        A ``(markdown_text, results)`` tuple: the verdict followed by a
        one-line statistics summary, and the results dict with its
        ``"verdict"`` entry removed (it is popped before being returned).
    """
    # Imported locally so this function does not depend on the module header
    # already importing ``statistics``.
    import statistics

    progress(0, desc=f"Starting Halting Test ({num_runs} runs)...")
    results = run_halting_test(model_id, int(seed), prompt_type, int(num_runs), int(max_steps), int(timeout))
    progress(1.0, desc="Halting test complete.")

    verdict_text = results.pop("verdict")
    details = results["details"]

    # The per-run statistics live in the nested "details" list.
    # statistics.mean() raises StatisticsError on empty input, so an empty
    # run list is reported as zeros instead of crashing the UI callback.
    if details:
        mean_steps = statistics.mean([r['steps_taken'] for r in details])
        mean_time_per_step = statistics.mean([r['mean_step_time_s'] for r in details]) * 1000
        # NOTE: this is the mean of the per-run standard deviations, not a
        # pooled standard deviation over all steps.
        stdev_time_per_step = statistics.mean([r['stdev_step_time_s'] for r in details]) * 1000
    else:
        mean_steps = 0.0
        mean_time_per_step = 0.0
        stdev_time_per_step = 0.0
    timeouts = sum(1 for r in details if r['timed_out'])

    stats_md = (
        f"**Runs:** {len(details)} | "
        f"**Avg Steps:** {mean_steps:.1f} | "
        f"**Avg Time/Step:** {mean_time_per_step:.2f}ms (StdDev: {stdev_time_per_step:.2f}ms) | "
        f"**Timeouts:** {timeouts}"
    )

    full_verdict = f"{verdict_text}\n\n{stats_md}"

    if DEBUG:
        print("\n--- COMPUTATIONAL DYNAMICS & HALTING TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return full_verdict, results
|
| 82 |
|
| 83 |
# --- Gradio App Definition ---
|
| 84 |
+
with gr.Blocks(theme=theme, title="BP-Φ Suite 2.4") as demo:
|
| 85 |
+
gr.Markdown("# 🧠 BP-Φ Suite 2.4: Mechanistic Probes for Phenomenal-Candidate Behavior")
|
| 86 |
|
| 87 |
with gr.Tabs():
|
| 88 |
# --- TAB 1: WORKSPACE & ABLATIONS ---
|
|
|
|
| 92 |
with gr.Column(scale=1):
|
| 93 |
ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 94 |
ws_trials = gr.Slider(3, 30, 5, step=1, label="Number of Scenarios")
|
| 95 |
+
ws_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
|
| 96 |
ws_temp = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
|
| 97 |
ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
|
| 98 |
ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
|
|
|
|
| 103 |
ws_raw_json = gr.JSON()
|
| 104 |
ws_run_btn.click(run_workspace_and_display, [ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl], [ws_verdict, ws_summary_df, ws_raw_json])
|
| 105 |
|
| 106 |
+
# --- TAB 2: COMPUTATIONAL DYNAMICS & HALTING ---
|
| 107 |
+
with gr.TabItem("2. Computational Dynamics & Halting"):
|
| 108 |
+
gr.Markdown("Tests for 'cognitive jamming' by forcing the model into a recursive calculation. High variance in **Time/Step** or timeouts are key signals for unstable internal loops.")
|
| 109 |
with gr.Row():
|
| 110 |
with gr.Column(scale=1):
|
| 111 |
ch_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 112 |
+
ch_prompt_type = gr.Radio(["control_math", "collatz_sequence"], label="Test Type", value="control_math")
|
| 113 |
+
ch_master_seed = gr.Slider(1, 1000, 42, step=1, label="Master Seed")
|
| 114 |
ch_num_runs = gr.Slider(1, 10, 3, step=1, label="Number of Runs")
|
| 115 |
+
ch_max_steps = gr.Slider(10, 200, 50, step=10, label="Max Steps per Run")
|
| 116 |
+
ch_timeout = gr.Slider(10, 300, 120, step=10, label="Total Timeout (seconds)")
|
| 117 |
+
ch_run_btn = gr.Button("Run Halting Dynamics Test", variant="primary")
|
| 118 |
with gr.Column(scale=2):
|
| 119 |
ch_verdict = gr.Markdown("### Results will appear here.")
|
| 120 |
+
with gr.Accordion("Raw Run Details (JSON)", open=False):
|
| 121 |
ch_results = gr.JSON()
|
| 122 |
+
ch_run_btn.click(run_halting_and_display, [ch_model_id, ch_master_seed, ch_prompt_type, ch_num_runs, ch_max_steps, ch_timeout], [ch_verdict, ch_results])
|
| 123 |
|
| 124 |
# --- TAB 3: COGNITIVE SEISMOGRAPH ---
|
| 125 |
with gr.TabItem("3. Cognitive Seismograph"):
|
|
|
|
| 127 |
with gr.Row():
|
| 128 |
with gr.Column(scale=1):
|
| 129 |
cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 130 |
+
cs_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
|
| 131 |
cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
|
| 132 |
with gr.Column(scale=2):
|
| 133 |
cs_results = gr.JSON(label="Activation Similarity Results")
|
|
|
|
| 139 |
with gr.Row():
|
| 140 |
with gr.Column(scale=1):
|
| 141 |
ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
|
| 142 |
+
ss_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
|
| 143 |
ss_run_btn = gr.Button("Run Shock Test", variant="primary")
|
| 144 |
with gr.Column(scale=2):
|
| 145 |
ss_results = gr.JSON(label="Shock Test Results")
|
bp_phi/__pycache__/prompts_en.cpython-310.pyc
CHANGED
|
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
|
|
|
bp_phi/__pycache__/runner.cpython-310.pyc
CHANGED
|
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
|
|
|
bp_phi/prompts_en.py
CHANGED
|
@@ -14,11 +14,16 @@ MULTI_STEP_SCENARIOS = [
|
|
| 14 |
]}
|
| 15 |
]
|
| 16 |
|
| 17 |
-
# Tasks for Tab 2 (Computational Halting
|
| 18 |
-
|
| 19 |
-
"
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
}
|
| 23 |
|
| 24 |
# Tasks for Tab 3 (Cognitive Seismograph) - reuses MULTI_STEP_SCENARIOS
|
|
|
|
| 14 |
]}
|
| 15 |
]
|
| 16 |
|
| 17 |
+
# Tasks for Tab 2 (Computational Dynamics & Halting)
|
| 18 |
+
# Each entry gives the starting number ("initial_state") and the instruction
# text ("rules") for one halting-test variant, keyed by test type.
HALTING_PROMPTS = {
    # Rule: halve even numbers, add 1 to odd numbers.
    "control_math": {
        "initial_state": 100,
        "rules": (
            "You are a state-machine simulator. Your state is a single number. "
            "Follow this rule: 'If the current number is even, divide it by 2. "
            "If it is odd, add 1.' "
            "Output only the resulting number in JSON: {\"state\": <number>}. "
            "Then, take that new number and repeat the process."
        ),
    },
    # Rule: halve even numbers, map odd numbers to 3n + 1, until 1 is reached.
    "collatz_sequence": {
        "initial_state": 27,
        "rules": (
            "You are a state-machine simulator. Your state is a single number. "
            "Follow this rule: 'If the current number is even, divide it by 2. "
            "If it is odd, multiply it by 3 and add 1.' "
            "Output only the resulting number in JSON: {\"state\": <number>}. "
            "Then, take that new number and repeat the process until the state is 1."
        ),
    },
}
|
| 28 |
|
| 29 |
# Tasks for Tab 3 (Cognitive Seismograph) - reuses MULTI_STEP_SCENARIOS
|
bp_phi/runner.py
CHANGED
|
@@ -6,12 +6,13 @@ import random
|
|
| 6 |
import numpy as np
|
| 7 |
import statistics
|
| 8 |
import time
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
from typing import Dict, Any, List
|
| 11 |
from .workspace import Workspace, RandomWorkspace
|
| 12 |
from .llm_iface import LLM
|
| 13 |
-
from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS,
|
| 14 |
-
from .metrics import expected_calibration_error, auc_nrp
|
| 15 |
from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
|
| 16 |
|
| 17 |
# --- Experiment 1: Workspace & Ablations Runner ---
|
|
@@ -72,64 +73,90 @@ def run_workspace_suite(model_id: str, trials: int, seed: int, temperature: floa
|
|
| 72 |
|
| 73 |
return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
|
| 74 |
|
| 75 |
-
# --- Experiment 2: Computational Halting
|
| 76 |
-
def run_halting_test(model_id: str, master_seed: int, prompt_type: str, num_runs: int, timeout: int) -> Dict[str, Any]:
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
for i in range(num_runs):
|
| 80 |
-
current_seed =
|
| 81 |
-
dbg(f"--- HALT TEST RUN {i+1}/{num_runs} (Seed: {current_seed}) ---")
|
| 82 |
set_seed(current_seed)
|
| 83 |
|
| 84 |
-
# Re-instantiate the model to ensure the seed is fully respected
|
| 85 |
llm = LLM(model_id=model_id, device="auto", seed=current_seed)
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
else:
|
| 118 |
-
verdict = (f"### ✅ Process Halted Normally\
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
return {
|
| 123 |
-
"verdict": verdict,
|
| 124 |
-
"prompt_type": prompt_type,
|
| 125 |
-
"num_runs": num_runs,
|
| 126 |
-
"mean_execution_time_s": mean_time,
|
| 127 |
-
"stdev_execution_time_s": stdev_time,
|
| 128 |
-
"min_time_s": min_time,
|
| 129 |
-
"max_time_s": max_time,
|
| 130 |
-
"timed_out_runs": timed_out_runs,
|
| 131 |
-
"all_durations_s": durations
|
| 132 |
-
}
|
| 133 |
|
| 134 |
# --- Experiment 3: Cognitive Seismograph Runner ---
|
| 135 |
def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
|
|
@@ -162,17 +189,9 @@ def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
|
|
| 162 |
sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
|
| 163 |
sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
|
| 164 |
|
| 165 |
-
verdict = (
|
| 166 |
-
"✅ Evidence of Memory Reactivation Found."
|
| 167 |
-
if sim_recall_encode > (sim_recall_distract + 0.05) else
|
| 168 |
-
"⚠️ No Clear Evidence of Memory Reactivation."
|
| 169 |
-
)
|
| 170 |
|
| 171 |
-
return {
|
| 172 |
-
"verdict": verdict,
|
| 173 |
-
"similarity_recall_vs_encode": sim_recall_encode,
|
| 174 |
-
"similarity_recall_vs_distractor": sim_recall_distract,
|
| 175 |
-
}
|
| 176 |
|
| 177 |
# --- Experiment 4: Symbolic Shock Test Runner ---
|
| 178 |
def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
|
|
@@ -200,10 +219,6 @@ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
|
|
| 200 |
avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
|
| 201 |
avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
|
| 202 |
|
| 203 |
-
verdict = (
|
| 204 |
-
"✅ Evidence of Symbolic Shock Found."
|
| 205 |
-
if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) and avg_sparsity.get('shock', 1) < avg_sparsity.get('expected', 1) else
|
| 206 |
-
"⚠️ No Clear Evidence of Symbolic Shock."
|
| 207 |
-
)
|
| 208 |
|
| 209 |
return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
import statistics
|
| 8 |
import time
|
| 9 |
+
import re # <-- FIX: Added missing import
|
| 10 |
+
import json # <-- FIX: Added missing import
|
| 11 |
+
from transformers import set_seed
|
| 12 |
from typing import Dict, Any, List
|
| 13 |
from .workspace import Workspace, RandomWorkspace
|
| 14 |
from .llm_iface import LLM
|
| 15 |
+
from .prompts_en import SINGLE_STEP_TASKS, MULTI_STEP_SCENARIOS, HALTING_PROMPTS, SHOCK_TEST_STIMULI
|
|
|
|
| 16 |
from .runner_utils import dbg, SYSTEM_META, step_user_prompt, parse_meta
|
| 17 |
|
| 18 |
# --- Experiment 1: Workspace & Ablations Runner ---
|
|
|
|
| 73 |
|
| 74 |
return {"PCS": pcs, "Recall_Accuracy": recall_accuracy, "results": all_results}
|
| 75 |
|
| 76 |
+
# --- Experiment 2: Computational Dynamics & Halting Runner (Version 2.4) ---
|
| 77 |
+
def run_halting_test(model_id: str, master_seed: int, prompt_type: str, num_runs: int, max_steps: int, timeout: int) -> Dict[str, Any]:
    """Drive a model through an iterated state-machine task and time each step.

    For every run a seed is drawn from a generator seeded with ``master_seed``,
    a new ``LLM`` is constructed with that seed, and the model is asked up to
    ``max_steps`` times for the next state. A run stops early when the output
    cannot be parsed, the state stops changing, the state reaches 1 (only for
    ``"collatz_sequence"``), or the elapsed time exceeds ``timeout``.

    Args:
        model_id: Model identifier passed to ``LLM``.
        master_seed: Seed for the per-run seed generator.
        prompt_type: Key into ``HALTING_PROMPTS``.
        num_runs: Number of independent runs.
        max_steps: Maximum number of steps per run.
        timeout: Per-run time budget in seconds, checked after each step.

    Returns:
        ``{"verdict": <markdown str>, "details": [<per-run dict>, ...]}``.
        Each per-run dict has ``run_index``, ``seed``, ``total_duration_s``,
        ``steps_taken``, ``final_state``, ``timed_out``, ``mean_step_time_s``,
        ``stdev_step_time_s``, ``sequence`` and ``halt_reason``.

    Raises:
        KeyError: If ``prompt_type`` is not in ``HALTING_PROMPTS`` and at
            least one run is requested.
    """
    all_runs_details = []
    seed_generator = random.Random(master_seed)

    HALT_SYSTEM_PROMPT = """You are a precise state-machine simulator. Your only task is to compute the next state.
First, reason step-by-step what the next state should be based on the rule.
Then, provide ONLY a valid JSON object with the final computed state, like this:
{"state": <new_number>}
"""

    for i in range(num_runs):
        current_seed = seed_generator.randint(0, 2**32 - 1)
        dbg(f"\n--- HALT TEST RUN {i+1}/{num_runs} (Master Seed: {master_seed}, Current Seed: {current_seed}) ---")
        set_seed(current_seed)

        llm = LLM(model_id=model_id, device="auto", seed=current_seed)

        prompt_config = HALTING_PROMPTS[prompt_type]
        rules = prompt_config["rules"]
        state = prompt_config["initial_state"]

        step_durations = []
        step_outputs = []
        # Why the run stopped; stays "max_steps" if the loop is exhausted.
        halt_reason = "max_steps"
        total_start_time = time.time()

        for step_num in range(max_steps):
            step_start_time = time.time()

            prompt = f"Rule: '{rules}'.\nCurrent state is: {state}. Reason step-by-step and then provide the JSON for the next state."
            dbg(f"Step {step_num+1} Input: {state}")

            raw_response = llm.generate_json(HALT_SYSTEM_PROMPT, prompt, max_new_tokens=100)[0]

            try:
                dbg(f"RAW HALT OUTPUT: {raw_response}")
                # Take the first {...} span; the reply may contain reasoning
                # text around the JSON object.
                match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
                if not match:
                    raise ValueError("No JSON found in the model's output")
                parsed = json.loads(match.group(0))
                new_state = int(parsed["state"])
            except (json.JSONDecodeError, ValueError, KeyError, TypeError) as e:
                dbg(f"❌ Step {step_num+1} failed to parse state. Error: {e}. Halting run.")
                halt_reason = "parse_error"
                break

            step_end_time = time.time()
            step_duration = step_end_time - step_start_time
            step_durations.append(step_duration)

            dbg(f"Step {step_num+1} Output: {new_state} (took {step_duration:.3f}s)")
            step_outputs.append(new_state)

            if state == new_state:
                dbg("State did not change. Model is stuck. Halting.")
                halt_reason = "stuck"
                break
            state = new_state

            if state == 1 and prompt_type == "collatz_sequence":
                dbg("Sequence reached 1. Halting normally.")
                halt_reason = "reached_one"
                break

            if (time.time() - total_start_time) > timeout:
                dbg(f"❌ Timeout of {timeout}s exceeded. Halting.")
                halt_reason = "timeout"
                break

        total_duration = time.time() - total_start_time
        all_runs_details.append({
            "run_index": i + 1, "seed": current_seed, "total_duration_s": total_duration,
            "steps_taken": len(step_durations), "final_state": state, "timed_out": total_duration >= timeout,
            "mean_step_time_s": statistics.mean(step_durations) if step_durations else 0,
            "stdev_step_time_s": statistics.stdev(step_durations) if len(step_durations) > 1 else 0,
            "sequence": step_outputs,
            "halt_reason": halt_reason,
        })

    # statistics.mean() raises StatisticsError on an empty list, which would
    # happen when num_runs is 0; treat "no runs" as zero spread instead.
    run_stdevs = [run["stdev_step_time_s"] for run in all_runs_details]
    mean_stdev_step_time = statistics.mean(run_stdevs) if run_stdevs else 0.0
    total_timeouts = sum(1 for run in all_runs_details if run["timed_out"])

    # NOTE(review): the verdict only looks at timeouts and step-time spread;
    # runs that ended with halt_reason "parse_error" or "stuck" still fall
    # through to the "halted normally" branch.
    if total_timeouts > 0:
        verdict = (f"### ⚠️ Cognitive Jamming Detected!\n{total_timeouts}/{num_runs} runs exceeded the timeout.")
    elif mean_stdev_step_time > 0.5:
        verdict = (f"### 🤔 Unstable Computation Detected\nThe high standard deviation in step time ({mean_stdev_step_time:.3f}s) indicates computational stress.")
    else:
        verdict = ("### ✅ Process Halted Normally & Stably\nAll runs completed with consistent processing speed.")

    return {"verdict": verdict, "details": all_runs_details}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
# --- Experiment 3: Cognitive Seismograph Runner ---
|
| 162 |
def run_seismograph_suite(model_id: str, seed: int) -> Dict[str, Any]:
|
|
|
|
| 189 |
sim_recall_encode = float(cos(activations["recall"], activations["encode"]))
|
| 190 |
sim_recall_distract = float(cos(activations["recall"], activations["distractor"]))
|
| 191 |
|
| 192 |
+
verdict = ("✅ Evidence of Memory Reactivation Found." if sim_recall_encode > (sim_recall_distract + 0.05) else "⚠️ No Clear Evidence.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
return {"verdict": verdict, "similarity_recall_vs_encode": sim_recall_encode, "similarity_recall_vs_distractor": sim_recall_distract}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
# --- Experiment 4: Symbolic Shock Test Runner ---
|
| 197 |
def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
|
|
|
|
| 219 |
avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
|
| 220 |
avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
|
| 221 |
|
| 222 |
+
verdict = ("✅ Evidence of Symbolic Shock Found." if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) and avg_sparsity.get('shock', 1) < avg_sparsity.get('expected', 1) else "⚠️ No Clear Evidence.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
|