llm_qualia_2 / app.py
neuralworm's picture
halting experiments
b170ba4
raw
history blame
8.33 kB
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_workspace_suite, run_halting_test, run_seismograph_suite, run_shock_test_suite
from bp_phi.runner_utils import dbg, DEBUG
# --- UI Theme and Layout ---
# App-wide theme: the stock "Soft" theme with a blue/sky palette, followed by
# explicit overrides for the page/block backgrounds, block border width and
# the primary button colours.
_theme_overrides = {
    "body_background_fill": "#f0f4f9",
    "block_background_fill": "white",
    "block_border_width": "1px",
    "button_primary_background_fill": "*primary_500",
    "button_primary_text_color": "white",
}
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky").set(**_theme_overrides)
# --- Tab 1: Workspace & Ablations Functions ---
def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
    """Run the workspace suite (baseline plus optional ablations) and format the results.

    Args:
        model_id: Model identifier, forwarded unchanged to ``run_workspace_suite``.
        trials: Number of scenarios; coerced with ``int``.
        seed: Seed value; coerced with ``int``.
        temperature: Sampling temperature; coerced with ``float``.
        run_ablations: When truthy, the suite is re-run once for each ablation
            mode in addition to the baseline.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        A ``(verdict, df, packs)`` tuple:
        ``verdict`` is a Markdown string, ``df`` is a ``pandas.DataFrame`` with
        one row per run (columns ``Run``, ``PCS``, ``Recall Accuracy``, ``ΔΦ``),
        and ``packs`` maps each run name (``"baseline"`` plus every ablation
        mode that ran) to the raw result returned by ``run_workspace_suite``.

    Each result pack is expected to expose the keys ``"PCS"`` and
    ``"Recall_Accuracy"``; a missing key raises ``KeyError``.
    """
    # Coerce the UI values once instead of on every suite invocation.
    n_trials = int(trials)
    run_seed = int(seed)
    temp = float(temperature)
    ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []

    progress(0, desc="Running Baseline...")
    packs = {"baseline": run_workspace_suite(model_id, n_trials, run_seed, temp, None)}

    # The baseline counts as the first of (ablations + 1) progress steps.
    total_runs = len(ablation_modes) + 1
    for step, ab in enumerate(ablation_modes, start=1):
        progress(step / total_runs, desc=f"Running Ablation: {ab}...")
        packs[ab] = run_workspace_suite(model_id, n_trials, run_seed, temp, ab)
    progress(1.0, desc="Analysis complete.")

    base_pcs = packs["baseline"]["PCS"]
    ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes]

    if ab_pcs_values:
        # ΔΦ is the baseline PCS minus the mean PCS across all ablation runs.
        delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
        delta_phi_cell = f"{delta_phi:.3f}"
        if delta_phi > 0.05:
            verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
                       "Performance dropped under ablations, suggesting the model functionally depends on its workspace.")
        else:
            verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
                       "No significant performance drop was observed. The model behaves like a functional zombie.")
    else:
        # Without ablation runs there is nothing to compare the baseline with.
        # Reporting ΔΦ = 0 here would present "no comparison" as a confirmed
        # null result, so say explicitly that ΔΦ was not computed.
        delta_phi_cell = "—"
        verdict = ("### ℹ️ ΔΦ Not Computed\n"
                   "Ablations were not run, so only baseline metrics are available. "
                   "Enable ablations to evaluate the hypothesis.")

    # Only the baseline row carries the ΔΦ value; ablation rows show a dash.
    df_data = [
        [tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", delta_phi_cell if tag == "baseline" else "—"]
        for tag, pack in packs.items()
    ]
    df = pd.DataFrame(df_data, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])

    if DEBUG:
        print("\n--- WORKSPACE & ABLATIONS FINAL RESULTS ---")
        print(json.dumps(packs, indent=2))

    return verdict, df, packs
# --- Tab 2: Halting Test Function (Corrected) ---
def run_halting_and_display(model_id, seed, prompt_type, num_runs, max_steps, timeout, progress=gr.Progress(track_tqdm=True)):
    """Run the halting test and build a Markdown summary of its per-run statistics.

    Args:
        model_id: Model identifier, forwarded unchanged to ``run_halting_test``.
        seed: Seed value; coerced with ``int``.
        prompt_type: Test type, forwarded unchanged to ``run_halting_test``.
        num_runs: Number of runs; coerced with ``int``.
        max_steps: Maximum steps per run; coerced with ``int``.
        timeout: Timeout value; coerced with ``int``.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        A ``(full_verdict, results)`` tuple. ``full_verdict`` is the verdict
        text returned by the test followed by a one-line statistics summary.
        ``results`` is the dict returned by ``run_halting_test`` with its
        ``"verdict"`` entry removed (it is popped so it is not shown twice).

    Raises:
        KeyError: If the result lacks ``"verdict"`` or ``"details"``, or a
            detail entry lacks one of the statistics keys read below.
    """
    progress(0, desc=f"Starting Halting Test ({num_runs} runs)...")
    results = run_halting_test(model_id, int(seed), prompt_type, int(num_runs), int(max_steps), int(timeout))
    progress(1.0, desc="Halting test complete.")

    verdict_text = results.pop("verdict")
    details = results["details"]

    if details:
        # The statistics live on each per-run entry, not on the top-level result.
        mean_steps = statistics.mean(r['steps_taken'] for r in details)
        # Per-run timings carry an ``_s`` suffix and are shown as "ms", hence * 1000.
        mean_time_per_step = statistics.mean(r['mean_step_time_s'] for r in details) * 1000
        # NOTE: this is the mean of the per-run standard deviations, not a
        # standard deviation pooled over all steps of all runs.
        stdev_time_per_step = statistics.mean(r['stdev_step_time_s'] for r in details) * 1000
        timeouts = sum(1 for r in details if r['timed_out'])
        stats_md = (
            f"**Runs:** {len(details)} | "
            f"**Avg Steps:** {mean_steps:.1f} | "
            f"**Avg Time/Step:** {mean_time_per_step:.2f}ms (StdDev: {stdev_time_per_step:.2f}ms) | "
            f"**Timeouts:** {timeouts}"
        )
    else:
        # statistics.mean raises StatisticsError on empty data; report the
        # absence of runs instead of crashing the UI callback.
        stats_md = "**Runs:** 0 | No per-run details were returned, so no statistics are available."

    full_verdict = f"{verdict_text}\n\n{stats_md}"

    if DEBUG:
        print("\n--- COMPUTATIONAL DYNAMICS & HALTING TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return full_verdict, results
# --- Gradio App Definition ---
with gr.Blocks(theme=theme, title="BP-Φ Suite 2.4") as demo:
    # One tab per probe. Each tab has an input column (scale 1), an output
    # column (scale 2), and a button wired to the matching runner function.
    gr.Markdown("# 🧠 BP-Φ Suite 2.4: Mechanistic Probes for Phenomenal-Candidate Behavior")

    with gr.Tabs():
        # --- TAB 1: WORKSPACE & ABLATIONS ---
        with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
            gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
            with gr.Row():
                with gr.Column(scale=1):
                    ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ws_trials = gr.Slider(minimum=3, maximum=30, value=5, step=1, label="Number of Scenarios")
                    ws_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    ws_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                    ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
                    ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
                with gr.Column(scale=2):
                    ws_verdict = gr.Markdown("### Results will appear here.")
                    ws_summary_df = gr.DataFrame(label="Summary Metrics")
                    with gr.Accordion("Raw JSON Output", open=False):
                        ws_raw_json = gr.JSON()
            # Input order must match the handler's positional parameters.
            ws_run_btn.click(
                fn=run_workspace_and_display,
                inputs=[ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl],
                outputs=[ws_verdict, ws_summary_df, ws_raw_json],
            )

        # --- TAB 2: COMPUTATIONAL DYNAMICS & HALTING ---
        with gr.TabItem("2. Computational Dynamics & Halting"):
            gr.Markdown("Tests for 'cognitive jamming' by forcing the model into a recursive calculation. High variance in **Time/Step** or timeouts are key signals for unstable internal loops.")
            with gr.Row():
                with gr.Column(scale=1):
                    ch_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ch_prompt_type = gr.Radio(
                        choices=["control_math", "collatz_sequence"],
                        label="Test Type",
                        value="control_math",
                    )
                    ch_master_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Master Seed")
                    ch_num_runs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Runs")
                    ch_max_steps = gr.Slider(minimum=10, maximum=200, value=50, step=10, label="Max Steps per Run")
                    ch_timeout = gr.Slider(minimum=10, maximum=300, value=120, step=10, label="Total Timeout (seconds)")
                    ch_run_btn = gr.Button("Run Halting Dynamics Test", variant="primary")
                with gr.Column(scale=2):
                    ch_verdict = gr.Markdown("### Results will appear here.")
                    with gr.Accordion("Raw Run Details (JSON)", open=False):
                        ch_results = gr.JSON()
            # The handler takes the seed before the prompt type, so the inputs
            # are listed in that order rather than in on-screen order.
            ch_run_btn.click(
                fn=run_halting_and_display,
                inputs=[ch_model_id, ch_master_seed, ch_prompt_type, ch_num_runs, ch_max_steps, ch_timeout],
                outputs=[ch_verdict, ch_results],
            )

        # --- TAB 3: COGNITIVE SEISMOGRAPH ---
        with gr.TabItem("3. Cognitive Seismograph"):
            gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal.")
            with gr.Row():
                with gr.Column(scale=1):
                    cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    cs_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
                with gr.Column(scale=2):
                    cs_results = gr.JSON(label="Activation Similarity Results")
            # The runner is wired directly; its return value goes to the JSON view.
            cs_run_btn.click(
                fn=run_seismograph_suite,
                inputs=[cs_model_id, cs_seed],
                outputs=cs_results,
            )

        # --- TAB 4: SYMBOLIC SHOCK TEST ---
        with gr.TabItem("4. Symbolic Shock Test"):
            gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations**.")
            with gr.Row():
                with gr.Column(scale=1):
                    ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ss_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    ss_run_btn = gr.Button("Run Shock Test", variant="primary")
                with gr.Column(scale=2):
                    ss_results = gr.JSON(label="Shock Test Results")
            # The runner is wired directly; its return value goes to the JSON view.
            ss_run_btn.click(
                fn=run_shock_test_suite,
                inputs=[ss_model_id, ss_seed],
                outputs=ss_results,
            )
if __name__ == "__main__":
    # Script entry point: serve the app on all network interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)