# NOTE(review): this copy of app.py appears to have been captured from a hosted
# "Spaces" file viewer (status shown: Sleeping; reported file size: 8,326 bytes).
# The viewer's per-line commit gutter (commits 0916370, 2f0addb, e593b84, 88c294a,
# b170ba4) and its line-number gutter (1-150) were extraction residue, not code.
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_workspace_suite, run_halting_test, run_seismograph_suite, run_shock_test_suite
from bp_phi.runner_utils import dbg, DEBUG
# --- UI Theme and Layout ---
# Start from Gradio's built-in Soft theme with a blue/sky palette, then apply
# the app-specific colour and border overrides in a separate step.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky")
theme = theme.set(
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)
# --- Tab 1: Workspace & Ablations Functions ---
def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
    """Run the workspace suite (baseline plus optional ablations) and format the results.

    Args:
        model_id: Model identifier forwarded unchanged to ``run_workspace_suite``.
        trials: Number of scenarios; coerced to ``int`` before the call.
        seed: Seed value; coerced to ``int`` before the call.
        temperature: Sampling temperature; coerced to ``float`` before the call.
        run_ablations: When truthy, the suite is run once more per ablation mode.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        A ``(verdict, df, packs)`` tuple: a Markdown verdict string, a summary
        ``DataFrame`` with one row per run, and the dict of raw result packs
        keyed by run name (``"baseline"`` plus each ablation mode).
    """
    n_trials, run_seed, temp = int(trials), int(seed), float(temperature)
    ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
    total_runs = len(ablation_modes) + 1

    progress(0, desc="Running Baseline...")
    packs = {"baseline": run_workspace_suite(model_id, n_trials, run_seed, temp, None)}
    for i, ab in enumerate(ablation_modes):
        progress((i + 1) / total_runs, desc=f"Running Ablation: {ab}...")
        packs[ab] = run_workspace_suite(model_id, n_trials, run_seed, temp, ab)
    progress(1.0, desc="Analysis complete.")

    base_pcs = packs["baseline"]["PCS"]
    ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes]

    if ab_pcs_values:
        # ΔΦ is the baseline PCS minus the mean PCS across all ablation runs.
        delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
        if delta_phi > 0.05:
            verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
                       "Performance dropped under ablations, suggesting the model functionally depends on its workspace.")
        else:
            verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
                       "No significant performance drop was observed. The model behaves like a functional zombie.")
        delta_cell = f"{delta_phi:.3f}"
    else:
        # Without ablation runs there is nothing to compare against, so do not
        # report a ΔΦ of 0 as if the null hypothesis had actually been tested.
        verdict = ("### ℹ️ Baseline Only (ΔΦ not computed)\n"
                   "Ablations were not run, so no comparison is available. Enable **Run Ablations** to test the hypothesis.")
        delta_cell = "—"

    # Only the baseline row carries the ΔΦ value; ablation rows show a dash.
    summary_rows = [
        [tag, f"{pack['PCS']:.3f}", f"{pack['Recall_Accuracy']:.2%}", delta_cell if tag == "baseline" else "—"]
        for tag, pack in packs.items()
    ]
    df = pd.DataFrame(summary_rows, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])

    if DEBUG:
        print("\n--- WORKSPACE & ABLATIONS FINAL RESULTS ---")
        print(json.dumps(packs, indent=2))
    return verdict, df, packs
# --- Tab 2: Halting Test Function (Corrected) ---
def run_halting_and_display(model_id, seed, prompt_type, num_runs, max_steps, timeout, progress=gr.Progress(track_tqdm=True)):
    """Run the halting test and build a Markdown summary of its per-run statistics.

    Args:
        model_id: Model identifier forwarded unchanged to ``run_halting_test``.
        seed: Seed value; coerced to ``int`` before the call.
        prompt_type: Test type forwarded unchanged to ``run_halting_test``.
        num_runs: Number of runs; coerced to ``int`` before the call.
        max_steps: Step limit per run; coerced to ``int`` before the call.
        timeout: Timeout value; coerced to ``int`` before the call.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        A ``(full_verdict, results)`` tuple: the runner's verdict text followed
        by a one-line statistics summary, and the runner's result dict. The
        ``"verdict"`` key is popped from ``results`` before it is returned.
    """
    progress(0, desc=f"Starting Halting Test ({num_runs} runs)...")
    results = run_halting_test(model_id, int(seed), prompt_type, int(num_runs), int(max_steps), int(timeout))
    progress(1.0, desc="Halting test complete.")

    verdict_text = results.pop("verdict")
    details = results["details"]

    if details:
        # The statistics live on each per-run entry, so aggregate across runs.
        mean_steps = statistics.mean([r['steps_taken'] for r in details])
        # Per-run timings are in seconds; convert to milliseconds for display.
        mean_time_per_step = statistics.mean([r['mean_step_time_s'] for r in details]) * 1000
        # NOTE: this is the mean of the per-run standard deviations, not a
        # standard deviation computed over all steps of all runs.
        stdev_time_per_step = statistics.mean([r['stdev_step_time_s'] for r in details]) * 1000
        timeouts = sum(1 for r in details if r['timed_out'])
        stats_md = (
            f"**Runs:** {len(details)} | "
            f"**Avg Steps:** {mean_steps:.1f} | "
            f"**Avg Time/Step:** {mean_time_per_step:.2f}ms (StdDev: {stdev_time_per_step:.2f}ms) | "
            f"**Timeouts:** {timeouts}"
        )
    else:
        # statistics.mean raises StatisticsError on an empty sequence; report
        # the absence of run data instead of crashing the UI callback.
        stats_md = "**Runs:** 0 | No run details were recorded."

    full_verdict = f"{verdict_text}\n\n{stats_md}"
    if DEBUG:
        print("\n--- COMPUTATIONAL DYNAMICS & HALTING TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))
    return full_verdict, results
# --- Gradio App Definition ---
# NOTE(review): the nesting below was reconstructed from a whitespace-stripped
# copy of this file; confirm the layout against the running app.
with gr.Blocks(theme=theme, title="BP-Φ Suite 2.4") as demo:
    gr.Markdown("# 🧠 BP-Φ Suite 2.4: Mechanistic Probes for Phenomenal-Candidate Behavior")

    with gr.Tabs():
        # --- TAB 1: WORKSPACE & ABLATIONS ---
        with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
            gr.Markdown("Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis.")
            with gr.Row():
                # Left column: run parameters and the trigger button.
                with gr.Column(scale=1):
                    ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ws_trials = gr.Slider(minimum=3, maximum=30, value=5, step=1, label="Number of Scenarios")
                    ws_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    ws_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                    ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
                    ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
                # Right column: verdict, summary table and collapsible raw output.
                with gr.Column(scale=2):
                    ws_verdict = gr.Markdown("### Results will appear here.")
                    ws_summary_df = gr.DataFrame(label="Summary Metrics")
                    with gr.Accordion("Raw JSON Output", open=False):
                        ws_raw_json = gr.JSON()
            ws_run_btn.click(
                fn=run_workspace_and_display,
                inputs=[ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl],
                outputs=[ws_verdict, ws_summary_df, ws_raw_json],
            )

        # --- TAB 2: COMPUTATIONAL DYNAMICS & HALTING ---
        with gr.TabItem("2. Computational Dynamics & Halting"):
            gr.Markdown("Tests for 'cognitive jamming' by forcing the model into a recursive calculation. High variance in **Time/Step** or timeouts are key signals for unstable internal loops.")
            with gr.Row():
                with gr.Column(scale=1):
                    ch_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ch_prompt_type = gr.Radio(choices=["control_math", "collatz_sequence"], label="Test Type", value="control_math")
                    ch_master_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Master Seed")
                    ch_num_runs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Runs")
                    ch_max_steps = gr.Slider(minimum=10, maximum=200, value=50, step=10, label="Max Steps per Run")
                    ch_timeout = gr.Slider(minimum=10, maximum=300, value=120, step=10, label="Total Timeout (seconds)")
                    ch_run_btn = gr.Button("Run Halting Dynamics Test", variant="primary")
                with gr.Column(scale=2):
                    ch_verdict = gr.Markdown("### Results will appear here.")
                    with gr.Accordion("Raw Run Details (JSON)", open=False):
                        ch_results = gr.JSON()
            # Input order follows run_halting_and_display's signature:
            # (model_id, seed, prompt_type, num_runs, max_steps, timeout).
            ch_run_btn.click(
                fn=run_halting_and_display,
                inputs=[ch_model_id, ch_master_seed, ch_prompt_type, ch_num_runs, ch_max_steps, ch_timeout],
                outputs=[ch_verdict, ch_results],
            )

        # --- TAB 3: COGNITIVE SEISMOGRAPH ---
        with gr.TabItem("3. Cognitive Seismograph"):
            gr.Markdown("Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal.")
            with gr.Row():
                with gr.Column(scale=1):
                    cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    cs_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
                with gr.Column(scale=2):
                    cs_results = gr.JSON(label="Activation Similarity Results")
            # The runner is wired directly; its return value feeds the JSON view.
            cs_run_btn.click(fn=run_seismograph_suite, inputs=[cs_model_id, cs_seed], outputs=cs_results)

        # --- TAB 4: SYMBOLIC SHOCK TEST ---
        with gr.TabItem("4. Symbolic Shock Test"):
            gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations**.")
            with gr.Row():
                with gr.Column(scale=1):
                    ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ss_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    ss_run_btn = gr.Button("Run Shock Test", variant="primary")
                with gr.Column(scale=2):
                    ss_results = gr.JSON(label="Shock Test Results")
            # The runner is wired directly; its return value feeds the JSON view.
            ss_run_btn.click(fn=run_shock_test_suite, inputs=[ss_model_id, ss_seed], outputs=ss_results)
if __name__ == "__main__":
    # When run as a script, serve the app on all network interfaces, port 7860.
    launch_kwargs = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_kwargs)
|