File size: 8,326 Bytes
0916370
2f0addb
0916370
 
 
e593b84
 
2f0addb
88c294a
 
 
 
0916370
2f0addb
88c294a
 
2f0addb
0916370
2f0addb
0916370
88c294a
2f0addb
 
0916370
 
88c294a
0916370
 
88c294a
0916370
88c294a
 
 
2f0addb
88c294a
 
e593b84
88c294a
 
e593b84
2f0addb
88c294a
2f0addb
88c294a
 
0916370
e593b84
 
 
 
88c294a
2f0addb
b170ba4
 
e593b84
b170ba4
e593b84
 
 
b170ba4
 
 
 
 
 
 
e593b84
 
b170ba4
 
 
 
e593b84
 
 
 
 
b170ba4
e593b84
 
 
 
0916370
b170ba4
 
88c294a
 
 
 
 
 
 
 
 
b170ba4
88c294a
 
 
 
 
 
 
 
 
 
b170ba4
 
 
88c294a
 
e593b84
b170ba4
 
e593b84
b170ba4
 
 
88c294a
e593b84
b170ba4
e593b84
b170ba4
88c294a
 
 
 
 
 
 
b170ba4
88c294a
 
 
 
 
 
 
e593b84
88c294a
 
 
b170ba4
88c294a
 
 
 
2f0addb
0916370
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_workspace_suite, run_halting_test, run_seismograph_suite, run_shock_test_suite
from bp_phi.runner_utils import dbg, DEBUG

# --- UI Theme and Layout ---
# Start from Gradio's built-in "Soft" theme with a blue/sky palette, then
# override a handful of individual style variables via `.set(...)`.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="sky",
).set(
    # Page and block surfaces.
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    # Primary buttons ("*primary_500" refers to a shade of the primary hue).
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)

# --- Tab 1: Workspace & Ablations Functions ---
def run_workspace_and_display(model_id, trials, seed, temperature, run_ablations, progress=gr.Progress(track_tqdm=True)):
    """Run the workspace suite (baseline plus optional ablations) and format the results.

    Args:
        model_id: Model identifier forwarded unchanged to ``run_workspace_suite``.
        trials: Number of scenarios; coerced to ``int``.
        seed: Random seed; coerced to ``int``.
        temperature: Sampling temperature; coerced to ``float``.
        run_ablations: When truthy, the suite is re-run once for each of the
            three ablation modes in addition to the baseline.
        progress: Gradio progress tracker used to report per-run progress.

    Returns:
        A 3-tuple ``(verdict, df, packs)``:
        ``verdict`` is a Markdown string, ``df`` is a ``pandas.DataFrame`` with
        one summary row per run, and ``packs`` maps each run name
        (``"baseline"`` or the ablation mode) to the raw result returned by
        ``run_workspace_suite``.
    """
    # Coerce the UI values once instead of once per suite invocation.
    n_trials, rng_seed, temp = int(trials), int(seed), float(temperature)
    ablation_modes = ["recurrence_off", "workspace_unlimited", "random_workspace"] if run_ablations else []
    total_runs = len(ablation_modes) + 1  # baseline + ablations

    progress(0, desc="Running Baseline...")
    packs = {"baseline": run_workspace_suite(model_id, n_trials, rng_seed, temp, None)}

    for i, ab in enumerate(ablation_modes, start=1):
        progress(i / total_runs, desc=f"Running Ablation: {ab}...")
        packs[ab] = run_workspace_suite(model_id, n_trials, rng_seed, temp, ab)

    progress(1.0, desc="Analysis complete.")

    # ΔΦ = baseline PCS minus the mean PCS over all ablation runs.  Without
    # ablations there is nothing to compare against, so ΔΦ is left undefined
    # rather than reported as 0.0 (which would read as a negative finding).
    base_pcs = packs["baseline"]["PCS"]
    ab_pcs_values = [packs[ab]["PCS"] for ab in ablation_modes]
    delta_phi = float(base_pcs - statistics.mean(ab_pcs_values)) if ab_pcs_values else None

    if delta_phi is None:
        verdict = ("### ℹ️ ΔΦ Not Computed\n"
                   "Ablations were disabled, so only the baseline was run and no comparison could be made.")
        delta_cell = "—"
    elif delta_phi > 0.05:
        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.3f})\n"
                   "Performance dropped under ablations, suggesting the model functionally depends on its workspace.")
        delta_cell = f"{delta_phi:.3f}"
    else:
        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.3f})\n"
                   "No significant performance drop was observed. The model behaves like a functional zombie.")
        delta_cell = f"{delta_phi:.3f}"

    # ΔΦ is a single aggregate, so it is shown only on the baseline row.
    rows = [
        [
            tag,
            f"{pack['PCS']:.3f}",
            f"{pack['Recall_Accuracy']:.2%}",
            delta_cell if tag == "baseline" else "—",
        ]
        for tag, pack in packs.items()
    ]
    df = pd.DataFrame(rows, columns=["Run", "PCS", "Recall Accuracy", "ΔΦ"])

    if DEBUG:
        print("\n--- WORKSPACE & ABLATIONS FINAL RESULTS ---")
        print(json.dumps(packs, indent=2))

    return verdict, df, packs

# --- Tab 2: Halting Test Function (Corrected) ---
def run_halting_and_display(model_id, seed, prompt_type, num_runs, max_steps, timeout, progress=gr.Progress(track_tqdm=True)):
    """Run the halting test and build a Markdown summary of its per-run statistics.

    Args:
        model_id: Model identifier forwarded unchanged to ``run_halting_test``.
        seed: Master seed; coerced to ``int``.
        prompt_type: Test type forwarded unchanged to ``run_halting_test``.
        num_runs: Number of runs; coerced to ``int``.
        max_steps: Maximum steps per run; coerced to ``int``.
        timeout: Timeout value; coerced to ``int``.
        progress: Gradio progress tracker used to report start and completion.

    Returns:
        A 2-tuple ``(full_verdict, results)``: the verdict text followed by a
        one-line statistics summary (Markdown), and the results dict returned
        by ``run_halting_test`` with its ``"verdict"`` entry removed.
    """
    progress(0, desc=f"Starting Halting Test ({num_runs} runs)...")
    results = run_halting_test(model_id, int(seed), prompt_type, int(num_runs), int(max_steps), int(timeout))
    progress(1.0, desc="Halting test complete.")

    # The verdict is rendered as Markdown, so it is taken out of the dict that
    # is handed to the raw-JSON view.
    verdict_text = results.pop("verdict")
    details = results["details"]

    if details:
        mean_steps = statistics.mean(r['steps_taken'] for r in details)
        # Per-run timings are stored in seconds (``*_s`` keys); shown in ms.
        mean_time_per_step = statistics.mean(r['mean_step_time_s'] for r in details) * 1000
        # This is the average of the per-run standard deviations, not a
        # standard deviation computed across runs.
        mean_stdev_per_step = statistics.mean(r['stdev_step_time_s'] for r in details) * 1000
        timeouts = sum(1 for r in details if r['timed_out'])

        stats_md = (
            f"**Runs:** {len(details)} | "
            f"**Avg Steps:** {mean_steps:.1f} | "
            f"**Avg Time/Step:** {mean_time_per_step:.2f}ms (StdDev: {mean_stdev_per_step:.2f}ms) | "
            f"**Timeouts:** {timeouts}"
        )
    else:
        # statistics.mean() raises StatisticsError on empty input; report the
        # absence of data instead of crashing the UI callback.
        stats_md = "**Runs:** 0 | No run details were returned, so timing statistics are unavailable."

    full_verdict = f"{verdict_text}\n\n{stats_md}"

    if DEBUG:
        print("\n--- COMPUTATIONAL DYNAMICS & HALTING TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return full_verdict, results

# --- Gradio App Definition ---
# Top-level UI: one tab per probe.  Each tab has a narrow input column
# (scale=1), a wider output column (scale=2), and a button whose click event
# is wired to the corresponding runner function.
with gr.Blocks(theme=theme, title="BP-Φ Suite 2.4") as demo:
    gr.Markdown("# 🧠 BP-Φ Suite 2.4: Mechanistic Probes for Phenomenal-Candidate Behavior")

    with gr.Tabs():
        # --- TAB 1: WORKSPACE & ABLATIONS ---
        with gr.TabItem("1. Workspace & Ablations (ΔΦ Test)"):
            gr.Markdown(
                "Tests if memory performance depends on a recurrent workspace. A significant **ΔΦ > 0** supports the hypothesis."
            )
            with gr.Row():
                # Inputs.
                with gr.Column(scale=1):
                    ws_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ws_trials = gr.Slider(minimum=3, maximum=30, value=5, step=1, label="Number of Scenarios")
                    ws_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    ws_temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
                    ws_run_abl = gr.Checkbox(value=True, label="Run Ablations")
                    ws_run_btn = gr.Button("Run ΔΦ Evaluation", variant="primary")
                # Outputs.
                with gr.Column(scale=2):
                    ws_verdict = gr.Markdown("### Results will appear here.")
                    ws_summary_df = gr.DataFrame(label="Summary Metrics")
                    with gr.Accordion("Raw JSON Output", open=False):
                        ws_raw_json = gr.JSON()
            ws_run_btn.click(
                fn=run_workspace_and_display,
                inputs=[ws_model_id, ws_trials, ws_seed, ws_temp, ws_run_abl],
                outputs=[ws_verdict, ws_summary_df, ws_raw_json],
            )

        # --- TAB 2: COMPUTATIONAL DYNAMICS & HALTING ---
        with gr.TabItem("2. Computational Dynamics & Halting"):
            gr.Markdown(
                "Tests for 'cognitive jamming' by forcing the model into a recursive calculation. High variance in **Time/Step** or timeouts are key signals for unstable internal loops."
            )
            with gr.Row():
                # Inputs.
                with gr.Column(scale=1):
                    ch_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ch_prompt_type = gr.Radio(
                        ["control_math", "collatz_sequence"],
                        label="Test Type",
                        value="control_math",
                    )
                    ch_master_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Master Seed")
                    ch_num_runs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Runs")
                    ch_max_steps = gr.Slider(minimum=10, maximum=200, value=50, step=10, label="Max Steps per Run")
                    ch_timeout = gr.Slider(minimum=10, maximum=300, value=120, step=10, label="Total Timeout (seconds)")
                    ch_run_btn = gr.Button("Run Halting Dynamics Test", variant="primary")
                # Outputs.
                with gr.Column(scale=2):
                    ch_verdict = gr.Markdown("### Results will appear here.")
                    with gr.Accordion("Raw Run Details (JSON)", open=False):
                        ch_results = gr.JSON()
            # Input order matches run_halting_and_display's parameters:
            # (model_id, seed, prompt_type, num_runs, max_steps, timeout).
            ch_run_btn.click(
                fn=run_halting_and_display,
                inputs=[ch_model_id, ch_master_seed, ch_prompt_type, ch_num_runs, ch_max_steps, ch_timeout],
                outputs=[ch_verdict, ch_results],
            )

        # --- TAB 3: COGNITIVE SEISMOGRAPH ---
        with gr.TabItem("3. Cognitive Seismograph"):
            gr.Markdown(
                "Records internal neural activations to find the 'fingerprint' of a memory being recalled. **High Recall-vs-Encode similarity** is the key signal."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    cs_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    cs_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    cs_run_btn = gr.Button("Run Seismograph Analysis", variant="primary")
                with gr.Column(scale=2):
                    cs_results = gr.JSON(label="Activation Similarity Results")
            # The runner is wired directly; its return value feeds the JSON view.
            cs_run_btn.click(
                fn=run_seismograph_suite,
                inputs=[cs_model_id, cs_seed],
                outputs=cs_results,
            )

        # --- TAB 4: SYMBOLIC SHOCK TEST ---
        with gr.TabItem("4. Symbolic Shock Test"):
            gr.Markdown(
                "Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations**."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    ss_seed = gr.Slider(minimum=1, maximum=1000, value=42, step=1, label="Seed")
                    ss_run_btn = gr.Button("Run Shock Test", variant="primary")
                with gr.Column(scale=2):
                    ss_results = gr.JSON(label="Shock Test Results")
            # The runner is wired directly; its return value feeds the JSON view.
            ss_run_btn.click(
                fn=run_shock_test_suite,
                inputs=[ss_model_id, ss_seed],
                outputs=ss_results,
            )

if __name__ == "__main__":
    # Start the web server only when executed as a script.  "0.0.0.0" binds
    # every network interface; the app is served on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )