File size: 4,095 Bytes
0916370
2f0addb
0916370
 
 
e40ba5b
0a1cc8d
2f0addb
88c294a
e40ba5b
88c294a
 
0916370
2f0addb
e40ba5b
 
 
 
 
 
 
 
 
 
 
88c294a
e40ba5b
0a1cc8d
e40ba5b
 
 
 
 
0a1cc8d
 
 
 
88c294a
0a1cc8d
 
 
e593b84
0a1cc8d
e40ba5b
 
 
 
25c13d7
e40ba5b
 
 
e593b84
e40ba5b
e593b84
0916370
0a1cc8d
 
 
 
 
 
 
e40ba5b
 
 
 
 
 
 
 
 
 
 
 
 
 
0a1cc8d
e40ba5b
 
 
 
 
 
 
2f0addb
0916370
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_agentic_workspace_test
from bp_phi.runner_utils import DEBUG

# --- UI Theme and Layout ---
# Soft base theme (teal primary hue, green secondary hue) with overrides for
# the page/block backgrounds, the block border and the primary button.
theme = gr.themes.Soft(
    primary_hue="teal",
    secondary_hue="green",
).set(
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)

# --- Main Function ---
def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
    """Run the agentic workspace test under every ablation and summarise it.

    Args:
        model_id: Model identifier, forwarded unchanged to
            ``run_agentic_workspace_test``.
        seed: Master seed; coerced to ``int`` before each run.
        temperature: Sampling temperature; coerced to ``float`` before each run.
        progress: Gradio progress callback. It is updated before each ablation
            starts and once more when all of them have finished.

    Returns:
        A ``(verdict, df, results)`` tuple:
        ``verdict`` is a Markdown string comparing baseline recall with the
        ``recurrence_off`` recall, ``df`` is a two-column DataFrame with one
        row per ablation condition, and ``results`` maps each ablation name to
        the raw result object returned by the runner.
    """
    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
    results = {}

    for i, ablation in enumerate(ablations):
        # Report the fraction of ablations already *finished*, so the bar
        # starts at 0 and only reaches 100% once the last run has completed.
        progress(i / len(ablations), desc=f"Running Ablation: {ablation}...")
        # The unablated baseline is passed to the runner as None.
        current_ablation = None if ablation == "baseline" else ablation
        results[ablation] = run_agentic_workspace_test(
            model_id, int(seed), float(temperature), current_ablation
        )

    progress(1.0, desc="Analysis complete.")

    # --- Analysis & Verdict ---
    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]

    # Absolute (not relative) drop in recall accuracy when recurrence is off.
    # NOTE(review): presumably accuracies are fractions in [0, 1], given the
    # percent formatting below -- confirm against the runner.
    delta_phi = base_recall - recurrence_off_recall

    if delta_phi > 0.5:  # Recall fell by more than 0.5 in absolute terms
        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
                   "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
                   "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
    else:
        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
                   "Disabling the recurrent memory did not significantly impact recall accuracy. "
                   "This suggests the model is still relying on its internal context window, or the tasks are too simple.")

    # --- Format DataFrame ---
    # One row per ablation, with the recall accuracy rendered as a percentage.
    df_data = [
        [ablation, f"{result['Overall_Recall_Accuracy']:.2%}"]
        for ablation, result in results.items()
    ]
    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])

    if DEBUG:
        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return verdict, df, results

# --- Gradio App Definition ---
# Page layout: a title and introduction, then a row with the controls on the
# left (scale 1) and the result widgets on the right (scale 2).
with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
    gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
    gr.Markdown(
        "This definitive experiment tests for a causally effective working memory in LLMs. "
        "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
        "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
    )

    with gr.Row():
        # Left column: run parameters and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Master Control")
            with gr.Group():
                model_id = gr.Textbox(label="Model ID", value="google/gemma-3-1b-it")
                seed = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=42,
                    step=1,
                    label="Master Seed",
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.05,
                    label="Temperature (Low for determinism)",
                )
            run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")

        # Right column: verdict text, summary table and collapsed raw JSON.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Verdict & Results")
            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
            with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
                raw_json = gr.JSON()

    # Clicking the button calls run_full_evaluation with the three control
    # values; its three return values fill the verdict, table and JSON views.
    run_btn.click(
        run_full_evaluation,
        [model_id, seed, temperature],
        [verdict_display, summary_df, raw_json],
    )

if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )