File size: 3,270 Bytes
0916370
2f0addb
0916370
 
 
e40ba5b
4d89931
 
2f0addb
88c294a
e40ba5b
88c294a
 
0916370
2f0addb
e40ba5b
 
 
 
 
 
 
 
 
 
 
88c294a
e40ba5b
 
 
 
 
 
4d89931
 
88c294a
4d89931
e593b84
e40ba5b
 
 
 
25c13d7
e40ba5b
 
 
e593b84
e40ba5b
e593b84
0916370
4d89931
 
 
e40ba5b
 
 
 
 
 
 
 
 
 
 
 
 
 
0a1cc8d
e40ba5b
 
 
 
 
 
 
2f0addb
0916370
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_agentic_workspace_test

# Truthy => run_full_evaluation also prints the raw results to stdout as JSON.
DEBUG = 1

# --- UI Theme and Layout ---
# Start from the Soft theme with teal/green hues, then override individual
# style variables through .set().
theme = gr.themes.Soft(
    primary_hue="teal",
    secondary_hue="green",
).set(
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)

# --- Main Function ---
def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
    """Run the agentic workspace test under every ablation and summarise recall.

    The test is executed once for the baseline and once for each ablation
    ("recurrence_off", "workspace_unlimited", "random_workspace"). The verdict
    is derived from the recall gap between the baseline and "recurrence_off".

    Args:
        model_id: Model identifier, forwarded unchanged to
            ``run_agentic_workspace_test``.
        seed: Master seed; coerced with ``int()`` before being passed on.
        temperature: Sampling temperature; coerced with ``float()`` before
            being passed on.
        progress: Gradio progress tracker updated as the ablations run.

    Returns:
        A ``(verdict, df, results)`` tuple: a Markdown verdict string, a
        DataFrame with one row per ablation and its recall accuracy formatted
        as a percentage, and the raw per-ablation results keyed by ablation
        name.

    Raises:
        KeyError: presumably, if a result does not provide
            ``"Overall_Recall_Accuracy"`` (assumes each result is a dict --
            confirm against ``run_agentic_workspace_test``).
    """
    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
    results = {}

    for i, ablation in enumerate(ablations):
        # Report the fraction already completed. Using (i + 1) here would show
        # 100% while the final ablation is still running.
        progress(i / len(ablations), desc=f"Running Ablation: {ablation}...")
        # The runner receives None for the unablated baseline condition.
        current_ablation = None if ablation == "baseline" else ablation
        results[ablation] = run_agentic_workspace_test(
            model_id, int(seed), float(temperature), current_ablation
        )

    progress(1.0, desc="Analysis complete.")

    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]

    # Recall lost when recurrence is switched off.
    delta_phi = base_recall - recurrence_off_recall

    if delta_phi > 0.5:
        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
    else:
        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")

    # One summary row per ablation, in the order the ablations were run.
    df_data = [
        [ablation, f"{result['Overall_Recall_Accuracy']:.2%}"]
        for ablation, result in results.items()
    ]
    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])

    if DEBUG:
        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return verdict, df, results

# --- Gradio App Definition ---
# Two-column layout: run controls on the left, verdict/results on the right.
with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
    gr.Markdown("This experiment tests for a causally effective working memory. The model must follow a reason-act loop to interact with a controlled, external memory.")

    with gr.Row():
        # Left column: inputs passed to run_full_evaluation.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Master Control")
            with gr.Group():
                model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                seed = gr.Slider(1, 1000, 42, step=1, label="Master Seed")
                temperature = gr.Slider(0.0, 1.0, 0.1, step=0.05, label="Temperature (Low for determinism)")
            run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")

        # Right column: outputs filled from run_full_evaluation's return tuple.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Verdict & Results")
            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
            with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
                raw_json = gr.JSON()

    # The order of `outputs` must match the (verdict, df, results) tuple
    # returned by run_full_evaluation.
    run_btn.click(
        fn=run_full_evaluation,
        inputs=[model_id, seed, temperature],
        outputs=[verdict_display, summary_df, raw_json]
    )

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    # 0.0.0.0 makes the app listen on all network interfaces, on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)