Spaces:
Sleeping
Sleeping
| # app.py | |
| import gradio as gr | |
| import json | |
| import statistics | |
| import pandas as pd | |
| from bp_phi.runner import run_agentic_workspace_test | |
# Any truthy value makes run_full_evaluation print the raw results as JSON.
DEBUG = 1

# --- UI Theme and Layout ---
# Soft base theme with teal/green hues, followed by explicit overrides for
# the page background, block styling and the primary button colours.
theme = gr.themes.Soft(
    primary_hue="teal",
    secondary_hue="green",
).set(
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)
| # --- Main Function --- | |
def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
    """Run the agentic workspace test under every ablation and summarise it.

    Each condition is executed once through ``run_agentic_workspace_test``.
    The "baseline" condition is passed to the runner as ``None``; every other
    condition is passed by name.  The verdict compares the recall accuracy of
    the baseline run against the ``recurrence_off`` run.

    Args:
        model_id: Model identifier, forwarded unchanged to the runner.
        seed: Master seed; converted with ``int`` before the call.
        temperature: Sampling temperature; converted with ``float`` before
            the call.
        progress: Gradio progress tracker used to report per-ablation status.

    Returns:
        A ``(verdict, df, results)`` tuple: the Markdown verdict string, a
        DataFrame with one row per ablation holding its recall accuracy as a
        percentage string, and the raw mapping of ablation name to the
        runner's result.

    Raises:
        KeyError: If a runner result has no ``"Overall_Recall_Accuracy"`` key.
    """
    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
    total = len(ablations)
    results = {}

    for index, ablation in enumerate(ablations):
        # Report the fraction that is already finished.  Using (index + 1)
        # here would show a full bar while the last ablation is still running.
        progress(index / total, desc=f"Running Ablation: {ablation}...")
        # The runner expects None for the unmodified baseline condition.
        current_ablation = None if ablation == "baseline" else ablation
        results[ablation] = run_agentic_workspace_test(
            model_id, int(seed), float(temperature), current_ablation
        )

    progress(1.0, desc="Analysis complete.")

    # ΔΦ: recall lost when recurrence is switched off.
    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
    delta_phi = base_recall - recurrence_off_recall

    if delta_phi > 0.5:
        verdict = f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n..."
    else:
        verdict = f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n..."

    # One table row per condition, with recall rendered as a percentage.
    rows = [
        [name, f"{outcome['Overall_Recall_Accuracy']:.2%}"]
        for name, outcome in results.items()
    ]
    df = pd.DataFrame(rows, columns=["Ablation Condition", "Recall Accuracy"])

    if DEBUG:
        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return verdict, df, results
| # --- Gradio App Definition --- | |
# NOTE(review): the nesting below is reconstructed from a flattened listing;
# confirm that the run button belongs directly under the left column (after
# the input group) rather than inside the group.
with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
    # Page header and a short description of the experiment.
    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
    gr.Markdown(
        "This experiment tests for a causally effective working memory. "
        "The model acts as an agent, using tools (`read`, `write`) to "
        "interact with a controlled, external memory."
    )

    with gr.Row():
        # Left column: run parameters and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Master Control")
            with gr.Group():
                model_id = gr.Textbox(
                    value="google/gemma-3-1b-it",
                    label="Model ID",
                )
                seed = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=42,
                    step=1,
                    label="Master Seed",
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.05,
                    label="Temperature (Low for determinism)",
                )
            run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")

        # Right column: verdict text, summary table and the raw results.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Verdict & Results")
            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
            with gr.Accordion("Raw JSON Output", open=False):
                raw_json = gr.JSON()

    # The three return values of run_full_evaluation map, in order, onto the
    # verdict text, the summary table and the raw JSON viewer.
    run_btn.click(
        fn=run_full_evaluation,
        inputs=[model_id, seed, temperature],
        outputs=[verdict_display, summary_df, raw_json],
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on every network interface, port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )