Spaces:
Sleeping
Sleeping
File size: 3,270 Bytes
0916370 2f0addb 0916370 e40ba5b 4d89931 2f0addb 88c294a e40ba5b 88c294a 0916370 2f0addb e40ba5b 88c294a e40ba5b 4d89931 88c294a 4d89931 e593b84 e40ba5b 25c13d7 e40ba5b e593b84 e40ba5b e593b84 0916370 4d89931 e40ba5b 0a1cc8d e40ba5b 2f0addb 0916370 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_agentic_workspace_test
# Truthy value makes run_full_evaluation print the raw results to stdout.
DEBUG = 1

# --- UI Theme and Layout ---
# Overrides applied on top of the stock "Soft" theme.
_THEME_OVERRIDES = {
    "body_background_fill": "#f0f4f9",
    "block_background_fill": "white",
    "block_border_width": "1px",
    "button_primary_background_fill": "*primary_500",
    "button_primary_text_color": "white",
}
_base_theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green")
theme = _base_theme.set(**_THEME_OVERRIDES)
# --- Main Function ---
def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
    """Run the agentic workspace test under every ablation and summarise it.

    Args:
        model_id: Model identifier, forwarded unchanged to
            ``run_agentic_workspace_test``.
        seed: Master seed; coerced to ``int`` before each run.
        temperature: Sampling temperature; coerced to ``float`` before each run.
        progress: Gradio progress tracker used to report per-ablation progress.

    Returns:
        A ``(verdict, df, results)`` tuple: the Markdown verdict string, a
        two-column DataFrame with the formatted recall accuracy of each
        condition, and the raw results keyed by ablation name.
    """
    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
    total = len(ablations)
    results = {}

    for i, ablation in enumerate(ablations):
        # Report the fraction already finished, not the fraction about to be
        # finished, so the bar does not show 100% while the last run is active.
        progress(i / total, desc=f"Running Ablation: {ablation}...")
        # The runner receives None for the unablated baseline condition.
        current_ablation = None if ablation == "baseline" else ablation
        results[ablation] = run_agentic_workspace_test(
            model_id, int(seed), float(temperature), current_ablation
        )

    progress(1.0, desc="Analysis complete.")

    # Delta-Phi: recall lost when recurrence is switched off, relative to baseline.
    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
    delta_phi = base_recall - recurrence_off_recall

    if delta_phi > 0.5:
        verdict = f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n..."
    else:
        verdict = f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n..."

    # One row per ablation, with accuracy rendered as a percentage string.
    df_data = [
        [ablation, f"{result['Overall_Recall_Accuracy']:.2%}"]
        for ablation, result in results.items()
    ]
    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])

    if DEBUG:
        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return verdict, df, results
# --- Gradio App Definition ---
# NOTE(review): the source lost its indentation; the nesting below is
# reconstructed from the context managers. In particular, run_btn is assumed
# to sit in the left column after the parameter group -- confirm.
with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
    # Page title and a one-paragraph description of the experiment.
    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
    gr.Markdown(
        "This experiment tests for a causally effective working memory. The model must follow a reason-act loop to interact with a controlled, external memory."
    )

    with gr.Row():
        # Left column: run parameters and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Master Control")
            with gr.Group():
                model_id = gr.Textbox(
                    label="Model ID",
                    value="google/gemma-3-1b-it",
                )
                seed = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=42,
                    step=1,
                    label="Master Seed",
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.05,
                    label="Temperature (Low for determinism)",
                )
            run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")

        # Right column: verdict text, summary table, and collapsible raw output.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Verdict & Results")
            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
            with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
                raw_json = gr.JSON()

    # The handler's three return values map, in order, onto the three outputs.
    evaluation_inputs = [model_id, seed, temperature]
    evaluation_outputs = [verdict_display, summary_df, raw_json]
    run_btn.click(
        fn=run_full_evaluation,
        inputs=evaluation_inputs,
        outputs=evaluation_outputs,
    )
if __name__ == "__main__":
    # Start the app only when executed as a script, listening on 0.0.0.0:7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)
|