Spaces:
Sleeping
Sleeping
File size: 4,095 Bytes
0916370 2f0addb 0916370 e40ba5b 0a1cc8d 2f0addb 88c294a e40ba5b 88c294a 0916370 2f0addb e40ba5b 88c294a e40ba5b 0a1cc8d e40ba5b 0a1cc8d 88c294a 0a1cc8d e593b84 0a1cc8d e40ba5b 25c13d7 e40ba5b e593b84 e40ba5b e593b84 0916370 0a1cc8d e40ba5b 0a1cc8d e40ba5b 2f0addb 0916370 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_agentic_workspace_test
from bp_phi.runner_utils import DEBUG
# --- UI Theme and Layout ---
theme = gr.themes.Soft(
    primary_hue="teal",
    secondary_hue="green",
).set(
    # Page and block surfaces.
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    # Primary button colours; "*primary_500" refers to a shade of the
    # primary hue chosen above rather than to a literal colour.
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)
# --- Main Function ---
def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
    """Run the agentic workspace test under every ablation and summarise it.

    The test is run once per condition ("baseline" plus three ablations) with
    the same model, seed and temperature. The verdict is derived from the
    difference in recall accuracy between "baseline" and "recurrence_off".

    Args:
        model_id: Model identifier, forwarded unchanged to the runner.
        seed: Master seed; converted with ``int()`` before being forwarded.
        temperature: Sampling temperature; converted with ``float()`` before
            being forwarded.
        progress: Gradio progress tracker used to report per-ablation status.

    Returns:
        A ``(verdict, df, results)`` tuple: the Markdown verdict text, a
        DataFrame with one formatted recall-accuracy row per condition, and
        the raw runner results keyed by condition name.

    Raises:
        KeyError: If a runner result has no ``"Overall_Recall_Accuracy"``
            entry (assuming the runner returns dicts -- TODO confirm).
    """
    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
    results = {}
    for i, ablation in enumerate(ablations):
        # Report the fraction already finished. Using (i + 1) here would show
        # 100% while the final ablation is still running.
        progress(i / len(ablations), desc=f"Running Ablation: {ablation}...")
        # The runner takes None for the un-ablated baseline condition.
        current_ablation = None if ablation == "baseline" else ablation
        results[ablation] = run_agentic_workspace_test(
            model_id, int(seed), float(temperature), current_ablation
        )
    progress(1.0, desc="Analysis complete.")

    # --- Analysis & Verdict ---
    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
    delta_phi = base_recall - recurrence_off_recall
    # delta_phi is an absolute difference between the two accuracy values,
    # not a relative drop: the threshold is 0.5 on the accuracy scale.
    if delta_phi > 0.5:
        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
                   "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
                   "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
    else:
        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
                   "Disabling the recurrent memory did not significantly impact recall accuracy. "
                   "This suggests the model is still relying on its internal context window, or the tasks are too simple.")

    # --- Format DataFrame ---
    df_data = [
        [ablation, f"{result['Overall_Recall_Accuracy']:.2%}"]
        for ablation, result in results.items()
    ]
    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])

    if DEBUG:
        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))
    return verdict, df, results
# --- Gradio App Definition ---
with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
    # Page title followed by a short description of the experiment.
    gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
    gr.Markdown(
        "This definitive experiment tests for a causally effective working memory in LLMs. "
        "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
        "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
    )

    with gr.Row():
        # Narrow column: run parameters and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Master Control")
            with gr.Group():
                model_id = gr.Textbox(
                    value="google/gemma-3-1b-it",
                    label="Model ID",
                )
                seed = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=42,
                    step=1,
                    label="Master Seed",
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.05,
                    label="Temperature (Low for determinism)",
                )
                # NOTE(review): the original nesting of this button (inside
                # the group vs. directly in the column) could not be
                # recovered -- confirm against the intended layout.
                run_btn = gr.Button(
                    value="Run Full Evaluation Suite",
                    variant="primary",
                )

        # Wide column: verdict text, summary table and collapsible raw output.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Verdict & Results")
            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
            with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
                raw_json = gr.JSON()

    # The three values returned by run_full_evaluation fill the three output
    # components in the order listed here.
    run_btn.click(
        fn=run_full_evaluation,
        inputs=[model_id, seed, temperature],
        outputs=[verdict_display, summary_df, raw_json],
    )
if __name__ == "__main__":
    # Start the web server only when this file is run as a script;
    # "0.0.0.0" makes it listen on all network interfaces.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )
|