Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import json, statistics | |
| from bp_phi.runner import run_suite | |
# Known ablation identifiers.
# NOTE(review): nothing in the visible part of this file reads this list;
# run_all hard-codes its own subset — confirm whether the two should agree.
ABLATIONS = [
    "none",
    "recurrence_off",
    "workspace_unlimited",
    "sham_meta",
    "random_workspace",
]
def _fmt_metric(value, spec=""):
    """Format *value* with format-spec *spec*, or return an em dash for None."""
    return "—" if value is None else format(value, spec)


def run_all(model_id, trials, temperature, run_ablations):
    """Run the baseline suite, optionally the ablations, and summarise the results.

    Args:
        model_id: Forwarded to ``run_suite`` as ``model_id``.
        trials: Trial count; coerced with ``int`` before being forwarded.
        temperature: Coerced with ``float`` before being forwarded.
        run_ablations: When truthy, additionally runs the ``recurrence_off``,
            ``workspace_unlimited`` and ``random_workspace`` ablations.

    Returns:
        A 3-tuple of strings: the newline-joined status lines, a
        comma-separated summary table (header row first), and the
        ``json.dumps`` rendering of every run's result pack keyed by run tag.
    """
    # Convert the UI values once instead of once per run.
    n_trials = int(trials)
    temp = float(temperature)

    out_texts = []
    packs = {}

    # Baseline
    packs["baseline"] = run_suite(
        model_id=model_id, trials=n_trials, temperature=temp, ablation=None
    )
    out_texts.append("✅ Baseline done")

    if run_ablations:
        for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
            packs[ab] = run_suite(
                model_id=model_id, trials=n_trials, temperature=temp, ablation=ab
            )
            out_texts.append(f"✅ Ablation {ab} done")

    # DeltaPhi = baseline PCS minus the mean PCS over ablation runs that
    # reported one; stays None when either side is unavailable.
    base_summary = packs["baseline"]["summary"]
    base_pcs = base_summary["PCS"]
    ab_pcs_values = [
        pack["summary"]["PCS"]
        for tag, pack in packs.items()
        if tag != "baseline" and pack["summary"]["PCS"] is not None
    ]
    delta_phi = None
    if base_pcs is not None and ab_pcs_values:
        delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
    base_summary["metrics"]["DeltaPhi"] = delta_phi

    # Summary view
    rows = []
    for tag, pack in packs.items():
        summary = pack["summary"]
        metrics = summary["metrics"]
        rows.append([
            tag,
            summary["trials"],
            f"{summary['ablation']}",
            _fmt_metric(metrics["AUC_nrp"]),
            _fmt_metric(metrics["ECE"]),
            # CK and DS get the same None guard as the other metrics so a
            # missing value renders as an em dash instead of raising TypeError.
            _fmt_metric(metrics["CK"], ".3f"),
            _fmt_metric(metrics["DS"], ".2f"),
            _fmt_metric(summary["PCS"], ".3f"),
            # Only the baseline pack is given a DeltaPhi above, so other
            # packs may not carry the key at all.
            _fmt_metric(metrics.get("DeltaPhi"), ".3f"),
        ])

    header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
    table_lines = [", ".join(header)]
    table_lines.extend(", ".join(map(str, row)) for row in rows)

    return "\n".join(out_texts), "\n".join(table_lines), json.dumps(packs, indent=2)
# Gradio front end: gather the run parameters, then display the status lines,
# the summary table and the raw JSON returned by run_all.
# NOTE(review): indentation was lost in the copy this layout was restored
# from; the four input widgets are assumed to share a single row — confirm.
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🧠 BP-Φ English Suite — In-Space Evaluation\n"
        "Assess phenomenal-candidate behavior via workspace dynamics, "
        "metareports, and no-report predictivity."
    )

    with gr.Row():
        model_id = gr.Textbox(
            value="google/gemma-3-1b-it",
            label="Model ID (HF)",
            scale=2,
        )
        trials = gr.Slider(minimum=10, maximum=200, value=40, step=10, label="Trials")
        temperature = gr.Slider(
            minimum=0.3, maximum=1.0, value=0.7, step=0.05, label="Temperature"
        )
        run_abl = gr.Checkbox(value=True, label="Run ablations")

    run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")

    # Output panes, in the same order as run_all's return tuple.
    status = gr.Textbox(label="Status", lines=4)
    summary_table = gr.Textbox(label="Summary Table", lines=12)
    raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)

    run_btn.click(
        fn=run_all,
        inputs=[model_id, trials, temperature, run_abl],
        outputs=[status, summary_table, raw],
    )

# Bind to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)