# llm_qualia_2 / app.py
# (repository page header: neuralworm · commit "overhaul" · e40ba5b · 3.27 kB)
# app.py
import gradio as gr
import json
import statistics
import pandas as pd
from bp_phi.runner import run_agentic_workspace_test
# Any truthy value makes run_full_evaluation print the raw results as JSON.
DEBUG = 1

# --- UI Theme and Layout ---
# Start from Gradio's "Soft" theme and override a handful of its variables.
theme = gr.themes.Soft(
    primary_hue="teal",
    secondary_hue="green",
).set(
    body_background_fill="#f0f4f9",
    block_background_fill="white",
    block_border_width="1px",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)
# --- Main Function ---
def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
    """Run the agentic workspace test under every ablation and summarise it.

    The test is executed once per ablation condition ("baseline" plus three
    ablations). The verdict is derived from the recall-accuracy difference
    between the baseline run and the "recurrence_off" run.

    Args:
        model_id: Model identifier, forwarded unchanged to
            ``run_agentic_workspace_test``.
        seed: Master seed; converted with ``int()`` before being forwarded.
        temperature: Sampling temperature; converted with ``float()`` before
            being forwarded.
        progress: Gradio progress tracker used to report per-ablation status.

    Returns:
        A ``(verdict, df, results)`` tuple: the Markdown verdict string, a
        DataFrame with one formatted recall-accuracy row per ablation, and the
        raw results dict keyed by ablation name.

    Raises:
        KeyError: If a result does not contain ``"Overall_Recall_Accuracy"``.
    """
    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
    total = len(ablations)
    results = {}
    for i, ablation in enumerate(ablations):
        # Report the fraction completed *so far*. Using (i + 1) here would
        # show 100% while the final ablation is still running.
        progress(i / total, desc=f"Running Ablation: {ablation}...")
        # The runner receives None for the unablated baseline condition.
        current_ablation = None if ablation == "baseline" else ablation
        results[ablation] = run_agentic_workspace_test(
            model_id, int(seed), float(temperature), current_ablation
        )
    progress(1.0, desc="Analysis complete.")

    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
    delta_phi = base_recall - recurrence_off_recall

    # Minimum recall drop (baseline minus recurrence_off) that counts as
    # corroborating the hypothesis.
    delta_threshold = 0.5
    if delta_phi > delta_threshold:
        verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n...")
    else:
        verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n...")

    df_data = [
        [name, f"{outcome['Overall_Recall_Accuracy']:.2%}"]
        for name, outcome in results.items()
    ]
    df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])

    if DEBUG:
        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
        print(json.dumps(results, indent=2))

    return verdict, df, results
# --- Gradio App Definition ---
# NOTE(review): the source lost its indentation; the nesting below is
# reconstructed from the component order — confirm against the live layout.
with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
    gr.Markdown("This experiment tests for a causally effective working memory. The model acts as an agent, using tools (`read`, `write`) to interact with a controlled, external memory.")

    with gr.Row():
        # Left column: run parameters and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Master Control")
            with gr.Group():
                model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                seed = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=42,
                    step=1,
                    label="Master Seed",
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.05,
                    label="Temperature (Low for determinism)",
                )
            # Assumed to sit below the grouped inputs, inside the same column.
            run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")

        # Right column: verdict text, summary table and collapsible raw output.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Verdict & Results")
            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
            with gr.Accordion("Raw JSON Output", open=False):
                raw_json = gr.JSON()

    # The three outputs map, in order, onto the tuple returned by
    # run_full_evaluation.
    run_btn.click(
        fn=run_full_evaluation,
        inputs=[model_id, seed, temperature],
        outputs=[verdict_display, summary_df, raw_json],
    )
if __name__ == "__main__":
    # Start the web server only when the file is run as a script:
    # listen on all network interfaces, port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )