llm_qualia_2

Sleeping

App Files Files Community

llm_qualia_2 / app.py

neuralworm

overhaul

e40ba5b about 1 month ago

raw

history blame

3.27 kB

	# app.py
	import gradio as gr
	import json
	import statistics
	import pandas as pd
	from bp_phi.runner import run_agentic_workspace_test

	# Debug switch: when truthy, run_full_evaluation prints the collected
	# results to stdout as indented JSON before returning.
	DEBUG = 1

	# --- UI Theme and Layout ---
	# Start from Gradio's "Soft" preset with a teal/green palette, then override
	# a handful of individual theme variables via .set().
	theme = gr.themes.Soft(
	    primary_hue="teal",
	    secondary_hue="green",
	).set(
	    body_background_fill="#f0f4f9",
	    block_background_fill="white",
	    block_border_width="1px",
	    button_primary_background_fill="*primary_500",
	    button_primary_text_color="white",
	)

	# --- Main Function ---
	def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
	    """Run the agentic workspace test for every ablation and summarise it.

	    The test is executed once for the baseline and once for each ablation
	    condition. The verdict is derived from the difference in recall accuracy
	    between the baseline run and the ``recurrence_off`` run.

	    Args:
	        model_id: Model identifier, passed through unchanged to
	            ``run_agentic_workspace_test``.
	        seed: Master seed; converted with ``int()`` before each run.
	        temperature: Temperature value; converted with ``float()`` before
	            each run.
	        progress: Gradio progress tracker. It is declared as a default
	            argument so that Gradio can inject the tracker when the function
	            is used as an event handler.

	    Returns:
	        A ``(verdict, df, results)`` tuple: the verdict as a Markdown string,
	        a ``pandas.DataFrame`` with one row per condition and its formatted
	        recall accuracy, and the dict of raw results keyed by condition name.

	    Note:
	        Every result is indexed with ``"Overall_Recall_Accuracy"`` and the
	        value is formatted with ``.2%``, so it is presumably a fraction in
	        [0, 1] -- confirm against ``bp_phi.runner``.
	    """
	    ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
	    total = len(ablations)
	    results = {}

	    for index, ablation in enumerate(ablations):
	        # Report the fraction of runs already finished. Using (index + 1)
	        # here would show 100% while the last ablation is still running.
	        progress(index / total, desc=f"Running Ablation: {ablation}...")
	        # The baseline condition is passed to the runner as None.
	        current_ablation = None if ablation == "baseline" else ablation
	        results[ablation] = run_agentic_workspace_test(
	            model_id, int(seed), float(temperature), current_ablation
	        )

	    progress(1.0, desc="Analysis complete.")

	    base_recall = results["baseline"]["Overall_Recall_Accuracy"]
	    recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]

	    # Drop in recall caused by switching recurrence off.
	    delta_phi = base_recall - recurrence_off_recall

	    if delta_phi > 0.5:
	        verdict = f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n..."
	    else:
	        verdict = f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n..."

	    # One row per condition, in the order the conditions were run.
	    df = pd.DataFrame(
	        [
	            [name, f"{outcome['Overall_Recall_Accuracy']:.2%}"]
	            for name, outcome in results.items()
	        ],
	        columns=["Ablation Condition", "Recall Accuracy"],
	    )

	    if DEBUG:
	        print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
	        print(json.dumps(results, indent=2))

	    return verdict, df, results

	# --- Gradio App Definition ---
	# NOTE(review): the nesting below is reconstructed from a source whose
	# indentation was lost; in particular, confirm whether the run button belongs
	# inside or outside the gr.Group that holds the three inputs.
	with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
	    gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
	    gr.Markdown("This experiment tests for a causally effective working memory. The model acts as an agent, using tools (`read`, `write`) to interact with a controlled, external memory.")

	    with gr.Row():
	        # Left column: run configuration and the trigger button.
	        with gr.Column(scale=1):
	            gr.Markdown("### ⚙️ Master Control")
	            with gr.Group():
	                model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
	                seed = gr.Slider(
	                    minimum=1,
	                    maximum=1000,
	                    value=42,
	                    step=1,
	                    label="Master Seed",
	                )
	                temperature = gr.Slider(
	                    minimum=0.0,
	                    maximum=1.0,
	                    value=0.1,
	                    step=0.05,
	                    label="Temperature (Low for determinism)",
	                )
	            run_btn = gr.Button(value="Run Full Evaluation Suite", variant="primary")

	        # Right column: verdict text, summary table and collapsible raw output.
	        with gr.Column(scale=2):
	            gr.Markdown("### 📊 Verdict & Results")
	            verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
	            summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
	            with gr.Accordion("Raw JSON Output", open=False):
	                raw_json = gr.JSON()

	    # The three values returned by run_full_evaluation are routed, in order,
	    # to the three output components.
	    run_btn.click(
	        fn=run_full_evaluation,
	        inputs=[model_id, seed, temperature],
	        outputs=[verdict_display, summary_df, raw_json],
	    )

	if __name__ == "__main__":
	    # Serve on all network interfaces, port 7860, when run as a script.
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    demo.launch(**launch_options)