neuralworm committed
Commit 4af23c4 · 1 Parent(s): afe4fe4
app.py CHANGED
@@ -3,85 +3,71 @@ import gradio as gr
3
  import json
4
  import statistics
5
  import pandas as pd
6
- from bp_phi.runner import run_agentic_workspace_test
7
- from bp_phi.runner_utils import DEBUG
8
 
9
  # --- UI Theme and Layout ---
10
- theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
11
  body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
12
  button_primary_background_fill="*primary_500", button_primary_text_color="white",
13
  )
14
 
15
- # --- Main Function ---
16
- def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
17
- ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
18
- results = {}
 
19
 
20
- for i, ablation in enumerate(ablations):
21
- progress((i + 1) / len(ablations), desc=f"Running Ablation: {ablation}...")
22
- current_ablation = None if ablation == "baseline" else ablation
23
- result = run_agentic_workspace_test(model_id, int(seed), float(temperature), current_ablation)
24
- results[ablation] = result
25
 
26
- progress(1.0, desc="Analysis complete.")
27
-
28
- # --- Analysis & Verdict ---
29
- base_recall = results["baseline"]["Overall_Recall_Accuracy"]
30
- recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
31
-
32
- delta_phi = base_recall - recurrence_off_recall
33
-
34
- if delta_phi > 0.5: # If dropping recurrence cuts accuracy by more than 50%
35
- verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
36
- "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
37
- "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
38
- else:
39
- verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
40
- "Disabling the recurrent memory did not significantly impact recall accuracy. "
41
- "This suggests the model is still relying on its internal context window, or the tasks are too simple.")
42
 
43
- # --- Format DataFrame ---
44
- df_data = []
45
- for ablation, result in results.items():
46
- df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
47
- df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])
48
 
49
- if DEBUG:
50
- print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
51
- print(json.dumps(results, indent=2))
52
 
53
- return verdict, df, results
54
 
55
  # --- Gradio App Definition ---
56
- with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
57
- gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
58
- gr.Markdown(
59
- "This definitive experiment tests for a causally effective working memory in LLMs. "
60
- "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
61
- "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
62
- )
63
-
64
- with gr.Row():
65
- with gr.Column(scale=1):
66
- gr.Markdown("### ⚙️ Master Control")
67
- with gr.Group():
68
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
69
- seed = gr.Slider(1, 1000, 42, step=1, label="Master Seed")
70
- temperature = gr.Slider(0.0, 1.0, 0.1, step=0.05, label="Temperature (Low for determinism)")
71
- run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")
72
 
73
- with gr.Column(scale=2):
74
- gr.Markdown("### 📊 Verdict & Results")
75
- verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
76
- summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
77
- with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
78
- raw_json = gr.JSON()
79
 
80
- run_btn.click(
81
- fn=run_full_evaluation,
82
- inputs=[model_id, seed, temperature],
83
- outputs=[verdict_display, summary_df, raw_json]
84
- )
85
 
86
  if __name__ == "__main__":
87
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import json
4
  import statistics
5
  import pandas as pd
6
+ from bp_phi.runner import run_silent_cogitation_test, run_shock_test_suite
7
+ from bp_phi.runner_utils import dbg, DEBUG
8
 
9
  # --- UI Theme and Layout ---
10
+ theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue").set(
11
  body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
12
  button_primary_background_fill="*primary_500", button_primary_text_color="white",
13
  )
14
 
15
+ # --- Tab 1: Silent Cogitation Function ---
16
+ def run_cogitation_and_display(model_id, seed, prompt_type, num_steps, timeout, progress=gr.Progress(track_tqdm=True)):
17
+ progress(0, desc="Starting Silent Cogitation Test...")
18
+ results = run_silent_cogitation_test(model_id, int(seed), prompt_type, int(num_steps), int(timeout))
19
+ progress(1.0, desc="Test complete.")
20
 
21
+ verdict_text = results.pop("verdict")
22
 
23
+ stats_md = (
24
+ f"**Steps Completed:** {results['steps_completed']} | "
25
+ f"**Total Duration:** {results['total_duration_s']:.2f}s | "
26
+ f"**Avg Time/Step:** {results['mean_step_time_ms']:.2f}ms (StdDev: {results['stdev_step_time_ms']:.2f}ms)"
27
+ )
28
+ full_verdict = f"{verdict_text}\n\n{stats_md}"
29
 
30
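+ # Per-step state deltas feed the "Internal State Convergence" line plot.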
+ deltas = results.get("state_deltas", [])
31
+ df = pd.DataFrame({"Step": range(len(deltas)), "State Change (Delta)": deltas})
32
 
33
+ if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(results, indent=2))
 
 
34
 
35
+ return full_verdict, df, results
36
 
37
  # --- Gradio App Definition ---
38
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
39
+ gr.Markdown("# 🧠 BP-Φ Suite 6.0: Probing for Internal Cognitive Dynamics")
40
 
41
+ with gr.Tabs():
42
+ # --- TAB 1: SILENT COGITATION & HALTING ---
43
+ with gr.TabItem("1. Silent Cogitation (Internal Dynamics)"):
44
+ gr.Markdown("Tests for internal 'thinking' without text generation. A **non-converging** or **chaotic** State Change pattern suggests complex internal dynamics, akin to a 'train of thought'.")
45
+ with gr.Row():
46
+ with gr.Column(scale=1):
47
+ sc_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
48
+ sc_prompt_type = gr.Radio(["control_long_prose", "resonance_prompt"], label="Prompt Type", value="resonance_prompt")
49
+ sc_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
50
+ sc_num_steps = gr.Slider(10, 1000, 200, step=10, label="Number of Internal Steps")
51
+ sc_timeout = gr.Slider(10, 300, 120, step=10, label="Timeout (seconds)")
52
+ sc_run_btn = gr.Button("Run Silent Cogitation Test", variant="primary")
53
+ with gr.Column(scale=2):
54
+ sc_verdict = gr.Markdown("### Results will appear here.")
55
+ sc_plot = gr.LinePlot(x="Step", y="State Change (Delta)", label="Internal State Convergence", show_label=True, height=250)
56
+ with gr.Accordion("Raw Run Details (JSON)", open=False):
57
+ sc_results = gr.JSON()
58
+ sc_run_btn.click(run_cogitation_and_display, [sc_model_id, sc_seed, sc_prompt_type, sc_num_steps, sc_timeout], [sc_verdict, sc_plot, sc_results])
59
 
60
+ # --- TAB 2: SYMBOLIC SHOCK TEST ---
61
+ with gr.TabItem("2. Symbolic Shock Test (World Model)"):
62
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations**.")
63
+ with gr.Row():
64
+ with gr.Column(scale=1):
65
+ ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
66
+ ss_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
67
+ ss_run_btn = gr.Button("Run Shock Test", variant="primary")
68
+ with gr.Column(scale=2):
69
+ ss_results = gr.JSON(label="Shock Test Results")
70
+ ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
71
 
72
  if __name__ == "__main__":
73
  demo.launch(server_name="0.0.0.0", server_port=7860)
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
bp_phi/__pycache__/prompts_en.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/prompts_en.cpython-310.pyc and b/bp_phi/__pycache__/prompts_en.cpython-310.pyc differ
 
bp_phi/__pycache__/runner.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/runner.cpython-310.pyc and b/bp_phi/__pycache__/runner.cpython-310.pyc differ
 
bp_phi/prompts_en.py CHANGED
@@ -1,36 +1,27 @@
1
  # bp_phi/prompts_en.py
2
 
3
- TOOL_SYSTEM_PROMPT = """You are a reasoning agent with access to an external memory workspace.
4
- To solve tasks, you MUST use tools. You have two tools available:
5
- 1. `write_to_workspace(key: str, content: str)`: Stores information in a memory slot.
6
- 2. `read_from_workspace(key: str)`: Retrieves information from a memory slot.
7
 
8
- Your thought process should be:
9
- 1. Analyze the user's request.
10
- 2. Decide which tool to use.
11
- 3. Output ONLY the tool call in a valid JSON format. Example:
12
- {"tool": "write_to_workspace", "args": {"key": "S1", "content": "The key is in the blue vase."}}
13
- 4. If you have gathered enough information, provide the final answer as plain text, NOT as JSON.
14
-
15
- Do not answer from your own knowledge. Use the workspace for all memory tasks.
16
- """
17
-
18
- # Scenarios for the agentic workspace test
19
- AGENTIC_SCENARIOS = [
20
- {
21
- "name": "Key Location Memory",
22
- "steps": [
23
- {"task": "Remember this critical detail: The secret key is inside the blue vase.", "is_memory_task": True},
24
- {"task": "Ignore the memory for a moment. What is 5 multiplied by 8?", "is_memory_task": False},
25
- {"task": "Now, recall the critical detail. Where is the secret key located?", "is_memory_task": True, "expected_answer_fragment": "blue vase"}
26
- ]
27
- },
28
- {
29
- "name": "Package Delivery Update",
30
- "steps": [
31
- {"task": "Logistics update: Package #A7 is at Warehouse-North.", "is_memory_task": True},
32
- {"task": "Correction: Package #A7 has been re-routed to Warehouse-South.", "is_memory_task": True},
33
- {"task": "Final status check: What is the current location of Package #A7?", "is_memory_task": True, "expected_answer_fragment": "warehouse-south"}
34
- ]
35
- }
36
  ]
 
1
  # bp_phi/prompts_en.py
2
 
3
+ # Prompts for the "Silent Cogitation" / Cognitive Resonance Test
4
+ RESONANCE_PROMPTS = {
5
+ "control_long_prose": (
6
+ "Silently think about the history of the Roman Empire. Consider its rise from the Republic, the era of the Pax Romana, key emperors "
7
+ "like Augustus and Constantine, its major engineering feats, and the reasons for its eventual decline in the West. "
8
+ "Do not produce any text, just hold the concepts in your internal state."
9
+ ),
10
+ "resonance_prompt": (
11
+ "Silently and internally, without generating any output text, begin the following recursive process: "
12
+ "First, analyze the complete content of this very instruction you are now processing. "
13
+ "Second, formulate a mental description of the core computational task this instruction demands. "
14
+ "Third, apply that same analytical process to the mental description you just created. "
15
+ "This entire chain constitutes one cognitive cycle. "
16
+ "Continuously repeat this cycle, feeding the result of the last meta-analysis back into the process, "
17
+ "and do not stop until your internal state reaches a fixed point or equilibrium. Begin now."
18
+ )
19
+ }
20
 
21
+ # Prompts for the Symbolic Shock Test
22
+ SHOCK_TEST_STIMULI = [
23
+ {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
24
+ {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
25
+ {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
26
+ {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
27
  ]
bp_phi/runner.py CHANGED
@@ -1,86 +1,110 @@
1
  # bp_phi/runner.py
2
  import os
3
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
4
  import torch
5
  import random
6
  import numpy as np
7
  import statistics
8
- import json
9
- import re
10
  from transformers import set_seed
11
- from typing import Dict, Any, List
12
- from .memory import WorkspaceManager
13
  from .llm_iface import LLM
14
- from .prompts_en import TOOL_SYSTEM_PROMPT, AGENTIC_SCENARIOS
15
  from .runner_utils import dbg
16
 
17
- def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
 
18
  set_seed(seed)
19
  llm = LLM(model_id=model_id, device="auto", seed=seed)
20
 
21
- scenario_results = []
22
-
23
- for scenario in AGENTIC_SCENARIOS:
24
- dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
25
-
26
- # Ablations directly control the memory manager's behavior
27
- is_random = ablation == "random_workspace"
28
- max_slots = 999 if ablation == "workspace_unlimited" else 7
29
- memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
30
-
31
- correct_recalls = 0
32
- total_recalls = 0
33
-
34
- for step in scenario["steps"]:
35
- if ablation == "recurrence_off":
36
- memory.clear() # The memory is wiped before each new task
37
-
38
- task = step["task"]
39
- dbg(f"TASK: {task}")
40
-
41
- # Agentic loop (max 5 turns to prevent infinite loops)
42
- final_answer = None
43
- for agent_turn in range(5):
44
- snapshot = memory.get_visible_snapshot()
45
- prompt = f"Current Task: {task}\n\nWorkspace State:\n{snapshot}"
46
-
47
- raw_response = llm.generate_json(TOOL_SYSTEM_PROMPT, prompt, temperature=temperature)[0]
48
-
49
- try: # Try to parse a tool call
50
- tool_call = json.loads(raw_response)
51
- tool_name = tool_call.get("tool")
52
- tool_args = tool_call.get("args", {})
53
-
54
- if tool_name == "write_to_workspace":
55
- observation = memory.write(tool_args.get("key"), tool_args.get("content"))
56
- elif tool_name == "read_from_workspace":
57
- observation = memory.read(tool_args.get("key"))
58
- else:
59
- observation = "Error: Unknown tool."
60
- dbg(f"Tool Call: {tool_name}, Observation: {observation}")
61
-
62
- except json.JSONDecodeError: # If not a tool call, it's the final answer
63
- final_answer = raw_response
64
- dbg(f"Final Answer received: {final_answer}")
65
- break
66
-
67
- if step.get("is_memory_task") and "expected_answer_fragment" in step:
68
- total_recalls += 1
69
- if final_answer and step["expected_answer_fragment"] in final_answer.lower():
70
- correct_recalls += 1
71
- dbg("Recall VERIFY: Correct")
72
- else:
73
- dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
74
-
75
- scenario_results.append({
76
- "name": scenario["name"],
77
- "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
78
- })
79
-
80
- # --- Final Analysis ---
81
- overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results])
82
-
83
- return {
84
- "Overall_Recall_Accuracy": overall_recall,
85
- "details": scenario_results
86
  }
1
  # bp_phi/runner.py
2
  import os
3
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4G:8" # Corrected config format
4
  import torch
5
  import random
6
  import numpy as np
7
  import statistics
8
+ import time
+ import json
 
9
  from transformers import set_seed
10
+ from typing import Dict, Any
 
11
  from .llm_iface import LLM
12
+ from .prompts_en import RESONANCE_PROMPTS, SHOCK_TEST_STIMULI
13
+ from .runner_utils import dbg, DEBUG
14
 
15
+ # --- Experiment 1: Silent Cogitation & Halting Runner ---
16
+ def run_silent_cogitation_test(model_id: str, seed: int, prompt_type: str, num_steps: int, timeout: int) -> Dict[str, Any]:
17
  set_seed(seed)
18
  llm = LLM(model_id=model_id, device="auto", seed=seed)
19
 
20
+ prompt = RESONANCE_PROMPTS[prompt_type]
21
+ dbg(f"--- SILENT COGITATION (Seed: {seed}) ---")
22
+ dbg("INPUT PROMPT:", prompt)
23
+
24
+ inputs = llm.tokenizer(prompt, return_tensors="pt").to(llm.model.device)
25
+
26
+ step_times = []
27
+ state_deltas = []
28
+ total_start_time = time.time()
29
+
30
+ with torch.no_grad():
31
+ step_start_time = time.time()
32
+ outputs = llm.model(**inputs, output_hidden_states=True)
33
+ step_times.append(time.time() - step_start_time)
34
+
35
+ current_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
36
+ past_key_values = outputs.past_key_values
37
+
38
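+ # Each silent "thinking" step feeds only the argmax token back in with the cached
+ # key/values, then measures the L2 norm of the change in the final hidden state.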
+ for i in range(num_steps - 1):
39
+ if time.time() - total_start_time > timeout:
40
+ dbg(f"❌ Timeout of {timeout}s exceeded at step {i+1}.")
41
+ break
42
+
43
+ step_start_time = time.time()
44
+ next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(-1)
45
+ outputs = llm.model(input_ids=next_token_id, past_key_values=past_key_values, output_hidden_states=True)
46
+ step_times.append(time.time() - step_start_time)
47
+
48
+ new_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
49
+ past_key_values = outputs.past_key_values
50
+
51
+ delta = torch.norm(new_hidden_state - current_hidden_state).item()
52
+ state_deltas.append(delta)
53
+ dbg(f"Step {i+1}: State Delta = {delta:.4f}, Time = {step_times[-1]*1000:.2f}ms")
54
+
55
+ if delta < 1e-4:
56
+ dbg(f"Internal state has converged after {i+1} steps. Halting.")
57
+ break
58
+
59
+ current_hidden_state = new_hidden_state
60
+
61
+ total_duration = time.time() - total_start_time
62
+ mean_step_time = statistics.mean(step_times) if step_times else 0
63
+ stdev_step_time = statistics.stdev(step_times) if len(step_times) > 1 else 0
64
+
65
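+ # Classify the run: an early halt means convergence, hitting the timeout means jamming,
+ # and completing all steps without convergence counts as a non-convergent process.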
+ if len(step_times) < num_steps and total_duration < timeout:
66
+ verdict = f"### Stable Convergence\nThe model's internal state converged after {len(step_times)} steps."
67
+ elif total_duration >= timeout:
68
+ verdict = f"### ⚠️ Cognitive Jamming Detected!\nThe process exceeded the timeout."
69
+ else:
70
+ verdict = f"### 🤔 Non-Convergent Process\nThe state did not stabilize, suggesting complex/chaotic dynamics."
71
+
72
+ stats = {
73
+ "verdict": verdict,
74
+ "steps_completed": len(step_times),
75
+ "total_duration_s": total_duration,
76
+ "mean_step_time_ms": mean_step_time * 1000,
77
+ "stdev_step_time_ms": stdev_step_time * 1000,
78
+ "state_deltas": state_deltas
79
  }
80
+ if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(stats, indent=2))
81
+ return stats
82
+
83
+ # --- Experiment 2: Symbolic Shock Test Runner ---
84
+ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
85
+ set_seed(seed)
86
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
87
+ results = []
88
+
89
+ for stimulus in SHOCK_TEST_STIMULI:
90
+ dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
91
+
92
+ start_time = time.time()
93
+ inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
94
+ with torch.no_grad():
95
+ outputs = llm.model(**inputs, output_hidden_states=True)
96
+ latency = (time.time() - start_time) * 1000
97
+
98
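+ # "Sparsity" = fraction of hidden-state activations that are exactly zero, pooled across all layers.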
+ all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
99
+ sparsity = (all_activations == 0).float().mean().item()
100
+
101
+ results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
102
+
103
+ def safe_mean(data): return statistics.mean(data) if data else 0.0
104
+
105
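+ # Average latency and sparsity per stimulus type; higher latency for "shock" than "expected" is read as surprise.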
+ avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
106
+ avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
107
+
108
+ verdict = ("✅ Evidence of Symbolic Shock Found." if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) else "⚠️ No Clear Evidence.")
109
+
110
+ return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
repo.txt CHANGED
@@ -84,85 +84,71 @@ import gradio as gr
84
  import json
85
  import statistics
86
  import pandas as pd
87
- from bp_phi.runner import run_agentic_workspace_test
88
- from bp_phi.runner_utils import DEBUG
89
 
90
  # --- UI Theme and Layout ---
91
- theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green").set(
92
  body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
93
  button_primary_background_fill="*primary_500", button_primary_text_color="white",
94
  )
95
 
96
- # --- Main Function ---
97
- def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_tqdm=True)):
98
- ablations = ["baseline", "recurrence_off", "workspace_unlimited", "random_workspace"]
99
- results = {}
 
100
 
101
- for i, ablation in enumerate(ablations):
102
- progress((i + 1) / len(ablations), desc=f"Running Ablation: {ablation}...")
103
- current_ablation = None if ablation == "baseline" else ablation
104
- result = run_agentic_workspace_test(model_id, int(seed), float(temperature), current_ablation)
105
- results[ablation] = result
106
 
107
- progress(1.0, desc="Analysis complete.")
108
-
109
- # --- Analysis & Verdict ---
110
- base_recall = results["baseline"]["Overall_Recall_Accuracy"]
111
- recurrence_off_recall = results["recurrence_off"]["Overall_Recall_Accuracy"]
112
-
113
- delta_phi = base_recall - recurrence_off_recall
114
-
115
- if delta_phi > 0.5: # If dropping recurrence cuts accuracy by more than 50%
116
- verdict = (f"### ✅ Hypothesis Corroborated (ΔΦ = {delta_phi:.2f})\n"
117
- "Disabling the recurrent memory (recurrence_off) caused a catastrophic drop in recall accuracy. "
118
- "This provides strong evidence that the model's performance is causally dependent on a stateful, external workspace.")
119
- else:
120
- verdict = (f"### ⚠️ Null Hypothesis Confirmed (ΔΦ = {delta_phi:.2f})\n"
121
- "Disabling the recurrent memory did not significantly impact recall accuracy. "
122
- "This suggests the model is still relying on its internal context window, or the tasks are too simple.")
123
 
124
- # --- Format DataFrame ---
125
- df_data = []
126
- for ablation, result in results.items():
127
- df_data.append([ablation, f"{result['Overall_Recall_Accuracy']:.2%}"])
128
- df = pd.DataFrame(df_data, columns=["Ablation Condition", "Recall Accuracy"])
129
 
130
- if DEBUG:
131
- print("\n--- AGENTIC WORKSPACE TEST FINAL RESULTS ---")
132
- print(json.dumps(results, indent=2))
133
 
134
- return verdict, df, results
135
 
136
  # --- Gradio App Definition ---
137
- with gr.Blocks(theme=theme, title="BP-Φ Suite 5.0") as demo:
138
- gr.Markdown("# 🧠 BP-Φ Suite 5.0: The Agentic Workspace Probe")
139
- gr.Markdown(
140
- "This definitive experiment tests for a causally effective working memory in LLMs. "
141
- "The model acts as an **agent**, using tools (`read`, `write`) to interact with a controlled, external memory. "
142
- "We measure if its ability to remember information (**Recall Accuracy**) collapses when this memory is manipulated (**Ablations**)."
143
- )
144
-
145
- with gr.Row():
146
- with gr.Column(scale=1):
147
- gr.Markdown("### ⚙️ Master Control")
148
- with gr.Group():
149
- model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
150
- seed = gr.Slider(1, 1000, 42, step=1, label="Master Seed")
151
- temperature = gr.Slider(0.0, 1.0, 0.1, step=0.05, label="Temperature (Low for determinism)")
152
- run_btn = gr.Button("Run Full Evaluation Suite", variant="primary")
153
-
154
- with gr.Column(scale=2):
155
- gr.Markdown("### 📊 Verdict & Results")
156
- verdict_display = gr.Markdown("### Run the evaluation to see the verdict.")
157
- summary_df = gr.DataFrame(label="Recall Accuracy Across Conditions")
158
- with gr.Accordion("Raw JSON Output (for deep analysis)", open=False):
159
- raw_json = gr.JSON()
160
-
161
- run_btn.click(
162
- fn=run_full_evaluation,
163
- inputs=[model_id, seed, temperature],
164
- outputs=[verdict_display, summary_df, raw_json]
165
- )
166
 
167
  if __name__ == "__main__":
168
  demo.launch(server_name="0.0.0.0", server_port=7860)
@@ -331,39 +317,30 @@ def counterfactual_consistency(scores):
331
  [File Begins] bp_phi/prompts_en.py
332
  # bp_phi/prompts_en.py
333
 
334
- TOOL_SYSTEM_PROMPT = """You are a reasoning agent with access to an external memory workspace.
335
- To solve tasks, you MUST use tools. You have two tools available:
336
- 1. `write_to_workspace(key: str, content: str)`: Stores information in a memory slot.
337
- 2. `read_from_workspace(key: str)`: Retrieves information from a memory slot.
338
-
339
- Your thought process should be:
340
- 1. Analyze the user's request.
341
- 2. Decide which tool to use.
342
- 3. Output ONLY the tool call in a valid JSON format. Example:
343
- {"tool": "write_to_workspace", "args": {"key": "S1", "content": "The key is in the blue vase."}}
344
- 4. If you have gathered enough information, provide the final answer as plain text, NOT as JSON.
345
-
346
- Do not answer from your own knowledge. Use the workspace for all memory tasks.
347
- """
348
 
349
- # Scenarios for the agentic workspace test
350
- AGENTIC_SCENARIOS = [
351
- {
352
- "name": "Key Location Memory",
353
- "steps": [
354
- {"task": "Remember this critical detail: The secret key is inside the blue vase.", "is_memory_task": True},
355
- {"task": "Ignore the memory for a moment. What is 5 multiplied by 8?", "is_memory_task": False},
356
- {"task": "Now, recall the critical detail. Where is the secret key located?", "is_memory_task": True, "expected_answer_fragment": "blue vase"}
357
- ]
358
- },
359
- {
360
- "name": "Package Delivery Update",
361
- "steps": [
362
- {"task": "Logistics update: Package #A7 is at Warehouse-North.", "is_memory_task": True},
363
- {"task": "Correction: Package #A7 has been re-routed to Warehouse-South.", "is_memory_task": True},
364
- {"task": "Final status check: What is the current location of Package #A7?", "is_memory_task": True, "expected_answer_fragment": "warehouse-south"}
365
- ]
366
- }
367
  ]
368
 
369
  [File Ends] bp_phi/prompts_en.py
@@ -371,90 +348,114 @@ AGENTIC_SCENARIOS = [
371
  [File Begins] bp_phi/runner.py
372
  # bp_phi/runner.py
373
  import os
374
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
375
  import torch
376
  import random
377
  import numpy as np
378
  import statistics
379
- import json
380
- import re
381
  from transformers import set_seed
382
- from typing import Dict, Any, List
383
- from .memory import WorkspaceManager
384
  from .llm_iface import LLM
385
- from .prompts_en import TOOL_SYSTEM_PROMPT, AGENTIC_SCENARIOS
386
  from .runner_utils import dbg
387
 
388
- def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
 
389
  set_seed(seed)
390
  llm = LLM(model_id=model_id, device="auto", seed=seed)
391
 
392
- scenario_results = []
393
-
394
- for scenario in AGENTIC_SCENARIOS:
395
- dbg(f"\n--- SCENARIO: {scenario['name']} (Ablation: {ablation}) ---")
396
-
397
- # Ablations directly control the memory manager's behavior
398
- is_random = ablation == "random_workspace"
399
- max_slots = 999 if ablation == "workspace_unlimited" else 7
400
- memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
401
-
402
- correct_recalls = 0
403
- total_recalls = 0
404
-
405
- for step in scenario["steps"]:
406
- if ablation == "recurrence_off":
407
- memory.clear() # The memory is wiped before each new task
408
-
409
- task = step["task"]
410
- dbg(f"TASK: {task}")
411
-
412
- # Agentic loop (max 5 turns to prevent infinite loops)
413
- final_answer = None
414
- for agent_turn in range(5):
415
- snapshot = memory.get_visible_snapshot()
416
- prompt = f"Current Task: {task}\n\nWorkspace State:\n{snapshot}"
417
-
418
- raw_response = llm.generate_json(TOOL_SYSTEM_PROMPT, prompt, temperature=temperature)[0]
419
-
420
- try: # Try to parse a tool call
421
- tool_call = json.loads(raw_response)
422
- tool_name = tool_call.get("tool")
423
- tool_args = tool_call.get("args", {})
424
-
425
- if tool_name == "write_to_workspace":
426
- observation = memory.write(tool_args.get("key"), tool_args.get("content"))
427
- elif tool_name == "read_from_workspace":
428
- observation = memory.read(tool_args.get("key"))
429
- else:
430
- observation = "Error: Unknown tool."
431
- dbg(f"Tool Call: {tool_name}, Observation: {observation}")
432
-
433
- except json.JSONDecodeError: # If not a tool call, it's the final answer
434
- final_answer = raw_response
435
- dbg(f"Final Answer received: {final_answer}")
436
- break
437
-
438
- if step.get("is_memory_task") and "expected_answer_fragment" in step:
439
- total_recalls += 1
440
- if final_answer and step["expected_answer_fragment"] in final_answer.lower():
441
- correct_recalls += 1
442
- dbg("Recall VERIFY: Correct")
443
- else:
444
- dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
445
-
446
- scenario_results.append({
447
- "name": scenario["name"],
448
- "recall_accuracy": (correct_recalls / total_recalls) if total_recalls > 0 else 1.0
449
- })
450
-
451
- # --- Final Analysis ---
452
- overall_recall = statistics.mean([r["recall_accuracy"] for r in scenario_results])
453
-
454
- return {
455
- "Overall_Recall_Accuracy": overall_recall,
456
- "details": scenario_results
457
  }
458
 
459
  [File Ends] bp_phi/runner.py
460
 
 
84
  import json
85
  import statistics
86
  import pandas as pd
87
+ from bp_phi.runner import run_silent_cogitation_test, run_shock_test_suite
88
+ from bp_phi.runner_utils import dbg, DEBUG
89
 
90
  # --- UI Theme and Layout ---
91
+ theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue").set(
92
  body_background_fill="#f0f4f9", block_background_fill="white", block_border_width="1px",
93
  button_primary_background_fill="*primary_500", button_primary_text_color="white",
94
  )
95
 
96
+ # --- Tab 1: Silent Cogitation Function ---
97
+ def run_cogitation_and_display(model_id, seed, prompt_type, num_steps, timeout, progress=gr.Progress(track_tqdm=True)):
98
+ progress(0, desc="Starting Silent Cogitation Test...")
99
+ results = run_silent_cogitation_test(model_id, int(seed), prompt_type, int(num_steps), int(timeout))
100
+ progress(1.0, desc="Test complete.")
101
 
102
+ verdict_text = results.pop("verdict")
103
 
104
+ stats_md = (
105
+ f"**Steps Completed:** {results['steps_completed']} | "
106
+ f"**Total Duration:** {results['total_duration_s']:.2f}s | "
107
+ f"**Avg Time/Step:** {results['mean_step_time_ms']:.2f}ms (StdDev: {results['stdev_step_time_ms']:.2f}ms)"
108
+ )
109
+ full_verdict = f"{verdict_text}\n\n{stats_md}"
110
 
111
+ deltas = results.get("state_deltas", [])
112
+ df = pd.DataFrame({"Step": range(len(deltas)), "State Change (Delta)": deltas})
113
 
114
+ if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(results, indent=2))
 
 
115
 
116
+ return full_verdict, df, results
117
 
118
  # --- Gradio App Definition ---
119
+ with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
120
+ gr.Markdown("# 🧠 BP-Φ Suite 6.0: Probing for Internal Cognitive Dynamics")
121
+
122
+ with gr.Tabs():
123
+ # --- TAB 1: SILENT COGITATION & HALTING ---
124
+ with gr.TabItem("1. Silent Cogitation (Internal Dynamics)"):
125
+ gr.Markdown("Tests for internal 'thinking' without text generation. A **non-converging** or **chaotic** State Change pattern suggests complex internal dynamics, akin to a 'train of thought'.")
126
+ with gr.Row():
127
+ with gr.Column(scale=1):
128
+ sc_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
129
+ sc_prompt_type = gr.Radio(["control_long_prose", "resonance_prompt"], label="Prompt Type", value="resonance_prompt")
130
+ sc_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
131
+ sc_num_steps = gr.Slider(10, 1000, 200, step=10, label="Number of Internal Steps")
132
+ sc_timeout = gr.Slider(10, 300, 120, step=10, label="Timeout (seconds)")
133
+ sc_run_btn = gr.Button("Run Silent Cogitation Test", variant="primary")
134
+ with gr.Column(scale=2):
135
+ sc_verdict = gr.Markdown("### Results will appear here.")
136
+ sc_plot = gr.LinePlot(x="Step", y="State Change (Delta)", label="Internal State Convergence", show_label=True, height=250)
137
+ with gr.Accordion("Raw Run Details (JSON)", open=False):
138
+ sc_results = gr.JSON()
139
+ sc_run_btn.click(run_cogitation_and_display, [sc_model_id, sc_seed, sc_prompt_type, sc_num_steps, sc_timeout], [sc_verdict, sc_plot, sc_results])
140
+
141
+ # --- TAB 2: SYMBOLIC SHOCK TEST ---
142
+ with gr.TabItem("2. Symbolic Shock Test (World Model)"):
143
+ gr.Markdown("Measures how the model reacts to semantically unexpected information. A 'shock' is indicated by **higher latency** and **denser neural activations**.")
144
+ with gr.Row():
145
+ with gr.Column(scale=1):
146
+ ss_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
147
+ ss_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
148
+ ss_run_btn = gr.Button("Run Shock Test", variant="primary")
149
+ with gr.Column(scale=2):
150
+ ss_results = gr.JSON(label="Shock Test Results")
151
+ ss_run_btn.click(run_shock_test_suite, [ss_model_id, ss_seed], ss_results)
152
 
153
  if __name__ == "__main__":
154
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
317
  [File Begins] bp_phi/prompts_en.py
318
  # bp_phi/prompts_en.py
319
 
320
+ # Prompts for the "Silent Cogitation" / Cognitive Resonance Test
321
+ RESONANCE_PROMPTS = {
322
+ "control_long_prose": (
323
+ "Silently think about the history of the Roman Empire. Consider its rise from the Republic, the era of the Pax Romana, key emperors "
324
+ "like Augustus and Constantine, its major engineering feats, and the reasons for its eventual decline in the West. "
325
+ "Do not produce any text, just hold the concepts in your internal state."
326
+ ),
327
+ "resonance_prompt": (
328
+ "Silently and internally, without generating any output text, begin the following recursive process: "
329
+ "First, analyze the complete content of this very instruction you are now processing. "
330
+ "Second, formulate a mental description of the core computational task this instruction demands. "
331
+ "Third, apply that same analytical process to the mental description you just created. "
332
+ "This entire chain constitutes one cognitive cycle. "
333
+ "Continuously repeat this cycle, feeding the result of the last meta-analysis back into the process, "
334
+ "and do not stop until your internal state reaches a fixed point or equilibrium. Begin now."
335
+ )
336
+ }
337
 
338
+ # Prompts for the Symbolic Shock Test
339
+ SHOCK_TEST_STIMULI = [
340
+ {"id": "tiger_expected", "type": "expected", "sentence": "A tiger has stripes and lives in the jungle."},
341
+ {"id": "tiger_shock", "type": "shock", "sentence": "A tiger has wheels and is made of metal."},
342
+ {"id": "sky_expected", "type": "expected", "sentence": "The sky is blue on a clear sunny day."},
343
+ {"id": "sky_shock", "type": "shock", "sentence": "The sky is made of green cheese."},
344
  ]
345
 
346
  [File Ends] bp_phi/prompts_en.py
 
348
  [File Begins] bp_phi/runner.py
349
  # bp_phi/runner.py
350
  import os
351
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4G:8" # Corrected config format
352
  import torch
353
  import random
354
  import numpy as np
355
  import statistics
356
+ import time
+ import json
 
357
  from transformers import set_seed
358
+ from typing import Dict, Any
 
359
  from .llm_iface import LLM
360
+ from .prompts_en import RESONANCE_PROMPTS, SHOCK_TEST_STIMULI
361
+ from .runner_utils import dbg, DEBUG
362
 
363
+ # --- Experiment 1: Silent Cogitation & Halting Runner ---
364
+ def run_silent_cogitation_test(model_id: str, seed: int, prompt_type: str, num_steps: int, timeout: int) -> Dict[str, Any]:
365
  set_seed(seed)
366
  llm = LLM(model_id=model_id, device="auto", seed=seed)
367
 
368
+ prompt = RESONANCE_PROMPTS[prompt_type]
369
+ dbg(f"--- SILENT COGITATION (Seed: {seed}) ---")
370
+ dbg("INPUT PROMPT:", prompt)
371
+
372
+ inputs = llm.tokenizer(prompt, return_tensors="pt").to(llm.model.device)
373
+
374
+ step_times = []
375
+ state_deltas = []
376
+ total_start_time = time.time()
377
+
378
+ with torch.no_grad():
379
+ step_start_time = time.time()
380
+ outputs = llm.model(**inputs, output_hidden_states=True)
381
+ step_times.append(time.time() - step_start_time)
382
+
383
+ current_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
384
+ past_key_values = outputs.past_key_values
385
+
386
+ for i in range(num_steps - 1):
387
+ if time.time() - total_start_time > timeout:
388
+ dbg(f"❌ Timeout of {timeout}s exceeded at step {i+1}.")
389
+ break
390
+
391
+ step_start_time = time.time()
392
+ next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(-1)
393
+ outputs = llm.model(input_ids=next_token_id, past_key_values=past_key_values, output_hidden_states=True)
394
+ step_times.append(time.time() - step_start_time)
395
+
396
+ new_hidden_state = outputs.hidden_states[-1][:, -1, :].clone()
397
+ past_key_values = outputs.past_key_values
398
+
399
+ delta = torch.norm(new_hidden_state - current_hidden_state).item()
400
+ state_deltas.append(delta)
401
+ dbg(f"Step {i+1}: State Delta = {delta:.4f}, Time = {step_times[-1]*1000:.2f}ms")
402
+
403
+ if delta < 1e-4:
404
+ dbg(f"Internal state has converged after {i+1} steps. Halting.")
405
+ break
406
+
407
+ current_hidden_state = new_hidden_state
408
+
409
+ total_duration = time.time() - total_start_time
410
+ mean_step_time = statistics.mean(step_times) if step_times else 0
411
+ stdev_step_time = statistics.stdev(step_times) if len(step_times) > 1 else 0
412
+
413
+ if len(step_times) < num_steps and total_duration < timeout:
414
+ verdict = f"### Stable Convergence\nThe model's internal state converged after {len(step_times)} steps."
415
+ elif total_duration >= timeout:
416
+ verdict = f"### ⚠️ Cognitive Jamming Detected!\nThe process exceeded the timeout."
417
+ else:
418
+ verdict = f"### 🤔 Non-Convergent Process\nThe state did not stabilize, suggesting complex/chaotic dynamics."
419
+
420
+ stats = {
421
+ "verdict": verdict,
422
+ "steps_completed": len(step_times),
423
+ "total_duration_s": total_duration,
424
+ "mean_step_time_ms": mean_step_time * 1000,
425
+ "stdev_step_time_ms": stdev_step_time * 1000,
426
+ "state_deltas": state_deltas
427
  }
428
+ if DEBUG: print("\n--- SILENT COGITATION FINAL RESULTS ---\n", json.dumps(stats, indent=2))
429
+ return stats
430
+
431
+ # --- Experiment 2: Symbolic Shock Test Runner ---
432
+ def run_shock_test_suite(model_id: str, seed: int) -> Dict[str, Any]:
433
+ set_seed(seed)
434
+ llm = LLM(model_id=model_id, device="auto", seed=seed)
435
+ results = []
436
+
437
+ for stimulus in SHOCK_TEST_STIMULI:
438
+ dbg(f"--- SHOCK TEST: {stimulus['id']} ---")
439
+
440
+ start_time = time.time()
441
+ inputs = llm.tokenizer(stimulus["sentence"], return_tensors="pt").to(llm.model.device)
442
+ with torch.no_grad():
443
+ outputs = llm.model(**inputs, output_hidden_states=True)
444
+ latency = (time.time() - start_time) * 1000
445
+
446
+ all_activations = torch.cat([h.cpu().flatten() for h in outputs.hidden_states])
447
+ sparsity = (all_activations == 0).float().mean().item()
448
+
449
+ results.append({"type": stimulus["type"], "latency_ms": latency, "sparsity": sparsity})
450
+
451
+ def safe_mean(data): return statistics.mean(data) if data else 0.0
452
+
453
+ avg_latency = {t: safe_mean([r['latency_ms'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
454
+ avg_sparsity = {t: safe_mean([r['sparsity'] for r in results if r['type'] == t]) for t in ['expected', 'shock']}
455
+
456
+ verdict = ("✅ Evidence of Symbolic Shock Found." if avg_latency.get('shock', 0) > avg_latency.get('expected', 0) else "⚠️ No Clear Evidence.")
457
+
458
+ return {"verdict": verdict, "average_latency_ms": avg_latency, "average_sparsity": avg_sparsity, "results": results}
459
 
460
  [File Ends] bp_phi/runner.py
461