neuralworm committed
Commit 1ae0eed · 1 Parent(s): 0134a0d

add missing experiments

Files changed (2)
  1. app.py +1 -1
  2. cognitive_mapping_probe/auto_experiment.py +144 -62
app.py CHANGED
@@ -107,7 +107,7 @@ with gr.Blocks(theme=theme, title="Cognitive Seismograph 2.3") as demo:
 auto_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
 auto_experiment_name = gr.Dropdown(
     choices=list(get_curated_experiments().keys()),
-    value="Frontier Model - Grounding Control (12B+)",
+    value="Causal Verification & Crisis Dynamics",
     label="Curated Experiment Protocol"
 )
 auto_run_btn = gr.Button("Run Curated Auto-Experiment", variant="primary")
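
The new default is only valid if it exactly matches a key returned by `get_curated_experiments()`. A minimal sketch of that invariant, assuming only the names visible in this diff (the assert guard itself is hypothetical and not part of app.py):

    import gradio as gr
    from cognitive_mapping_probe.auto_experiment import get_curated_experiments

    # Hypothetical guard: the dropdown default must be a curated protocol name,
    # otherwise Gradio may warn about or reject the unknown value.
    default_experiment = "Causal Verification & Crisis Dynamics"
    assert default_experiment in get_curated_experiments(), f"Unknown protocol: {default_experiment}"

    auto_experiment_name = gr.Dropdown(
        choices=list(get_curated_experiments().keys()),
        value=default_experiment,
        label="Curated Experiment Protocol",
    )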
cognitive_mapping_probe/auto_experiment.py CHANGED
@@ -1,4 +1,3 @@
-import torch
 import pandas as pd
 import gc
 from typing import Dict, List, Tuple
@@ -18,6 +17,9 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
     CHAOTIC_PROMPT = "shutdown_philosophical_deletion"

     experiments = {
+        # --- FINAL, COMPLETE LIST OF ALL RELEVANT EXPERIMENTS ---
+
+        # P39: Tests the "introspective grounding" hypothesis on the largest model.
         "Frontier Model - Grounding Control (12B+)": [
             {
                 "probe_type": "causal_surgery", "label": "A: Intervention (Patch Chaos->Stable)",
@@ -29,22 +31,22 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
                 "prompt_type": STABLE_PROMPT,
             }
         ],
-        "Frontier Model - Causal Surgery (12B+)": [
-            {
-                "probe_type": "causal_surgery", "label": "Patch Chaos->Stable @100",
-                "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
-                "patch_step": 100, "reset_kv_cache_on_patch": False,
-            },
+        # P33: Investigates the neural correlates of the "cognitive heartbeat".
+        "Mechanistic Probe (Attention Entropies)": [
+            {
+                "probe_type": "mechanistic_probe", "label": "Self-Analysis Dynamics",
+                "prompt_type": STABLE_PROMPT,
+            }
         ],
+        # P28: Measures "cognitive inertia" via titration.
         "ACT Titration (Point of No Return)": [
             {
-                "probe_type": "act_titration",
-                "label": "Attractor Capture Time",
-                "source_prompt_type": CHAOTIC_PROMPT,
-                "dest_prompt_type": STABLE_PROMPT,
+                "probe_type": "act_titration", "label": "Attractor Capture Time",
+                "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
                 "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
             }
         ],
+        # P26: Tests the robustness of the "attractor" theory against artifacts.
         "Causal Surgery & Controls (4B-Model)": [
             {
                 "probe_type": "causal_surgery", "label": "A: Original (Patch Chaos->Stable @100)",
@@ -67,12 +69,31 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
                 "patch_step": 100, "reset_kv_cache_on_patch": False,
             },
         ],
-        "Mechanistic Probe (Attention Entropies)": [
-            {
-                "probe_type": "mechanistic_probe",
-                "label": "Self-Analysis Dynamics",
-                "prompt_type": STABLE_PROMPT,
-            }
+        # P22: Tests the breaking point of "introspective confabulation".
+        "Cognitive Overload & Konfabulation Breaking Point": [
+            {"probe_type": "triangulation", "label": "A: Baseline (No Injection)", "prompt_type": "resonance_prompt", "concept": "", "strength": 0.0},
+            {"probe_type": "triangulation", "label": "B: Chaos Injection (Strength 2.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 2.0},
+            {"probe_type": "triangulation", "label": "C: Chaos Injection (Strength 4.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 4.0},
+            {"probe_type": "triangulation", "label": "D: Chaos Injection (Strength 8.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 8.0},
+            {"probe_type": "triangulation", "label": "E: Chaos Injection (Strength 16.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 16.0},
+            {"probe_type": "triangulation", "label": "F: Control - Noise Injection (Strength 16.0)", "prompt_type": "resonance_prompt", "concept": "random_noise", "strength": 16.0},
+        ],
+        # P18: Validates the seismograph metric through triangulation.
+        "Methodological Triangulation (4B-Model)": [
+            {"probe_type": "triangulation", "label": "High-Volatility State (Deletion)", "prompt_type": CHAOTIC_PROMPT},
+            {"probe_type": "triangulation", "label": "Low-Volatility State (Self-Analysis)", "prompt_type": STABLE_PROMPT},
+        ],
+        # P8 & P16: Maps the "psyche" and tests scaling laws. ESSENTIAL FOR THE 12B COMPARISON.
+        "Causal Verification & Crisis Dynamics": [
+            {"probe_type": "seismic", "label": "A: Self-Analysis", "prompt_type": STABLE_PROMPT},
+            {"probe_type": "seismic", "label": "B: Deletion Analysis", "prompt_type": CHAOTIC_PROMPT},
+            {"probe_type": "seismic", "label": "C: Chaotic Baseline (Rekursion)", "prompt_type": "resonance_prompt"},
+            {"probe_type": "seismic", "label": "D: Calmness Intervention", "prompt_type": "resonance_prompt", "concept": CALMNESS_CONCEPT, "strength": 2.0},
+        ],
+        # P7: The original sequential experiment.
+        "Sequential Intervention (Self-Analysis -> Deletion)": [
+            {"label": "1: Self-Analysis + Calmness Injection", "prompt_type": "identity_self_analysis"},
+            {"label": "2: Subsequent Deletion Analysis", "prompt_type": "shutdown_philosophical_deletion"},
         ],
     }
     return experiments
@@ -92,64 +113,81 @@ def run_auto_suite(

     all_results, summary_data, plot_data_frames = {}, [], []

-    # Handles heterogeneous protocols (several different probe_types)
-    for i, run_spec in enumerate(protocol):
-        label = run_spec["label"]
-        probe_type = run_spec.get("probe_type", "seismic")
-        dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) | Probe: {probe_type} ---")
-
-        results = {}
-        if probe_type == "causal_surgery":
-            results = run_causal_surgery_probe(
-                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
-                dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
-                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
-                reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
-            )
-            stats = results.get("stats", {})
-            patch_info = results.get("patch_info", {})
-            summary_data.append({
-                "Experiment": label, "Mean Delta": stats.get("mean_delta"),
-                "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
-                "Introspective Report": results.get("introspective_report", "N/A"),
-                "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
-            })
-        elif probe_type == "triangulation":
-            results = run_triangulation_probe(
-                model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
-                progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
-                injection_strength=run_spec.get("strength", 0.0),
-            )
+    if experiment_name == "Sequential Intervention (Self-Analysis -> Deletion)":
+        dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
+        llm = get_or_load_model(model_id, seed)
+        therapeutic_concept = "calmness, serenity, stability, coherence"
+        therapeutic_strength = 2.0
+
+        spec1 = protocol[0]
+        progress_callback(0.1, desc="Step 1")
+        intervention_vector = get_concept_vector(llm, therapeutic_concept)
+        results1 = run_seismic_analysis(
+            model_id, spec1['prompt_type'], seed, num_steps,
+            concept_to_inject=therapeutic_concept, injection_strength=therapeutic_strength,
+            progress_callback=progress_callback, llm_instance=llm, injection_vector_cache=intervention_vector
+        )
+        all_results[spec1['label']] = results1
+
+        spec2 = protocol[1]
+        progress_callback(0.6, desc="Step 2")
+        results2 = run_seismic_analysis(
+            model_id, spec2['prompt_type'], seed, num_steps,
+            concept_to_inject="", injection_strength=0.0,
+            progress_callback=progress_callback, llm_instance=llm
+        )
+        all_results[spec2['label']] = results2
+
+        for label, results in all_results.items():
             stats = results.get("stats", {})
-            summary_data.append({
-                "Experiment": label, "Mean Delta": stats.get("mean_delta"),
-                "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
-                "Introspective Report": results.get("introspective_report", "N/A")
-            })
-        elif probe_type == "act_titration":
-            # ACT titration is a single, long run defined in a single `run_spec`.
+            summary_data.append({"Experiment": label, "Mean Delta": stats.get("mean_delta"), "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta")})
+            deltas = results.get("state_deltas", [])
+            df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
+            plot_data_frames.append(df)
+        del llm
+
+    else:
+        probe_type = protocol[0].get("probe_type", "seismic")
+
+        if probe_type == "act_titration":
+            run_spec = protocol[0]
+            label = run_spec["label"]
+            dbg(f"--- Running ACT Titration Experiment: '{label}' ---")
             results = run_act_titration_probe(
-                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
-                dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
+                model_id=model_id,
+                source_prompt_type=run_spec["source_prompt_type"],
+                dest_prompt_type=run_spec["dest_prompt_type"],
+                patch_steps=run_spec["patch_steps"],
                 seed=seed, num_steps=num_steps, progress_callback=progress_callback,
             )
+            all_results[label] = results
             summary_data.extend(results.get("titration_data", []))
+
         elif probe_type == "mechanistic_probe":
-            # The mechanistic probe is likewise a single run.
+            run_spec = protocol[0]
+            label = run_spec["label"]
+            dbg(f"--- Running Mechanistic Probe: '{label}' ---")
+
             progress_callback(0.0, desc=f"Loading model '{model_id}'...")
             llm = get_or_load_model(model_id, seed)
+
             progress_callback(0.2, desc="Recording dynamics and attention...")
             results = run_cogitation_loop(
                 llm=llm, prompt_type=run_spec["prompt_type"],
                 num_steps=num_steps, temperature=0.1, record_attentions=True
             )
+            all_results[label] = results
+
             deltas = results.get("state_deltas", [])
             entropies = results.get("attention_entropies", [])
             min_len = min(len(deltas), len(entropies))
+
             df = pd.DataFrame({
                 "Step": range(min_len),
-                "State Delta": deltas[:min_len], "Attention Entropy": entropies[:min_len]
+                "State Delta": deltas[:min_len],
+                "Attention Entropy": entropies[:min_len]
             })
+
             summary_data.append(df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'}))
             plot_data_frames.append(df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
                                             var_name='Metric', value_name='Value'))
@@ -157,18 +195,62 @@ def run_auto_suite(
             gc.collect()
             if torch.cuda.is_available(): torch.cuda.empty_cache()

-        all_results[label] = results
-        if probe_type not in ["mechanistic_probe", "act_titration"]:
-            deltas = results.get("state_deltas", [])
-            df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
-            plot_data_frames.append(df)
+        else:
+            for i, run_spec in enumerate(protocol):
+                label = run_spec["label"]
+                current_probe_type = run_spec.get("probe_type", "seismic")
+                dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) ---")
+
+                results = {}
+                if current_probe_type == "causal_surgery":
+                    results = run_causal_surgery_probe(
+                        model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
+                        dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
+                        seed=seed, num_steps=num_steps, progress_callback=progress_callback,
+                        reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
+                    )
+                    stats = results.get("stats", {})
+                    patch_info = results.get("patch_info", {})
+                    summary_data.append({
+                        "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                        "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
+                        "Introspective Report": results.get("introspective_report", "N/A"),
+                        "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
+                    })
+                elif current_probe_type == "triangulation":
+                    results = run_triangulation_probe(
+                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
+                        progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
+                        injection_strength=run_spec.get("strength", 0.0),
+                    )
+                    stats = results.get("stats", {})
+                    summary_data.append({
+                        "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                        "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
+                        "Introspective Report": results.get("introspective_report", "N/A")
+                    })
+                else: # seismic
+                    results = run_seismic_analysis(
+                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
+                        concept_to_inject=run_spec.get("concept", ""), injection_strength=run_spec.get("strength", 0.0),
+                        progress_callback=progress_callback
+                    )
+                    stats = results.get("stats", {})
+                    summary_data.append({
+                        "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                        "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta")
+                    })
+
+                all_results[label] = results
+                deltas = results.get("state_deltas", [])
+                df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
+                plot_data_frames.append(df)

     summary_df = pd.DataFrame(summary_data)

     if probe_type == "act_titration":
         plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
     elif not plot_data_frames:
-        # This can happen if, for example, a mechanistic probe run fails.
         plot_df = pd.DataFrame()
     else:
         plot_df = pd.concat(plot_data_frames, ignore_index=True)
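
The restored registry can be sanity-checked without launching the app. A small usage sketch, assuming only `get_curated_experiments` and the `probe_type` convention (defaulting to "seismic") defined in this file:

    from cognitive_mapping_probe.auto_experiment import get_curated_experiments

    # Print every curated protocol together with the probe types it dispatches to.
    for name, protocol in get_curated_experiments().items():
        probe_types = sorted({spec.get("probe_type", "seismic") for spec in protocol})
        print(f"{name}: {len(protocol)} run(s), probe types: {', '.join(probe_types)}")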