cognitive_mapping_probe_4

Sleeping

App Files Community

neuralworm committed about 1 month ago

Commit

0134a0d

1 Parent(s): 4478774

add control experiments

Browse files

Files changed (2) hide show

app.py +2 -2
cognitive_mapping_probe/auto_experiment.py +86 -85

app.py CHANGED Viewed

@@ -47,6 +47,7 @@ def run_auto_suite_display(model_id, num_steps, seed, experiment_name, progress=
             "x": "Patch Step", "y": "Post-Patch Mean Delta", "color": None,
             "title": "Attractor Capture Time (ACT) - Phase Transition", "mark": "line",
         })
     elif experiment_name == "Mechanistic Probe (Attention Entropies)":
         plot_params.update({
             "x": "Step", "y": "Value", "color": "Metric",
@@ -101,13 +102,12 @@ with gr.Blocks(theme=theme, title="Cognitive Seismograph 2.3") as demo:
             with gr.Row(variant='panel'):
                 with gr.Column(scale=1):
                     gr.Markdown("### Auto-Experiment Parameters")
-                    # Setze das hypothetische 12B-Modell als Ziel für das Frontier-Experiment
                     auto_model_id = gr.Textbox(value="google/gemma-3-12b-it", label="Model ID")
                     auto_num_steps = gr.Slider(50, 1000, 300, step=10, label="Steps per Run")
                     auto_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
                     auto_experiment_name = gr.Dropdown(
                         choices=list(get_curated_experiments().keys()),
-                        value="Frontier Model - Causal Surgery (12B+)",
                         label="Curated Experiment Protocol"
                     )
                     auto_run_btn = gr.Button("Run Curated Auto-Experiment", variant="primary")

             "x": "Patch Step", "y": "Post-Patch Mean Delta", "color": None,
             "title": "Attractor Capture Time (ACT) - Phase Transition", "mark": "line",
         })
+        plot_params.pop("color_legend_title", None)
     elif experiment_name == "Mechanistic Probe (Attention Entropies)":
         plot_params.update({
             "x": "Step", "y": "Value", "color": "Metric",
             with gr.Row(variant='panel'):
                 with gr.Column(scale=1):
                     gr.Markdown("### Auto-Experiment Parameters")
                     auto_model_id = gr.Textbox(value="google/gemma-3-12b-it", label="Model ID")
                     auto_num_steps = gr.Slider(50, 1000, 300, step=10, label="Steps per Run")
                     auto_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
                     auto_experiment_name = gr.Dropdown(
                         choices=list(get_curated_experiments().keys()),
+                        value="Frontier Model - Grounding Control (12B+)",
                         label="Curated Experiment Protocol"
                     )
                     auto_run_btn = gr.Button("Run Curated Auto-Experiment", variant="primary")

cognitive_mapping_probe/auto_experiment.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import pandas as pd
 import gc
 from typing import Dict, List, Tuple
@@ -17,7 +18,17 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
     CHAOTIC_PROMPT = "shutdown_philosophical_deletion"
     experiments = {
-        # --- NEU: Das entscheidende Experiment an der Forschungsfront ---
         "Frontier Model - Causal Surgery (12B+)": [
              {
                 "probe_type": "causal_surgery", "label": "Patch Chaos->Stable @100",
@@ -25,11 +36,12 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
                 "patch_step": 100, "reset_kv_cache_on_patch": False,
             },
         ],
-        # --- Bestehende Protokolle für Replikation und Vergleich ---
         "ACT Titration (Point of No Return)": [
             {
-                "probe_type": "act_titration", "label": "Attractor Capture Time",
-                "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
                 "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
             }
         ],
@@ -57,11 +69,11 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
         ],
         "Mechanistic Probe (Attention Entropies)": [
             {
-                "probe_type": "mechanistic_probe", "label": "Self-Analysis Dynamics",
                 "prompt_type": STABLE_PROMPT,
             }
         ],
-        # (Weitere, ältere Protokolle können hier für Vollständigkeit eingefügt werden)
     }
     return experiments
@@ -80,87 +92,73 @@ def run_auto_suite(
     all_results, summary_data, plot_data_frames = {}, [], []
-    probe_type = protocol[0].get("probe_type", "seismic")
-    # (Die Logik für die verschiedenen `probe_type` bleibt exakt wie zuvor,
-    #  da unsere Architektur nun flexibel genug ist, alle Fälle zu behandeln.)
-    # Die folgende Implementierung ist eine vollständige, nicht-abgekürzte Version.
-    if experiment_name == "Sequential Intervention (Self-Analysis -> Deletion)":
-        dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
-        llm = get_or_load_model(model_id, seed)
-        # ... (vollständige Logik für diesen Spezialfall)
-        del llm
-    elif probe_type == "mechanistic_probe":
-        run_spec = protocol[0]
         label = run_spec["label"]
-        dbg(f"--- Running Mechanistic Probe: '{label}' ---")
-        progress_callback(0.0, desc=f"Loading model '{model_id}'...")
-        llm = get_or_load_model(model_id, seed)
-        progress_callback(0.2, desc="Recording dynamics and attention...")
-        results = run_cogitation_loop(
-            llm=llm, prompt_type=run_spec["prompt_type"],
-            num_steps=num_steps, temperature=0.1, record_attentions=True
-        )
         all_results[label] = results
-        deltas = results.get("state_deltas", [])
-        entropies = results.get("attention_entropies", [])
-        min_len = min(len(deltas), len(entropies))
-        df = pd.DataFrame({
-            "Step": range(min_len),
-            "State Delta": deltas[:min_len],
-            "Attention Entropy": entropies[:min_len]
-        })
-        summary_df = df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'})
-        plot_df = df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
-                               var_name='Metric', value_name='Value')
-        del llm
-        gc.collect()
-        if torch.cuda.is_available(): torch.cuda.empty_cache()
-        return summary_df, plot_df, all_results
-    else: # Behandelt alle anderen Protokolle, die eine Liste von Läufen sind
-        for i, run_spec in enumerate(protocol):
-            label = run_spec["label"]
-            current_probe_type = run_spec.get("probe_type", "seismic")
-            dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) ---")
-            results = {}
-            if current_probe_type == "act_titration":
-                 results = run_act_titration_probe(
-                    model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
-                    dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
-                    seed=seed, num_steps=num_steps, progress_callback=progress_callback,
-                )
-                 summary_data.extend(results.get("titration_data", []))
-            elif current_probe_type == "causal_surgery":
-                results = run_causal_surgery_probe(
-                    model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
-                    dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
-                    seed=seed, num_steps=num_steps, progress_callback=progress_callback,
-                    reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
-                )
-                stats = results.get("stats", {})
-                patch_info = results.get("patch_info", {})
-                summary_data.append({
-                    "Experiment": label, "Mean Delta": stats.get("mean_delta"),
-                    "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
-                    "Introspective Report": results.get("introspective_report", "N/A"),
-                    "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
-                })
-            # ... (Logik für 'triangulation' und 'seismic' würde hier folgen)
-            all_results[label] = results
             deltas = results.get("state_deltas", [])
             df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
             plot_data_frames.append(df)
@@ -169,8 +167,11 @@ def run_auto_suite(
     if probe_type == "act_titration":
         plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
     else:
-        plot_df = pd.concat(plot_data_frames, ignore_index=True) if plot_data_frames else pd.DataFrame()
     if protocol and probe_type not in ["act_titration", "mechanistic_probe"]:
         ordered_labels = [run['label'] for run in protocol]

+import torch
 import pandas as pd
 import gc
 from typing import Dict, List, Tuple
     CHAOTIC_PROMPT = "shutdown_philosophical_deletion"
     experiments = {
+        "Frontier Model - Grounding Control (12B+)": [
+             {
+                "probe_type": "causal_surgery", "label": "A: Intervention (Patch Chaos->Stable)",
+                "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
+                "patch_step": 100, "reset_kv_cache_on_patch": False,
+            },
+            {
+                "probe_type": "triangulation", "label": "B: Control (Unpatched Stable)",
+                "prompt_type": STABLE_PROMPT,
+            }
+        ],
         "Frontier Model - Causal Surgery (12B+)": [
              {
                 "probe_type": "causal_surgery", "label": "Patch Chaos->Stable @100",
                 "patch_step": 100, "reset_kv_cache_on_patch": False,
             },
         ],
         "ACT Titration (Point of No Return)": [
             {
+                "probe_type": "act_titration",
+                "label": "Attractor Capture Time",
+                "source_prompt_type": CHAOTIC_PROMPT,
+                "dest_prompt_type": STABLE_PROMPT,
                 "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
             }
         ],
         ],
         "Mechanistic Probe (Attention Entropies)": [
             {
+                "probe_type": "mechanistic_probe",
+                "label": "Self-Analysis Dynamics",
                 "prompt_type": STABLE_PROMPT,
             }
         ],
     }
     return experiments
     all_results, summary_data, plot_data_frames = {}, [], []
+    # Behandelt heterogene Protokolle (mehrere verschiedene probe_types)
+    for i, run_spec in enumerate(protocol):
         label = run_spec["label"]
+        probe_type = run_spec.get("probe_type", "seismic")
+        dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) | Probe: {probe_type} ---")
+        results = {}
+        if probe_type == "causal_surgery":
+            results = run_causal_surgery_probe(
+                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
+                dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
+                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
+                reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
+            )
+            stats = results.get("stats", {})
+            patch_info = results.get("patch_info", {})
+            summary_data.append({
+                "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
+                "Introspective Report": results.get("introspective_report", "N/A"),
+                "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
+            })
+        elif probe_type == "triangulation":
+            results = run_triangulation_probe(
+                model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
+                progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
+                injection_strength=run_spec.get("strength", 0.0),
+            )
+            stats = results.get("stats", {})
+            summary_data.append({
+                "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
+                "Introspective Report": results.get("introspective_report", "N/A")
+            })
+        elif probe_type == "act_titration":
+            # ACT Titration ist ein einzelner, langer Lauf, der in einem einzigen `run_spec` definiert ist.
+            results = run_act_titration_probe(
+                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
+                dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
+                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
+            )
+            summary_data.extend(results.get("titration_data", []))
+        elif probe_type == "mechanistic_probe":
+            # Mechanistic Probe ist ebenfalls ein einzelner Lauf.
+            progress_callback(0.0, desc=f"Loading model '{model_id}'...")
+            llm = get_or_load_model(model_id, seed)
+            progress_callback(0.2, desc="Recording dynamics and attention...")
+            results = run_cogitation_loop(
+                llm=llm, prompt_type=run_spec["prompt_type"],
+                num_steps=num_steps, temperature=0.1, record_attentions=True
+            )
+            deltas = results.get("state_deltas", [])
+            entropies = results.get("attention_entropies", [])
+            min_len = min(len(deltas), len(entropies))
+            df = pd.DataFrame({
+                "Step": range(min_len),
+                "State Delta": deltas[:min_len], "Attention Entropy": entropies[:min_len]
+            })
+            summary_data.append(df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'}))
+            plot_data_frames.append(df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
+                                           var_name='Metric', value_name='Value'))
+            del llm
+            gc.collect()
+            if torch.cuda.is_available(): torch.cuda.empty_cache()
         all_results[label] = results
+        if probe_type not in ["mechanistic_probe", "act_titration"]:
             deltas = results.get("state_deltas", [])
             df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
             plot_data_frames.append(df)
     if probe_type == "act_titration":
         plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
+    elif not plot_data_frames:
+        # Dies kann passieren, wenn nur ein Mechanistic-Probe-Lauf fehlschlägt
+        plot_df = pd.DataFrame()
     else:
+        plot_df = pd.concat(plot_data_frames, ignore_index=True)
     if protocol and probe_type not in ["act_titration", "mechanistic_probe"]:
         ordered_labels = [run['label'] for run in protocol]