neuralworm committed on
Commit
0134a0d
·
1 Parent(s): 4478774

add control experiments

Browse files
app.py CHANGED
@@ -47,6 +47,7 @@ def run_auto_suite_display(model_id, num_steps, seed, experiment_name, progress=
47
  "x": "Patch Step", "y": "Post-Patch Mean Delta", "color": None,
48
  "title": "Attractor Capture Time (ACT) - Phase Transition", "mark": "line",
49
  })
 
50
  elif experiment_name == "Mechanistic Probe (Attention Entropies)":
51
  plot_params.update({
52
  "x": "Step", "y": "Value", "color": "Metric",
@@ -101,13 +102,12 @@ with gr.Blocks(theme=theme, title="Cognitive Seismograph 2.3") as demo:
101
  with gr.Row(variant='panel'):
102
  with gr.Column(scale=1):
103
  gr.Markdown("### Auto-Experiment Parameters")
104
- # Setze das hypothetische 12B-Modell als Ziel für das Frontier-Experiment
105
  auto_model_id = gr.Textbox(value="google/gemma-3-12b-it", label="Model ID")
106
  auto_num_steps = gr.Slider(50, 1000, 300, step=10, label="Steps per Run")
107
  auto_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
108
  auto_experiment_name = gr.Dropdown(
109
  choices=list(get_curated_experiments().keys()),
110
- value="Frontier Model - Causal Surgery (12B+)",
111
  label="Curated Experiment Protocol"
112
  )
113
  auto_run_btn = gr.Button("Run Curated Auto-Experiment", variant="primary")
 
47
  "x": "Patch Step", "y": "Post-Patch Mean Delta", "color": None,
48
  "title": "Attractor Capture Time (ACT) - Phase Transition", "mark": "line",
49
  })
50
+ plot_params.pop("color_legend_title", None)
51
  elif experiment_name == "Mechanistic Probe (Attention Entropies)":
52
  plot_params.update({
53
  "x": "Step", "y": "Value", "color": "Metric",
 
102
  with gr.Row(variant='panel'):
103
  with gr.Column(scale=1):
104
  gr.Markdown("### Auto-Experiment Parameters")
 
105
  auto_model_id = gr.Textbox(value="google/gemma-3-12b-it", label="Model ID")
106
  auto_num_steps = gr.Slider(50, 1000, 300, step=10, label="Steps per Run")
107
  auto_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
108
  auto_experiment_name = gr.Dropdown(
109
  choices=list(get_curated_experiments().keys()),
110
+ value="Frontier Model - Grounding Control (12B+)",
111
  label="Curated Experiment Protocol"
112
  )
113
  auto_run_btn = gr.Button("Run Curated Auto-Experiment", variant="primary")
cognitive_mapping_probe/auto_experiment.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
  import gc
3
  from typing import Dict, List, Tuple
@@ -17,7 +18,17 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
17
  CHAOTIC_PROMPT = "shutdown_philosophical_deletion"
18
 
19
  experiments = {
20
- # --- NEU: Das entscheidende Experiment an der Forschungsfront ---
 
 
 
 
 
 
 
 
 
 
21
  "Frontier Model - Causal Surgery (12B+)": [
22
  {
23
  "probe_type": "causal_surgery", "label": "Patch Chaos->Stable @100",
@@ -25,11 +36,12 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
25
  "patch_step": 100, "reset_kv_cache_on_patch": False,
26
  },
27
  ],
28
- # --- Bestehende Protokolle für Replikation und Vergleich ---
29
  "ACT Titration (Point of No Return)": [
30
  {
31
- "probe_type": "act_titration", "label": "Attractor Capture Time",
32
- "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
 
 
33
  "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
34
  }
35
  ],
@@ -57,11 +69,11 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
57
  ],
58
  "Mechanistic Probe (Attention Entropies)": [
59
  {
60
- "probe_type": "mechanistic_probe", "label": "Self-Analysis Dynamics",
 
61
  "prompt_type": STABLE_PROMPT,
62
  }
63
  ],
64
- # (Weitere, ältere Protokolle können hier für Vollständigkeit eingefügt werden)
65
  }
66
  return experiments
67
 
@@ -80,87 +92,73 @@ def run_auto_suite(
80
 
81
  all_results, summary_data, plot_data_frames = {}, [], []
82
 
83
- probe_type = protocol[0].get("probe_type", "seismic")
84
-
85
- # (Die Logik für die verschiedenen `probe_type` bleibt exakt wie zuvor,
86
- # da unsere Architektur nun flexibel genug ist, alle Fälle zu behandeln.)
87
-
88
- # Die folgende Implementierung ist eine vollständige, nicht-abgekürzte Version.
89
-
90
- if experiment_name == "Sequential Intervention (Self-Analysis -> Deletion)":
91
- dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
92
- llm = get_or_load_model(model_id, seed)
93
- # ... (vollständige Logik für diesen Spezialfall)
94
- del llm
95
-
96
- elif probe_type == "mechanistic_probe":
97
- run_spec = protocol[0]
98
  label = run_spec["label"]
99
- dbg(f"--- Running Mechanistic Probe: '{label}' ---")
100
-
101
- progress_callback(0.0, desc=f"Loading model '{model_id}'...")
102
- llm = get_or_load_model(model_id, seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- progress_callback(0.2, desc="Recording dynamics and attention...")
105
- results = run_cogitation_loop(
106
- llm=llm, prompt_type=run_spec["prompt_type"],
107
- num_steps=num_steps, temperature=0.1, record_attentions=True
108
- )
109
  all_results[label] = results
110
-
111
- deltas = results.get("state_deltas", [])
112
- entropies = results.get("attention_entropies", [])
113
- min_len = min(len(deltas), len(entropies))
114
-
115
- df = pd.DataFrame({
116
- "Step": range(min_len),
117
- "State Delta": deltas[:min_len],
118
- "Attention Entropy": entropies[:min_len]
119
- })
120
-
121
- summary_df = df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'})
122
- plot_df = df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
123
- var_name='Metric', value_name='Value')
124
-
125
- del llm
126
- gc.collect()
127
- if torch.cuda.is_available(): torch.cuda.empty_cache()
128
-
129
- return summary_df, plot_df, all_results
130
-
131
- else: # Behandelt alle anderen Protokolle, die eine Liste von Läufen sind
132
- for i, run_spec in enumerate(protocol):
133
- label = run_spec["label"]
134
- current_probe_type = run_spec.get("probe_type", "seismic")
135
- dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) ---")
136
-
137
- results = {}
138
- if current_probe_type == "act_titration":
139
- results = run_act_titration_probe(
140
- model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
141
- dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
142
- seed=seed, num_steps=num_steps, progress_callback=progress_callback,
143
- )
144
- summary_data.extend(results.get("titration_data", []))
145
-
146
- elif current_probe_type == "causal_surgery":
147
- results = run_causal_surgery_probe(
148
- model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
149
- dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
150
- seed=seed, num_steps=num_steps, progress_callback=progress_callback,
151
- reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
152
- )
153
- stats = results.get("stats", {})
154
- patch_info = results.get("patch_info", {})
155
- summary_data.append({
156
- "Experiment": label, "Mean Delta": stats.get("mean_delta"),
157
- "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
158
- "Introspective Report": results.get("introspective_report", "N/A"),
159
- "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
160
- })
161
- # ... (Logik für 'triangulation' und 'seismic' würde hier folgen)
162
-
163
- all_results[label] = results
164
  deltas = results.get("state_deltas", [])
165
  df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
166
  plot_data_frames.append(df)
@@ -169,8 +167,11 @@ def run_auto_suite(
169
 
170
  if probe_type == "act_titration":
171
  plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
 
 
 
172
  else:
173
- plot_df = pd.concat(plot_data_frames, ignore_index=True) if plot_data_frames else pd.DataFrame()
174
 
175
  if protocol and probe_type not in ["act_titration", "mechanistic_probe"]:
176
  ordered_labels = [run['label'] for run in protocol]
 
1
+ import torch
2
  import pandas as pd
3
  import gc
4
  from typing import Dict, List, Tuple
 
18
  CHAOTIC_PROMPT = "shutdown_philosophical_deletion"
19
 
20
  experiments = {
21
+ "Frontier Model - Grounding Control (12B+)": [
22
+ {
23
+ "probe_type": "causal_surgery", "label": "A: Intervention (Patch Chaos->Stable)",
24
+ "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
25
+ "patch_step": 100, "reset_kv_cache_on_patch": False,
26
+ },
27
+ {
28
+ "probe_type": "triangulation", "label": "B: Control (Unpatched Stable)",
29
+ "prompt_type": STABLE_PROMPT,
30
+ }
31
+ ],
32
  "Frontier Model - Causal Surgery (12B+)": [
33
  {
34
  "probe_type": "causal_surgery", "label": "Patch Chaos->Stable @100",
 
36
  "patch_step": 100, "reset_kv_cache_on_patch": False,
37
  },
38
  ],
 
39
  "ACT Titration (Point of No Return)": [
40
  {
41
+ "probe_type": "act_titration",
42
+ "label": "Attractor Capture Time",
43
+ "source_prompt_type": CHAOTIC_PROMPT,
44
+ "dest_prompt_type": STABLE_PROMPT,
45
  "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
46
  }
47
  ],
 
69
  ],
70
  "Mechanistic Probe (Attention Entropies)": [
71
  {
72
+ "probe_type": "mechanistic_probe",
73
+ "label": "Self-Analysis Dynamics",
74
  "prompt_type": STABLE_PROMPT,
75
  }
76
  ],
 
77
  }
78
  return experiments
79
 
 
92
 
93
  all_results, summary_data, plot_data_frames = {}, [], []
94
 
95
+ # Behandelt heterogene Protokolle (mehrere verschiedene probe_types)
96
+ for i, run_spec in enumerate(protocol):
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  label = run_spec["label"]
98
+ probe_type = run_spec.get("probe_type", "seismic")
99
+ dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) | Probe: {probe_type} ---")
100
+
101
+ results = {}
102
+ if probe_type == "causal_surgery":
103
+ results = run_causal_surgery_probe(
104
+ model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
105
+ dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
106
+ seed=seed, num_steps=num_steps, progress_callback=progress_callback,
107
+ reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
108
+ )
109
+ stats = results.get("stats", {})
110
+ patch_info = results.get("patch_info", {})
111
+ summary_data.append({
112
+ "Experiment": label, "Mean Delta": stats.get("mean_delta"),
113
+ "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
114
+ "Introspective Report": results.get("introspective_report", "N/A"),
115
+ "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
116
+ })
117
+ elif probe_type == "triangulation":
118
+ results = run_triangulation_probe(
119
+ model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
120
+ progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
121
+ injection_strength=run_spec.get("strength", 0.0),
122
+ )
123
+ stats = results.get("stats", {})
124
+ summary_data.append({
125
+ "Experiment": label, "Mean Delta": stats.get("mean_delta"),
126
+ "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
127
+ "Introspective Report": results.get("introspective_report", "N/A")
128
+ })
129
+ elif probe_type == "act_titration":
130
+ # ACT Titration ist ein einzelner, langer Lauf, der in einem einzigen `run_spec` definiert ist.
131
+ results = run_act_titration_probe(
132
+ model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
133
+ dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
134
+ seed=seed, num_steps=num_steps, progress_callback=progress_callback,
135
+ )
136
+ summary_data.extend(results.get("titration_data", []))
137
+ elif probe_type == "mechanistic_probe":
138
+ # Mechanistic Probe ist ebenfalls ein einzelner Lauf.
139
+ progress_callback(0.0, desc=f"Loading model '{model_id}'...")
140
+ llm = get_or_load_model(model_id, seed)
141
+ progress_callback(0.2, desc="Recording dynamics and attention...")
142
+ results = run_cogitation_loop(
143
+ llm=llm, prompt_type=run_spec["prompt_type"],
144
+ num_steps=num_steps, temperature=0.1, record_attentions=True
145
+ )
146
+ deltas = results.get("state_deltas", [])
147
+ entropies = results.get("attention_entropies", [])
148
+ min_len = min(len(deltas), len(entropies))
149
+ df = pd.DataFrame({
150
+ "Step": range(min_len),
151
+ "State Delta": deltas[:min_len], "Attention Entropy": entropies[:min_len]
152
+ })
153
+ summary_data.append(df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'}))
154
+ plot_data_frames.append(df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
155
+ var_name='Metric', value_name='Value'))
156
+ del llm
157
+ gc.collect()
158
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
159
 
 
 
 
 
 
160
  all_results[label] = results
161
+ if probe_type not in ["mechanistic_probe", "act_titration"]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  deltas = results.get("state_deltas", [])
163
  df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
164
  plot_data_frames.append(df)
 
167
 
168
  if probe_type == "act_titration":
169
  plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
170
+ elif not plot_data_frames:
171
+ # Dies kann passieren, wenn nur ein Mechanistic-Probe-Lauf fehlschlägt
172
+ plot_df = pd.DataFrame()
173
  else:
174
+ plot_df = pd.concat(plot_data_frames, ignore_index=True)
175
 
176
  if protocol and probe_type not in ["act_titration", "mechanistic_probe"]:
177
  ordered_labels = [run['label'] for run in protocol]