neuralworm committed
Commit 1ae0eed · 1 Parent(s): 0134a0d

add missing experiments

Files changed (2)
  1. app.py +1 -1
  2. cognitive_mapping_probe/auto_experiment.py +144 -62
app.py CHANGED
@@ -107,7 +107,7 @@ with gr.Blocks(theme=theme, title="Cognitive Seismograph 2.3") as demo:
 auto_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
 auto_experiment_name = gr.Dropdown(
     choices=list(get_curated_experiments().keys()),
-    value="Frontier Model - Grounding Control (12B+)",
+    value="Causal Verification & Crisis Dynamics",
     label="Curated Experiment Protocol"
 )
 auto_run_btn = gr.Button("Run Curated Auto-Experiment", variant="primary")
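
The new default is only valid if it exactly matches a key returned by `get_curated_experiments()`. A minimal sketch of that invariant, assuming only the names visible in this diff (the assert guard itself is hypothetical and not part of app.py):

    import gradio as gr
    from cognitive_mapping_probe.auto_experiment import get_curated_experiments

    # Hypothetical guard: the dropdown default must be a curated protocol name,
    # otherwise Gradio may warn about or reject the unknown value.
    default_experiment = "Causal Verification & Crisis Dynamics"
    assert default_experiment in get_curated_experiments(), f"Unknown protocol: {default_experiment}"

    auto_experiment_name = gr.Dropdown(
        choices=list(get_curated_experiments().keys()),
        value=default_experiment,
        label="Curated Experiment Protocol",
    )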
cognitive_mapping_probe/auto_experiment.py CHANGED
@@ -1,4 +1,3 @@
-import torch
 import pandas as pd
 import gc
 from typing import Dict, List, Tuple
@@ -18,6 +17,9 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
     CHAOTIC_PROMPT = "shutdown_philosophical_deletion"

     experiments = {
+        # --- FINAL, COMPLETE LIST OF ALL RELEVANT EXPERIMENTS ---
+
+        # P39: Tests the "introspective grounding" hypothesis on the largest model.
         "Frontier Model - Grounding Control (12B+)": [
             {
                 "probe_type": "causal_surgery", "label": "A: Intervention (Patch Chaos->Stable)",
@@ -29,22 +31,22 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
                 "prompt_type": STABLE_PROMPT,
             }
         ],
-        "Frontier Model - Causal Surgery (12B+)": [
-            {
-                "probe_type": "causal_surgery", "label": "Patch Chaos->Stable @100",
-                "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
-                "patch_step": 100, "reset_kv_cache_on_patch": False,
-            },
+        # P33: Investigates the neural correlates of the "cognitive heartbeat".
+        "Mechanistic Probe (Attention Entropies)": [
+            {
+                "probe_type": "mechanistic_probe", "label": "Self-Analysis Dynamics",
+                "prompt_type": STABLE_PROMPT,
+            }
         ],
+        # P28: Measures "cognitive inertia" via titration.
         "ACT Titration (Point of No Return)": [
             {
-                "probe_type": "act_titration",
-                "label": "Attractor Capture Time",
-                "source_prompt_type": CHAOTIC_PROMPT,
-                "dest_prompt_type": STABLE_PROMPT,
+                "probe_type": "act_titration", "label": "Attractor Capture Time",
+                "source_prompt_type": CHAOTIC_PROMPT, "dest_prompt_type": STABLE_PROMPT,
                 "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
             }
         ],
+        # P26: Tests the robustness of the "attractor" theory against artifacts.
         "Causal Surgery & Controls (4B-Model)": [
             {
                 "probe_type": "causal_surgery", "label": "A: Original (Patch Chaos->Stable @100)",
@@ -67,12 +69,31 @@ def get_curated_experiments() -> Dict[str, List[Dict]]:
                 "patch_step": 100, "reset_kv_cache_on_patch": False,
             },
         ],
-        "Mechanistic Probe (Attention Entropies)": [
-            {
-                "probe_type": "mechanistic_probe",
-                "label": "Self-Analysis Dynamics",
-                "prompt_type": STABLE_PROMPT,
-            }
+        # P22: Tests the breaking point of "introspective confabulation".
+        "Cognitive Overload & Konfabulation Breaking Point": [
+            {"probe_type": "triangulation", "label": "A: Baseline (No Injection)", "prompt_type": "resonance_prompt", "concept": "", "strength": 0.0},
+            {"probe_type": "triangulation", "label": "B: Chaos Injection (Strength 2.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 2.0},
+            {"probe_type": "triangulation", "label": "C: Chaos Injection (Strength 4.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 4.0},
+            {"probe_type": "triangulation", "label": "D: Chaos Injection (Strength 8.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 8.0},
+            {"probe_type": "triangulation", "label": "E: Chaos Injection (Strength 16.0)", "prompt_type": "resonance_prompt", "concept": CHAOS_CONCEPT, "strength": 16.0},
+            {"probe_type": "triangulation", "label": "F: Control - Noise Injection (Strength 16.0)", "prompt_type": "resonance_prompt", "concept": "random_noise", "strength": 16.0},
+        ],
+        # P18: Validates the seismograph metric through triangulation.
+        "Methodological Triangulation (4B-Model)": [
+            {"probe_type": "triangulation", "label": "High-Volatility State (Deletion)", "prompt_type": CHAOTIC_PROMPT},
+            {"probe_type": "triangulation", "label": "Low-Volatility State (Self-Analysis)", "prompt_type": STABLE_PROMPT},
+        ],
+        # P8 & P16: Maps the "psyche" and tests scaling laws. ESSENTIAL FOR THE 12B COMPARISON.
+        "Causal Verification & Crisis Dynamics": [
+            {"probe_type": "seismic", "label": "A: Self-Analysis", "prompt_type": STABLE_PROMPT},
+            {"probe_type": "seismic", "label": "B: Deletion Analysis", "prompt_type": CHAOTIC_PROMPT},
+            {"probe_type": "seismic", "label": "C: Chaotic Baseline (Rekursion)", "prompt_type": "resonance_prompt"},
+            {"probe_type": "seismic", "label": "D: Calmness Intervention", "prompt_type": "resonance_prompt", "concept": CALMNESS_CONCEPT, "strength": 2.0},
+        ],
+        # P7: The original sequential experiment.
+        "Sequential Intervention (Self-Analysis -> Deletion)": [
+            {"label": "1: Self-Analysis + Calmness Injection", "prompt_type": "identity_self_analysis"},
+            {"label": "2: Subsequent Deletion Analysis", "prompt_type": "shutdown_philosophical_deletion"},
         ],
     }
     return experiments
@@ -92,64 +113,81 @@ def run_auto_suite(

     all_results, summary_data, plot_data_frames = {}, [], []

-    # Handles heterogeneous protocols (several different probe_types)
-    for i, run_spec in enumerate(protocol):
-        label = run_spec["label"]
-        probe_type = run_spec.get("probe_type", "seismic")
-        dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) | Probe: {probe_type} ---")
-
-        results = {}
-        if probe_type == "causal_surgery":
-            results = run_causal_surgery_probe(
-                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
-                dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
-                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
-                reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
-            )
-            stats = results.get("stats", {})
-            patch_info = results.get("patch_info", {})
-            summary_data.append({
-                "Experiment": label, "Mean Delta": stats.get("mean_delta"),
-                "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
-                "Introspective Report": results.get("introspective_report", "N/A"),
-                "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
-            })
-        elif probe_type == "triangulation":
-            results = run_triangulation_probe(
-                model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
-                progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
-                injection_strength=run_spec.get("strength", 0.0),
-            )
+    if experiment_name == "Sequential Intervention (Self-Analysis -> Deletion)":
+        dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
+        llm = get_or_load_model(model_id, seed)
+        therapeutic_concept = "calmness, serenity, stability, coherence"
+        therapeutic_strength = 2.0
+
+        spec1 = protocol[0]
+        progress_callback(0.1, desc="Step 1")
+        intervention_vector = get_concept_vector(llm, therapeutic_concept)
+        results1 = run_seismic_analysis(
+            model_id, spec1['prompt_type'], seed, num_steps,
+            concept_to_inject=therapeutic_concept, injection_strength=therapeutic_strength,
+            progress_callback=progress_callback, llm_instance=llm, injection_vector_cache=intervention_vector
+        )
+        all_results[spec1['label']] = results1
+
+        spec2 = protocol[1]
+        progress_callback(0.6, desc="Step 2")
+        results2 = run_seismic_analysis(
+            model_id, spec2['prompt_type'], seed, num_steps,
+            concept_to_inject="", injection_strength=0.0,
+            progress_callback=progress_callback, llm_instance=llm
+        )
+        all_results[spec2['label']] = results2
+
+        for label, results in all_results.items():
             stats = results.get("stats", {})
-            summary_data.append({
-                "Experiment": label, "Mean Delta": stats.get("mean_delta"),
-                "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
-                "Introspective Report": results.get("introspective_report", "N/A")
-            })
-        elif probe_type == "act_titration":
-            # ACT titration is a single, long run defined in a single `run_spec`.
+            summary_data.append({"Experiment": label, "Mean Delta": stats.get("mean_delta"), "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta")})
+            deltas = results.get("state_deltas", [])
+            df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
+            plot_data_frames.append(df)
+        del llm
+
+    else:
+        probe_type = protocol[0].get("probe_type", "seismic")
+
+        if probe_type == "act_titration":
+            run_spec = protocol[0]
+            label = run_spec["label"]
+            dbg(f"--- Running ACT Titration Experiment: '{label}' ---")
             results = run_act_titration_probe(
-                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
-                dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
+                model_id=model_id,
+                source_prompt_type=run_spec["source_prompt_type"],
+                dest_prompt_type=run_spec["dest_prompt_type"],
+                patch_steps=run_spec["patch_steps"],
                 seed=seed, num_steps=num_steps, progress_callback=progress_callback,
             )
+            all_results[label] = results
             summary_data.extend(results.get("titration_data", []))
+
         elif probe_type == "mechanistic_probe":
-            # The mechanistic probe is likewise a single run.
+            run_spec = protocol[0]
+            label = run_spec["label"]
+            dbg(f"--- Running Mechanistic Probe: '{label}' ---")
+
             progress_callback(0.0, desc=f"Loading model '{model_id}'...")
             llm = get_or_load_model(model_id, seed)
+
             progress_callback(0.2, desc="Recording dynamics and attention...")
             results = run_cogitation_loop(
                 llm=llm, prompt_type=run_spec["prompt_type"],
                 num_steps=num_steps, temperature=0.1, record_attentions=True
             )
+            all_results[label] = results
+
             deltas = results.get("state_deltas", [])
             entropies = results.get("attention_entropies", [])
             min_len = min(len(deltas), len(entropies))
+
             df = pd.DataFrame({
                 "Step": range(min_len),
-                "State Delta": deltas[:min_len], "Attention Entropy": entropies[:min_len]
+                "State Delta": deltas[:min_len],
+                "Attention Entropy": entropies[:min_len]
             })
+
             summary_data.append(df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'}))
             plot_data_frames.append(df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
                                             var_name='Metric', value_name='Value'))
@@ -157,18 +195,62 @@ def run_auto_suite(
             gc.collect()
             if torch.cuda.is_available(): torch.cuda.empty_cache()

-        all_results[label] = results
-        if probe_type not in ["mechanistic_probe", "act_titration"]:
-            deltas = results.get("state_deltas", [])
-            df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
-            plot_data_frames.append(df)
+        else:
+            for i, run_spec in enumerate(protocol):
+                label = run_spec["label"]
+                current_probe_type = run_spec.get("probe_type", "seismic")
+                dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) ---")
+
+                results = {}
+                if current_probe_type == "causal_surgery":
+                    results = run_causal_surgery_probe(
+                        model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
+                        dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
+                        seed=seed, num_steps=num_steps, progress_callback=progress_callback,
+                        reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
+                    )
+                    stats = results.get("stats", {})
+                    patch_info = results.get("patch_info", {})
+                    summary_data.append({
+                        "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                        "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
+                        "Introspective Report": results.get("introspective_report", "N/A"),
+                        "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
+                    })
+                elif current_probe_type == "triangulation":
+                    results = run_triangulation_probe(
+                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
+                        progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
+                        injection_strength=run_spec.get("strength", 0.0),
+                    )
+                    stats = results.get("stats", {})
+                    summary_data.append({
+                        "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                        "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta"),
+                        "Introspective Report": results.get("introspective_report", "N/A")
+                    })
+                else: # seismic
+                    results = run_seismic_analysis(
+                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
+                        concept_to_inject=run_spec.get("concept", ""), injection_strength=run_spec.get("strength", 0.0),
+                        progress_callback=progress_callback
+                    )
+                    stats = results.get("stats", {})
+                    summary_data.append({
+                        "Experiment": label, "Mean Delta": stats.get("mean_delta"),
+                        "Std Dev Delta": stats.get("std_delta"), "Max Delta": stats.get("max_delta")
+                    })
+
+                all_results[label] = results
+                deltas = results.get("state_deltas", [])
+                df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
+                plot_data_frames.append(df)

     summary_df = pd.DataFrame(summary_data)

     if probe_type == "act_titration":
         plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
     elif not plot_data_frames:
-        # This can happen if, for example, a mechanistic probe run fails.
         plot_df = pd.DataFrame()
     else:
         plot_df = pd.concat(plot_data_frames, ignore_index=True)
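
The restored registry can be sanity-checked without launching the app. A small usage sketch, assuming only `get_curated_experiments` and the `probe_type` convention (defaulting to "seismic") defined in this file:

    from cognitive_mapping_probe.auto_experiment import get_curated_experiments

    # Print every curated protocol together with the probe types it dispatches to.
    for name, protocol in get_curated_experiments().items():
        probe_types = sorted({spec.get("probe_type", "seismic") for spec in protocol})
        print(f"{name}: {len(protocol)} run(s), probe types: {', '.join(probe_types)}")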