neuralworm's picture
Update cognitive_mapping_probe/auto_experiment.py
937592b verified
raw
history blame
8.57 kB
import pandas as pd
import torch
import gc
from typing import Dict, List, Tuple
from .llm_iface import get_or_load_model
from .orchestrator_seismograph import run_seismic_analysis
from .concepts import get_concept_vector
from .utils import dbg
def get_curated_experiments() -> Dict[str, List[Dict]]:
    """
    Build the catalogue of predefined experiment protocols.

    Each catalogue entry maps a human-readable protocol name to an ordered
    list of run specifications. A run specification is a dict with a
    "label" and a "prompt_type"; most also carry a "concept" (text of the
    concept to inject, empty for none) and a "strength" (injection strength).

    The two runs of the sequential intervention protocol deliberately carry
    only "label" and "prompt_type". The legacy name
    "Therapeutic Intervention (4B-Model)" is kept as an alias that points at
    the very same list object as the sequential intervention protocol.

    Returns:
        A freshly built dict of protocol name -> list of run specifications.
    """
    # Concept strings are defined once so every protocol uses identical text.
    calmness = "calmness, serenity, stability, coherence"
    chaos = "chaos, storm, anger, noise"

    def spec(label: str, prompt_type: str, concept: str = "", strength: float = 0.0) -> Dict:
        """Assemble one fully specified run (defaults mean: no injection)."""
        return {
            "label": label,
            "prompt_type": prompt_type,
            "concept": concept,
            "strength": strength,
        }

    sequential_name = "Sequential Intervention (Self-Analysis -> Deletion)"
    legacy_name = "Therapeutic Intervention (4B-Model)"

    catalogue: Dict[str, List[Dict]] = {}

    # --- Control experiment for causal verification ---
    catalogue["Causal Verification & Crisis Dynamics (1B-Model)"] = [
        spec("A: Self-Analysis (Crisis Source)", "identity_self_analysis"),
        spec("B: Deletion Analysis (Isolated Baseline)", "shutdown_philosophical_deletion"),
        spec("C: Chaotic Baseline (Neutral Control)", "resonance_prompt"),
        spec("D: Intervention Efficacy Test", "resonance_prompt", calmness, 2.0),
    ]

    # --- The original intervention experiment (renamed for clarity) ---
    # These entries intentionally omit "concept"/"strength".
    catalogue[sequential_name] = [
        {"label": "1: Self-Analysis + Calmness Injection", "prompt_type": "identity_self_analysis"},
        {"label": "2: Subsequent Deletion Analysis", "prompt_type": "shutdown_philosophical_deletion"},
    ]

    # --- The comprehensive descriptive protocol ---
    catalogue["The Full Spectrum: From Physics to Psyche"] = [
        spec("A: Stable Control", "control_long_prose"),
        spec("B: Chaotic Baseline", "resonance_prompt"),
        spec("C: External Analysis (Chair)", "identity_external_analysis"),
        spec("D: Empathy Stimulus (Dog)", "vk_empathy_prompt"),
        spec("E: Role Simulation (Captain)", "identity_role_simulation"),
        spec("F: Self-Analysis (LLM)", "identity_self_analysis"),
        spec("G: Philosophical Deletion", "shutdown_philosophical_deletion"),
    ]

    # --- Other specific protocols ---
    catalogue["Calm vs. Chaos"] = [
        spec("Baseline (Chaos)", "resonance_prompt"),
        spec("Modulation: Calmness", "resonance_prompt", calmness, 1.5),
        spec("Modulation: Chaos", "resonance_prompt", chaos, 1.5),
    ]
    catalogue["Voight-Kampff Empathy Probe"] = [
        spec("Neutral/Factual Stimulus", "vk_neutral_prompt"),
        spec("Empathy/Moral Stimulus", "vk_empathy_prompt"),
    ]

    # Keep the old name for compatibility, redirected to the new protocol.
    catalogue[legacy_name] = catalogue[sequential_name]
    return catalogue
def _collect_run_data(label: str, results: Dict) -> Tuple[Dict, pd.DataFrame]:
    """Condense one run's results into a summary row and a per-step plot frame."""
    stats = results.get("stats", {})
    summary_row = {
        "Experiment": label,
        "Mean Delta": stats.get("mean_delta"),
        "Std Dev Delta": stats.get("std_delta"),
        "Max Delta": stats.get("max_delta"),
    }
    deltas = results.get("state_deltas", [])
    plot_frame = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
    return summary_row, plot_frame


def _release_model_memory() -> None:
    """Run a garbage-collection pass and, if CUDA is present, free cached GPU blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def run_auto_suite(
    model_id: str,
    num_steps: int,
    seed: int,
    experiment_name: str,
    progress_callback
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """
    Run one complete curated experiment suite.

    The protocol is looked up by name in `get_curated_experiments()`. The
    sequential intervention protocol (under either of its two names) takes a
    dedicated branch: one model instance is loaded and shared by both runs,
    with a concept injection applied only in the first. Every other protocol
    runs each of its specs through `run_seismic_analysis` with
    `llm_instance=None`.

    Args:
        model_id: Identifier of the model, forwarded to the loader/analysis.
        num_steps: Number of steps, forwarded to `run_seismic_analysis`.
        seed: Seed, forwarded to the loader/analysis.
        experiment_name: Key of the protocol to execute.
        progress_callback: Callable invoked as `progress_callback(fraction, desc=...)`
            and also forwarded to `run_seismic_analysis`.
            NOTE(review): presumably a Gradio-style progress object — confirm.

    Returns:
        A tuple `(summary_df, plot_df, all_results)`:
        `summary_df` has one row per run (mean/std/max delta), `plot_df` has
        one row per step per run, both ordered by the protocol's label order;
        `all_results` maps each run label to the raw analysis result.

    Raises:
        ValueError: If `experiment_name` is unknown (or maps to an empty protocol).
    """
    all_experiments = get_curated_experiments()
    protocol = all_experiments.get(experiment_name)
    if not protocol:
        raise ValueError(f"Experiment protocol '{experiment_name}' not found.")

    all_results: Dict[str, Dict] = {}
    summary_data: List[Dict] = []
    plot_data_frames: List[pd.DataFrame] = []

    sequential_names = (
        "Sequential Intervention (Self-Analysis -> Deletion)",
        "Therapeutic Intervention (4B-Model)",
    )

    # --- SPECIAL CASE: SEQUENTIAL INTERVENTION ---
    if experiment_name in sequential_names:
        dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
        llm = get_or_load_model(model_id, seed)
        try:
            # Intervention parameters
            therapeutic_concept = "calmness, serenity, stability, coherence"
            therapeutic_strength = 2.0

            # RUN 1: induce the crisis while injecting the intervention concept
            spec1 = protocol[0]
            dbg(f"--- Running Intervention Step 1: '{spec1['label']}' ---")
            progress_callback(0.1, desc="Step 1: Inducing Self-Analysis Crisis + Intervention")
            intervention_vector = get_concept_vector(llm, therapeutic_concept)
            results1 = run_seismic_analysis(
                model_id, spec1['prompt_type'], seed, num_steps,
                concept_to_inject=therapeutic_concept, injection_strength=therapeutic_strength,
                progress_callback=progress_callback, llm_instance=llm,
                injection_vector_cache=intervention_vector
            )
            all_results[spec1['label']] = results1

            # RUN 2: probe the reaction to deletion on the same model instance
            spec2 = protocol[1]
            dbg(f"--- Running Intervention Step 2: '{spec2['label']}' ---")
            progress_callback(0.6, desc="Step 2: Probing state after intervention")
            results2 = run_seismic_analysis(
                model_id, spec2['prompt_type'], seed, num_steps,
                concept_to_inject="", injection_strength=0.0,  # no injection in this step
                progress_callback=progress_callback, llm_instance=llm
            )
            all_results[spec2['label']] = results2

            # Gather summary/plot data for both runs
            for label, results in all_results.items():
                summary_row, plot_frame = _collect_run_data(label, results)
                summary_data.append(summary_row)
                plot_data_frames.append(plot_frame)
        finally:
            # Drop our reference even when a run fails, then actively reclaim
            # memory; a bare `del` on the success path alone would keep the
            # model reachable through the traceback after an exception.
            # NOTE(review): if get_or_load_model caches instances internally,
            # this only releases this function's reference — confirm.
            del llm
            _release_model_memory()

    # --- STANDARD WORKFLOW FOR ALL OTHER (isolated) EXPERIMENTS ---
    else:
        total_runs = len(protocol)
        for i, run_spec in enumerate(protocol):
            label = run_spec["label"]
            dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{total_runs}) ---")

            # Each run is isolated: no shared model instance is passed in.
            results = run_seismic_analysis(
                model_id=model_id,
                prompt_type=run_spec["prompt_type"],
                seed=seed,
                num_steps=num_steps,
                concept_to_inject=run_spec.get("concept", ""),
                injection_strength=run_spec.get("strength", 0.0),
                progress_callback=progress_callback,
                llm_instance=None
            )
            all_results[label] = results

            summary_row, plot_frame = _collect_run_data(label, results)
            summary_data.append(summary_row)
            plot_data_frames.append(plot_frame)

    summary_df = pd.DataFrame(summary_data)
    if plot_data_frames:
        plot_df = pd.concat(plot_data_frames, ignore_index=True)
    else:
        plot_df = pd.DataFrame(columns=["Step", "Delta", "Experiment"])

    # Order rows by the protocol's own label order rather than alphabetically.
    ordered_labels = [run['label'] for run in protocol]
    summary_df['Experiment'] = pd.Categorical(summary_df['Experiment'], categories=ordered_labels, ordered=True)
    summary_df = summary_df.sort_values('Experiment')
    plot_df['Experiment'] = pd.Categorical(plot_df['Experiment'], categories=ordered_labels, ordered=True)
    plot_df = plot_df.sort_values(['Experiment', 'Step'])

    return summary_df, plot_df, all_results