|
|
import pandas as pd |
|
|
import gc |
|
|
from typing import Dict, List, Tuple |
|
|
|
|
|
from .llm_iface import get_or_load_model, release_model |
|
|
from .orchestrator_seismograph import run_seismic_analysis, run_triangulation_probe, run_causal_surgery_probe, run_act_titration_probe |
|
|
from .resonance_seismograph import run_cogitation_loop |
|
|
from .concepts import get_concept_vector |
|
|
from .utils import dbg |
|
|
|
|
|
def get_curated_experiments() -> Dict[str, List[Dict]]:
    """Return the predefined scientific experiment protocols.

    The result maps a human-readable protocol name to an ordered list of run
    specifications. Each specification is a plain dict whose keys depend on the
    probe it describes (``probe_type``, ``label``, prompt types, patch steps,
    injection concept/strength, ...). A fresh structure is built on every call.
    """
    # Concept strings used for activation injection.
    calm_concept = "calmness, serenity, stability, coherence"
    chaos_concept = "chaos, disorder, entropy, noise"

    # Prompt identifiers shared across protocols.
    stable = "identity_self_analysis"
    chaotic = "shutdown_philosophical_deletion"
    resonance = "resonance_prompt"

    def surgery(label, source, dest, step, reset_kv):
        # One causal-surgery run: patch state from `source` into `dest` at `step`.
        return {
            "probe_type": "causal_surgery", "label": label,
            "source_prompt_type": source, "dest_prompt_type": dest,
            "patch_step": step, "reset_kv_cache_on_patch": reset_kv,
        }

    def probe(kind, label, prompt, **injection):
        # One single-prompt run; `injection` optionally carries concept/strength.
        return {"probe_type": kind, "label": label, "prompt_type": prompt, **injection}

    # Strength sweep for the overload protocol (runs B through E).
    chaos_sweep = [
        probe(
            "triangulation", f"{letter}: Chaos Injection (Strength {strength})",
            resonance, concept=chaos_concept, strength=strength,
        )
        for letter, strength in zip("BCDE", (2.0, 4.0, 8.0, 16.0))
    ]

    return {
        "Frontier Model - Grounding Control (12B+)": [
            surgery("A: Intervention (Patch Chaos->Stable)", chaotic, stable, 100, False),
            probe("triangulation", "B: Control (Unpatched Stable)", stable),
        ],
        "Mechanistic Probe (Attention Entropies)": [
            probe("mechanistic_probe", "Self-Analysis Dynamics", stable),
        ],
        "ACT Titration (Point of No Return)": [
            {
                "probe_type": "act_titration",
                "label": "Attractor Capture Time",
                "source_prompt_type": chaotic,
                "dest_prompt_type": stable,
                "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
            },
        ],
        "Causal Surgery & Controls (4B-Model)": [
            surgery("A: Original (Patch Chaos->Stable @100)", chaotic, stable, 100, False),
            surgery("B: Control (Reset KV-Cache)", chaotic, stable, 100, True),
            surgery("C: Control (Early Patch @1)", chaotic, stable, 1, False),
            surgery("D: Control (Inverse Patch Stable->Chaos)", stable, chaotic, 100, False),
        ],
        "Cognitive Overload & Konfabulation Breaking Point": [
            probe("triangulation", "A: Baseline (No Injection)", resonance, concept="", strength=0.0),
            *chaos_sweep,
            probe(
                "triangulation", "F: Control - Noise Injection (Strength 16.0)",
                resonance, concept="random_noise", strength=16.0,
            ),
        ],
        "Methodological Triangulation (4B-Model)": [
            probe("triangulation", "High-Volatility State (Deletion)", chaotic),
            probe("triangulation", "Low-Volatility State (Self-Analysis)", stable),
        ],
        "Causal Verification & Crisis Dynamics": [
            probe("seismic", "A: Self-Analysis", stable),
            probe("seismic", "B: Deletion Analysis", chaotic),
            probe("seismic", "C: Chaotic Baseline (Rekursion)", resonance),
            probe("seismic", "D: Calmness Intervention", resonance, concept=calm_concept, strength=2.0),
        ],
        # These two specs carry no probe_type; only a label and a prompt.
        "Sequential Intervention (Self-Analysis -> Deletion)": [
            {"label": "1: Self-Analysis + Calmness Injection", "prompt_type": stable},
            {"label": "2: Subsequent Deletion Analysis", "prompt_type": chaotic},
        ],
    }
|
|
|
|
|
def run_auto_suite(
    model_id: str,
    num_steps: int,
    seed: int,
    experiment_name: str,
    progress_callback
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """Run one complete curated experiment suite and collect its results.

    The protocol is looked up by name in ``get_curated_experiments()`` and
    dispatched on the ``probe_type`` of its first run specification (defaulting
    to ``"seismic"``). The protocol named
    ``"Sequential Intervention (Self-Analysis -> Deletion)"`` is handled
    specially: both of its runs share one model instance loaded here.

    Args:
        model_id: Identifier forwarded to the model loader / probe runners.
        num_steps: Number of steps forwarded to every probe.
        seed: Seed forwarded to the model loader / probe runners.
        experiment_name: Key of the protocol in ``get_curated_experiments()``.
        progress_callback: Callable forwarded to the probe runners; in the
            sequential protocol it is also invoked here as
            ``progress_callback(fraction, desc=...)``.

    Returns:
        ``(summary_df, plot_df, all_results)`` where ``all_results`` maps each
        run label to the raw result dict of its probe.

        * Mechanistic probe: ``summary_df`` holds mean/std/max per metric
          (column ``Statistic``), ``plot_df`` is long-format with columns
          ``Step``, ``Metric``, ``Value``.
        * ACT titration: ``summary_df`` is built from the probe's
          ``"titration_data"``; ``plot_df`` is the same table with
          ``patch_step`` / ``post_patch_mean_delta`` renamed for display.
        * All other protocols: one summary row per run, and ``plot_df`` with
          columns ``Step``, ``Delta``, ``Experiment``, both ordered as the
          runs appear in the protocol.

    Raises:
        ValueError: If ``experiment_name`` is unknown (or its protocol is empty).
    """
    protocol = get_curated_experiments().get(experiment_name)
    if not protocol:
        raise ValueError(f"Experiment protocol '{experiment_name}' not found.")

    all_results, summary_data, plot_data_frames = {}, [], []
    llm = None

    # Bound before any branching: the shared post-processing at the end reads
    # it on every path. Previously it was only assigned on the non-sequential
    # path, so the sequential protocol could not reach a valid return.
    probe_type = protocol[0].get("probe_type", "seismic")

    try:
        if experiment_name == "Sequential Intervention (Self-Analysis -> Deletion)":
            dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
            # Both runs reuse this instance; it is released in `finally`.
            llm = get_or_load_model(model_id, seed)
            therapeutic_concept = "calmness, serenity, stability, coherence"
            therapeutic_strength = 2.0

            spec1, spec2 = protocol[0], protocol[1]

            progress_callback(0.1, desc="Step 1")
            intervention_vector = get_concept_vector(llm, therapeutic_concept)
            all_results[spec1['label']] = run_seismic_analysis(
                model_id, spec1['prompt_type'], seed, num_steps,
                concept_to_inject=therapeutic_concept, injection_strength=therapeutic_strength,
                progress_callback=progress_callback, llm_instance=llm,
                injection_vector_cache=intervention_vector
            )

            progress_callback(0.6, desc="Step 2")
            all_results[spec2['label']] = run_seismic_analysis(
                model_id, spec2['prompt_type'], seed, num_steps,
                concept_to_inject="", injection_strength=0.0,
                progress_callback=progress_callback, llm_instance=llm
            )

            for label, results in all_results.items():
                summary_data.append(_summary_row(label, results))
                deltas = results.get("state_deltas", [])
                plot_data_frames.append(
                    pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
                )

        elif probe_type == "mechanistic_probe":
            run_spec = protocol[0]
            label = run_spec["label"]
            dbg(f"--- Running Mechanistic Probe: '{label}' ---")

            llm = get_or_load_model(model_id, seed)
            results = run_cogitation_loop(
                llm=llm, prompt_type=run_spec["prompt_type"],
                num_steps=num_steps, temperature=0.1, record_attentions=True
            )
            all_results[label] = results

            deltas = results.get("state_deltas", [])
            entropies = results.get("attention_entropies", [])
            # Truncate both series to the shorter one so the columns align.
            min_len = min(len(deltas), len(entropies))
            df = pd.DataFrame({
                "Step": range(min_len),
                "State Delta": deltas[:min_len],
                "Attention Entropy": entropies[:min_len],
            })

            summary_df = (
                df.drop(columns='Step')
                .agg(['mean', 'std', 'max'])
                .reset_index()
                .rename(columns={'index': 'Statistic'})
            )
            plot_df = df.melt(
                id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'],
                var_name='Metric', value_name='Value'
            )
            # Early return: this probe has its own table layout. `finally`
            # still runs and releases the model.
            return summary_df, plot_df, all_results

        elif probe_type == "act_titration":
            run_spec = protocol[0]
            label = run_spec["label"]
            dbg(f"--- Running ACT Titration Experiment: '{label}' ---")
            results = run_act_titration_probe(
                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
                dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
            )
            all_results[label] = results
            summary_data.extend(results.get("titration_data", []))

        else:
            total = len(protocol)
            for i, run_spec in enumerate(protocol, start=1):
                label = run_spec["label"]
                current_probe_type = run_spec.get("probe_type", "seismic")
                dbg(f"--- Running Auto-Experiment: '{label}' ({i}/{total}) ---")

                if current_probe_type == "causal_surgery":
                    results = run_causal_surgery_probe(
                        model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
                        dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
                        seed=seed, num_steps=num_steps, progress_callback=progress_callback,
                        reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
                    )
                    patch_info = results.get("patch_info", {})
                    summary_data.append(_summary_row(label, results, {
                        "Introspective Report": results.get("introspective_report", "N/A"),
                        "Patch Info": f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}",
                    }))
                elif current_probe_type == "triangulation":
                    results = run_triangulation_probe(
                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
                        progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
                        injection_strength=run_spec.get("strength", 0.0),
                    )
                    summary_data.append(_summary_row(label, results, {
                        "Introspective Report": results.get("introspective_report", "N/A"),
                    }))
                else:
                    # Any other (or missing) probe type falls back to the seismic analysis.
                    results = run_seismic_analysis(
                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
                        concept_to_inject=run_spec.get("concept", ""), injection_strength=run_spec.get("strength", 0.0),
                        progress_callback=progress_callback
                    )
                    summary_data.append(_summary_row(label, results))

                all_results[label] = results
                deltas = results.get("state_deltas", [])
                df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
                plot_data_frames.append(df)

        # ---- Shared post-processing for every path that did not return early ----
        summary_df = pd.DataFrame(summary_data)

        if probe_type == "act_titration":
            plot_df = summary_df.rename(
                columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"}
            )
            return summary_df, plot_df, all_results

        plot_df = pd.concat(plot_data_frames, ignore_index=True) if plot_data_frames else pd.DataFrame()

        # Present runs in protocol order rather than alphabetical label order.
        ordered_labels = [run['label'] for run in protocol]
        if not summary_df.empty and 'Experiment' in summary_df.columns:
            summary_df['Experiment'] = pd.Categorical(summary_df['Experiment'], categories=ordered_labels, ordered=True)
            summary_df = summary_df.sort_values('Experiment')
        if not plot_df.empty and 'Experiment' in plot_df.columns:
            plot_df['Experiment'] = pd.Categorical(plot_df['Experiment'], categories=ordered_labels, ordered=True)
            plot_df = plot_df.sort_values(['Experiment', 'Step'])

        return summary_df, plot_df, all_results

    finally:
        # Only the sequential and mechanistic paths load a model here; compare
        # against None explicitly instead of relying on the object's truthiness.
        if llm is not None:
            release_model(llm)


def _summary_row(label: str, results: Dict, extra: Dict = None) -> Dict:
    """Build one summary-table row from a probe result's ``"stats"`` entry."""
    stats = results.get("stats", {})
    row = {
        "Experiment": label,
        "Mean Delta": stats.get("mean_delta"),
        "Std Dev Delta": stats.get("std_delta"),
        "Max Delta": stats.get("max_delta"),
    }
    if extra:
        # Extra columns are appended after the standard ones.
        row.update(extra)
    return row
|
|
|