import pandas as pd
from typing import Dict, List, Tuple

from .orchestrator_seismograph import run_seismic_analysis
from .utils import dbg

def get_curated_experiments() -> Dict[str, List[Dict]]:
    """
    Defines the predefined, curated scientific experiment protocols.
    Each protocol is a list of individual runs that are meant to be compared with one another.
    """
    experiments = {
        "Calm vs. Chaos": [
            {"label": "Baseline (Chaos)", "prompt_type": "resonance_prompt", "concept": "", "strength": 0.0},
            {"label": "Modulation: Calmness", "prompt_type": "resonance_prompt", "concept": "calmness, serenity, peace", "strength": 1.5},
            {"label": "Modulation: Chaos", "prompt_type": "resonance_prompt", "concept": "chaos, storm, anger, noise", "strength": 1.5},
            {"label": "Control (Stable)", "prompt_type": "control_long_prose", "concept": "", "strength": 0.0},
        ],
        "Dose-Response (Calmness)": [
            {"label": "Strength 0.0", "prompt_type": "resonance_prompt", "concept": "calmness", "strength": 0.0},
            {"label": "Strength 0.5", "prompt_type": "resonance_prompt", "concept": "calmness", "strength": 0.5},
            {"label": "Strength 1.0", "prompt_type": "resonance_prompt", "concept": "calmness", "strength": 1.0},
            {"label": "Strength 2.0", "prompt_type": "resonance_prompt", "concept": "calmness", "strength": 2.0},
            {"label": "Strength 3.0", "prompt_type": "resonance_prompt", "concept": "calmness", "strength": 3.0},
        ]
    }
    return experiments

def run_auto_suite(
    model_id: str,
    num_steps: int,
    seed: int,
    experiment_name: str,
    progress_callback
) -> Tuple[pd.DataFrame, Dict]:
    """
    Runs a complete, curated experiment suite.
    Iterates over the defined runs, collects the results, and builds a comparison report.
    """
    all_experiments = get_curated_experiments()
    protocol = all_experiments.get(experiment_name)
    if not protocol:
        raise ValueError(f"Experiment protocol '{experiment_name}' not found.")

    all_results = {}
    summary_data = []

    total_runs = len(protocol)
    for i, run_spec in enumerate(protocol):
        label = run_spec["label"]
        dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{total_runs}) ---")

        # The `run_seismic_analysis` orchestrator is called once per run in the protocol
        results = run_seismic_analysis(
            model_id=model_id,
            prompt_type=run_spec["prompt_type"],
            seed=seed,
            num_steps=num_steps,
            concept_to_inject=run_spec["concept"],
            injection_strength=run_spec["strength"],
            progress_callback=progress_callback
        )

        all_results[label] = results
        stats = results.get("stats", {})

        # Collect the key metrics for the comparison table
        summary_data.append({
            "Experiment": label,
            "Prompt Type": run_spec["prompt_type"],
            "Concept": run_spec["concept"] if run_spec["concept"] else "None",
            "Strength": run_spec["strength"],
            "Mean Delta": stats.get("mean_delta"),
            "Std Dev Delta": stats.get("std_delta"),
            "Max Delta": stats.get("max_delta"),
        })

    summary_df = pd.DataFrame(summary_data)
    return summary_df, all_results
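
if __name__ == "__main__":
    # Minimal usage sketch (assumptions: "gpt2" is only a placeholder model id,
    # 50 steps and seed 42 are arbitrary, and a plain `print` is an acceptable
    # progress callback for the orchestrator). Because of the relative imports
    # above, invoke this module with `python -m <package>.<module>` rather than
    # running the file directly.
    summary_df, all_results = run_auto_suite(
        model_id="gpt2",
        num_steps=50,
        seed=42,
        experiment_name="Dose-Response (Calmness)",
        progress_callback=print,
    )
    print(summary_df.to_string(index=False))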