File size: 6,665 Bytes
c8fa89c
 
 
b350371
 
 
 
c8fa89c
eef89e3
b350371
c8fa89c
 
 
 
 
 
 
 
 
 
 
b350371
 
 
c8fa89c
 
 
 
eef89e3
c8fa89c
 
 
 
 
 
 
 
 
 
 
 
eef89e3
c8fa89c
 
 
 
 
 
 
 
 
 
 
 
eef89e3
b350371
eef89e3
 
b350371
eef89e3
c8fa89c
 
 
 
b350371
c8fa89c
b350371
c8fa89c
 
 
 
 
 
 
 
 
 
 
b350371
 
 
 
 
 
 
 
 
 
 
 
 
c8fa89c
b350371
 
 
 
 
 
 
 
 
 
 
 
 
 
c8fa89c
b350371
 
c8fa89c
b350371
 
 
 
 
c8fa89c
b350371
c8fa89c
b350371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import pandas as pd
import traceback
import sys

# Wichtige Imports für die neuen Pre-Flight Checks
from cognitive_mapping_probe.pre_flight_checks import run_pre_flight_checks
from cognitive_mapping_probe.orchestrator import run_cognitive_titration_experiment
from cognitive_mapping_probe.prompts import RESONANCE_PROMPTS
from cognitive_mapping_probe.utils import dbg

# --- UI theme and layout ---
# Soft orange/amber base theme with custom background, border, and button colors.
_THEME_OVERRIDES = dict(
    body_background_fill="#fdf8f2",
    block_background_fill="white",
    block_border_width="1px",
    block_shadow="*shadow_drop_lg",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)
theme = gr.themes.Soft(primary_hue="orange", secondary_hue="amber").set(**_THEME_OVERRIDES)

# --- Default model ID used for tests and as the UI default ---
DEFAULT_MODEL_ID = "google/gemma-3-1b-it"

# --- Wrapper Functions for Gradio ---

def run_experiment_and_display(
    model_id: str,
    prompt_type: str,
    seed: int,
    concepts_str: str,
    strength_levels_str: str,
    num_steps: int,
    temperature: float,
    progress=gr.Progress(track_tqdm=True)
):
    """
    Run the main titration experiment and format the results for the UI.

    Returns a 3-tuple matching the Gradio outputs wired to `run_btn.click`:
    (markdown summary, details DataFrame, raw results dict). Any unexpected
    exception is caught and rendered as a markdown error panel so the UI
    never crashes.
    """
    try:
        results = run_cognitive_titration_experiment(
            model_id, prompt_type, int(seed), concepts_str, strength_levels_str,
            int(num_steps), float(temperature), progress
        )

        all_runs = results.get("runs", [])

        if not all_runs:
            return "### ⚠️ No Data Generated\nDas Experiment lief durch, aber es wurden keine Datenpunkte erzeugt. Bitte Logs prüfen.", pd.DataFrame(), results

        details_df = pd.DataFrame(all_runs)
        summary_text = "### 💥 Cognitive Breaking Points (CBP)\n"
        summary_text += "Der CBP ist die erste Stärke, bei der das Modell nicht mehr konvergiert (`max_steps_reached`).\n\n"

        # FIX: the original called `.iloc[0]` unconditionally, which raised an
        # IndexError whenever the user removed 0.0 from the strength levels.
        baseline_runs = details_df[details_df['strength'] == 0.0]
        if not baseline_runs.empty:
            baseline_run = baseline_runs.iloc[0]
            if baseline_run['termination_reason'] != 'converged':
                summary_text += "**‼️ ACHTUNG: Baseline (Stärke 0.0) ist nicht konvergiert!**\n"
                summary_text += f"Der gewählte Prompt (`{prompt_type}`) ist für dieses Modell zu anspruchsvoll. Die Ergebnisse sind nicht aussagekräftig.\n\n"

        # Per concept, the CBP is the weakest strength whose run did not converge.
        for concept in details_df['concept'].unique():
            concept_df = details_df[details_df['concept'] == concept].sort_values(by='strength')
            non_converged = concept_df[concept_df['termination_reason'] != 'converged']
            if not non_converged.empty:
                breaking_point_row = non_converged.iloc[0]
                summary_text += f"- **'{concept}'**: 📉 Kollaps bei Stärke **{breaking_point_row['strength']:.2f}**\n"
            else:
                summary_text += f"- **'{concept}'**: ✅ Stabil bis Stärke **{concept_df['strength'].max():.2f}**\n"

        return summary_text, details_df, results

    except Exception:
        # UI boundary: surface the full traceback in the summary panel instead
        # of letting the error disappear into Gradio's internals.
        error_str = traceback.format_exc()
        return f"### ❌ Experiment Failed\nEin unerwarteter Fehler ist aufgetreten:\n\n```\n{error_str}\n```", pd.DataFrame(), {}

# --- Gradio App Definition ---
# Declarative UI layout; component variables are wired to the handler below.
with gr.Blocks(theme=theme, title="Cognitive Breaking Point Probe") as demo:
    gr.Markdown("# 💥 Cognitive Breaking Point Probe")

    # The Diagnostics tab was removed; the UI now only hosts the main experiment.
    gr.Markdown(
        "Misst den 'Cognitive Breaking Point' (CBP) – die Injektionsstärke, bei der der Denkprozess eines LLMs von Konvergenz zu einer Endlosschleife kippt."
    )
    with gr.Row(variant='panel'):
        with gr.Column(scale=1):
            # Left column: all experiment parameters.
            gr.Markdown("### Parameters")
            model_id_input = gr.Textbox(value=DEFAULT_MODEL_ID, label="Model ID")
            prompt_type_input = gr.Radio(
                choices=list(RESONANCE_PROMPTS.keys()),
                value="control_long_prose",
                label="Prompt Type (Cognitive Load)",
                info="Beginne mit 'control_long_prose' für eine stabile Baseline!"
            )
            seed_input = gr.Slider(1, 1000, 42, step=1, label="Global Seed")
            # Free-text lists; parsed downstream by the orchestrator.
            concepts_input = gr.Textbox(value="apple, solitude, fear", label="Concepts (comma-separated)")
            strength_levels_input = gr.Textbox(value="0.0, 0.5, 1.0, 1.5, 2.0", label="Injection Strengths")
            num_steps_input = gr.Slider(50, 500, 250, step=10, label="Max. Internal Steps")
            temperature_input = gr.Slider(0.01, 1.5, 0.7, step=0.01, label="Temperature")
            run_btn = gr.Button("Run Cognitive Titration", variant="primary")

        with gr.Column(scale=2):
            # Right column: markdown summary, tabular details, raw JSON payload.
            gr.Markdown("### Results")
            summary_output = gr.Markdown("Zusammenfassung der Breaking Points erscheint hier.", label="Key Findings Summary")
            details_output = gr.DataFrame(
                headers=["concept", "strength", "responded", "termination_reason", "generated_text"],
                label="Detailed Run Data",
                wrap=True,
            )
            with gr.Accordion("Raw JSON Output", open=False):
                raw_json_output = gr.JSON()

    # Wiring: inputs/outputs must stay in sync with the parameter order and
    # 3-tuple return of run_experiment_and_display.
    run_btn.click(
        fn=run_experiment_and_display,
        inputs=[model_id_input, prompt_type_input, seed_input, concepts_input, strength_levels_input, num_steps_input, temperature_input],
        outputs=[summary_output, details_output, raw_json_output]
    )

# --- Main Execution Block ---
if __name__ == "__main__":
    print("="*80)
    print("🔬 RUNNING PRE-FLIGHT DIAGNOSTICS FOR EXPERIMENTAL APPARATUS")
    print("="*80)

    try:
        # Run the mandatory system self-tests against a real model.
        # If anything fails here, the experimental apparatus is not valid.
        run_pre_flight_checks(model_id=DEFAULT_MODEL_ID, seed=42)

        print("\n" + "="*80)
        print("✅ ALL DIAGNOSTICS PASSED. LAUNCHING GRADIO APP...")
        print("="*80)

        # Launch the Gradio app only when all checks succeeded.
        demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)

    # FIX: the original caught `(AssertionError, Exception)`, which is
    # redundant — AssertionError is a subclass of Exception.
    except Exception as e:
        print("\n" + "="*80)
        print("❌ PRE-FLIGHT DIAGNOSTIC FAILED")
        print("="*80)
        print(f"Error Type: {type(e).__name__}")
        print(f"Error Details: {e}")
        print("\nDie experimentelle Apparatur funktioniert nicht wie erwartet.")
        print("Die Gradio-App wird nicht gestartet, um fehlerhafte Messungen zu verhindern.")
        traceback.print_exc()
        sys.exit(1)  # Exit with a non-zero code to prevent invalid measurements downstream.