import gradio as gr
import pandas as pd
import traceback

from cognitive_mapping_probe.orchestrator import run_cognitive_titration_experiment
from cognitive_mapping_probe.diagnostics import run_diagnostic_suite
from cognitive_mapping_probe.prompts import RESONANCE_PROMPTS

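# Custom warm theme for the Gradio UI.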
theme = gr.themes.Soft(primary_hue="orange", secondary_hue="amber").set(
    body_background_fill="#fdf8f2",
    block_background_fill="white",
    block_border_width="1px",
    block_shadow="*shadow_drop_lg",
    button_primary_background_fill="*primary_500",
    button_primary_text_color="white",
)


def run_experiment_and_display(
    model_id: str,
    prompt_type: str,
    seed: int,
    concepts_str: str,
    strength_levels_str: str,
    num_steps: int,
    temperature: float,
    progress=gr.Progress(track_tqdm=True)
):
    """
    Runs the main titration experiment and formats the results for the UI.

    Returns a markdown summary, a DataFrame of all runs, and the raw result dict.
    """
    try:
        results = run_cognitive_titration_experiment(
            model_id, prompt_type, int(seed), concepts_str, strength_levels_str,
            int(num_steps), float(temperature), progress
        )
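
        # Assumed result shape, based on how it is consumed below: a dict with a
        # "verdict" string and a "runs" list of per-run dicts carrying at least
        # 'concept', 'strength', and 'termination_reason', plus the fields shown
        # in the results table.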

        verdict = results.get("verdict", "Experiment finished with errors.")
        all_runs = results.get("runs", [])

        if not all_runs:
            return "### ⚠️ No Data Generated\nThe experiment completed, but no data points were generated. Please check the logs.", pd.DataFrame(), results

        details_df = pd.DataFrame(all_runs)

        summary_text = "### 💥 Cognitive Breaking Points (CBP)\n"
        summary_text += "The CBP is the first strength at which the model no longer converges (`max_steps_reached`).\n\n"

        # Sanity check: the baseline run (strength 0.0) must converge, otherwise the titration is meaningless.
        baseline_runs = details_df[details_df['strength'] == 0.0]
        if not baseline_runs.empty and baseline_runs.iloc[0]['termination_reason'] != 'converged':
            summary_text += "**‼️ WARNING: The baseline (strength 0.0) did not converge!**\n"
            summary_text += f"The selected prompt (`{prompt_type}`) is too demanding for this model. The titration results are not meaningful.\n\n"

        for concept in details_df['concept'].unique():
            concept_df = details_df[details_df['concept'] == concept].sort_values(by='strength')

            # The breaking point is the lowest strength whose run did not converge.
            non_converged = concept_df[concept_df['termination_reason'] != 'converged']
            if not non_converged.empty:
                breaking_point = non_converged.iloc[0]['strength']
                summary_text += f"- **'{concept}'**: 📉 Collapse at strength **{breaking_point:.2f}**\n"
            else:
                last_strength = concept_df['strength'].max()
                summary_text += f"- **'{concept}'**: ✅ Stable up to strength **{last_strength:.2f}** (no collapse detected)\n"

        return summary_text, details_df, results
    except Exception:
        error_str = traceback.format_exc()
        return f"### ❌ Experiment Failed\nAn unexpected error occurred:\n\n```\n{error_str}\n```", pd.DataFrame(), {}


def run_diagnostics_display(model_id: str, seed: int):
    """
    Runs the diagnostic suite and displays the results or any errors in the UI.
    """
    try:
        result_string = run_diagnostic_suite(model_id, int(seed))
        return f"### ✅ All Diagnostics Passed\nThe experimental apparatus works as expected.\n\n**Details:**\n```\n{result_string}\n```"
    except Exception:
        error_str = traceback.format_exc()
        return f"### ❌ Diagnostic Failed\nA test failed. The experiment is not reliable.\n\n**Error:**\n```\n{error_str}\n```"


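# --- Gradio UI: main titration experiment tab plus a diagnostics tab ---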
with gr.Blocks(theme=theme, title="Cognitive Breaking Point Probe") as demo:
    gr.Markdown("# 💥 Cognitive Breaking Point Probe")

    with gr.Tabs():

        with gr.TabItem("🔬 Main Experiment: Titration"):
            gr.Markdown(
                "Measures the 'Cognitive Breaking Point' (CBP): the injection strength at which an LLM's reasoning process tips from convergence into an endless loop."
            )
            with gr.Row(variant='panel'):
                with gr.Column(scale=1):
                    gr.Markdown("### Parameters")
                    model_id_input = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
                    prompt_type_input = gr.Radio(
                        choices=list(RESONANCE_PROMPTS.keys()),
                        value="control_long_prose",
                        label="Prompt Type (Cognitive Load)",
                        info="Start with 'control_long_prose' for a stable baseline!"
                    )
                    seed_input = gr.Slider(1, 1000, 42, step=1, label="Global Seed")
                    concepts_input = gr.Textbox(value="apple, solitude, fear", label="Concepts (comma-separated)")
                    strength_levels_input = gr.Textbox(value="0.0, 0.5, 1.0, 1.5, 2.0", label="Injection Strengths (Titration Steps)")
                    num_steps_input = gr.Slider(50, 500, 250, step=10, label="Max. Internal Steps")
                    temperature_input = gr.Slider(0.01, 1.5, 0.7, step=0.01, label="Temperature")
                    run_btn = gr.Button("Run Cognitive Titration", variant="primary")

                with gr.Column(scale=2):
                    gr.Markdown("### Results")
                    summary_output = gr.Markdown("A summary of the breaking points will appear here.", label="Key Findings Summary")
                    details_output = gr.DataFrame(
                        headers=["concept", "strength", "responded", "termination_reason", "generated_text"],
                        label="Detailed Run Data",
                        wrap=True,
                        height=400
                    )
                    with gr.Accordion("Raw JSON Output", open=False):
                        raw_json_output = gr.JSON()

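            # Wire the button to the experiment; the outputs map to the markdown summary, the run table, and the raw JSON view.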
            run_btn.click(
                fn=run_experiment_and_display,
                inputs=[model_id_input, prompt_type_input, seed_input, concepts_input, strength_levels_input, num_steps_input, temperature_input],
                outputs=[summary_output, details_output, raw_json_output]
            )

with gr.TabItem("ախ Diagnostics"): |
|
|
gr.Markdown( |
|
|
"Führt eine Reihe von Selbsttests durch, um die mechanische Integrität der experimentellen Apparatur zu validieren. " |
|
|
"**Wichtig:** Dies sollte vor jedem ernsthaften Experiment einmal ausgeführt werden, um sicherzustellen, dass die Ergebnisse zuverlässig sind." |
|
|
) |
|
|
with gr.Row(variant='compact'): |
|
|
diag_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID") |
|
|
diag_seed = gr.Slider(1, 1000, 42, step=1, label="Seed") |
|
|
diag_btn = gr.Button("Run Diagnostic Suite", variant="secondary") |
|
|
diag_output = gr.Markdown(label="Diagnostic Results") |
|
|
diag_btn.click(fn=run_diagnostics_display, inputs=[diag_model_id, diag_seed], outputs=[diag_output]) |
|
|
|
|
|
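
# Bind to all interfaces on Gradio's default port 7860, as typically required when running inside a container or Hugging Face Space.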
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)