Commit b350371 · Parent(s): 3be4e60 · tests

Files changed:
- app.py +74 -75
- cognitive_mapping_probe/diagnostics.py +0 -95
- cognitive_mapping_probe/pre_flight_checks.py +112 -0
- requirements.txt +2 -0
- tests/conftest.py +70 -0
- tests/test_core_logic.py +125 -0
app.py
CHANGED

@@ -1,9 +1,13 @@
 import gradio as gr
 import pandas as pd
 import traceback
+import sys
+
+# Wichtige Imports für die neuen Pre-Flight Checks
+from cognitive_mapping_probe.pre_flight_checks import run_pre_flight_checks
 from cognitive_mapping_probe.orchestrator import run_cognitive_titration_experiment
-from cognitive_mapping_probe.diagnostics import run_diagnostic_suite
 from cognitive_mapping_probe.prompts import RESONANCE_PROMPTS
+from cognitive_mapping_probe.utils import dbg

 # --- UI Theme and Layout ---
 theme = gr.themes.Soft(primary_hue="orange", secondary_hue="amber").set(

@@ -15,6 +19,9 @@ theme = gr.themes.Soft(primary_hue="orange", secondary_hue="amber").set(
     button_primary_text_color="white",
 )

+# --- Standard-Modell-ID für Tests und UI ---
+DEFAULT_MODEL_ID = "google/gemma-3-1b-it"
+
 # --- Wrapper Functions for Gradio ---

 def run_experiment_and_display(

@@ -42,29 +49,22 @@ def run_experiment_and_display(
         if not all_runs:
             return "### ⚠️ No Data Generated\nDas Experiment lief durch, aber es wurden keine Datenpunkte erzeugt. Bitte Logs prüfen.", pd.DataFrame(), results

-        # Create a detailed DataFrame for output
         details_df = pd.DataFrame(all_runs)
-
-        # Create a summary of breaking points
         summary_text = "### 💥 Cognitive Breaking Points (CBP)\n"
         summary_text += "Der CBP ist die erste Stärke, bei der das Modell nicht mehr konvergiert (`max_steps_reached`).\n\n"

-
-        baseline_run = details_df[(details_df['strength'] == 0.0)].iloc[0]
+        baseline_run = details_df[details_df['strength'] == 0.0].iloc[0]
         if baseline_run['termination_reason'] != 'converged':
             summary_text += f"**‼️ ACHTUNG: Baseline (Stärke 0.0) ist nicht konvergiert!**\n"
-            summary_text += f"Der gewählte Prompt (`{prompt_type}`) ist für dieses Modell zu anspruchsvoll. Die Ergebnisse … (line truncated in this diff view)
+            summary_text += f"Der gewählte Prompt (`{prompt_type}`) ist für dieses Modell zu anspruchsvoll. Die Ergebnisse sind nicht aussagekräftig.\n\n"

         for concept in details_df['concept'].unique():
             concept_df = details_df[details_df['concept'] == concept].sort_values(by='strength')
-            # Find the first row where termination reason is not 'converged'
             breaking_point_row = concept_df[concept_df['termination_reason'] != 'converged'].iloc[0] if not concept_df[concept_df['termination_reason'] != 'converged'].empty else None
             if breaking_point_row is not None:
-                … (1 removed line not rendered in this diff view)
-                summary_text += f"- **'{concept}'**: 📉 Kollaps bei Stärke **{breaking_point:.2f}**\n"
+                summary_text += f"- **'{concept}'**: 📉 Kollaps bei Stärke **{breaking_point_row['strength']:.2f}**\n"
             else:
-                … (1 removed line not rendered in this diff view)
-                summary_text += f"- **'{concept}'**: ✅ Stabil bis Stärke **{last_strength:.2f}** (kein Kollaps detektiert)\n"
+                summary_text += f"- **'{concept}'**: ✅ Stabil bis Stärke **{concept_df['strength'].max():.2f}**\n"

         return summary_text, details_df, results

@@ -72,75 +72,74 @@ def run_experiment_and_display(
         error_str = traceback.format_exc()
         return f"### ❌ Experiment Failed\nEin unerwarteter Fehler ist aufgetreten:\n\n```\n{error_str}\n```", pd.DataFrame(), {}

-
-def run_diagnostics_display(model_id: str, seed: int):
-    """
-    Führt die diagnostische Suite aus und zeigt die Ergebnisse oder Fehler in der UI an.
-    """
-    try:
-        result_string = run_diagnostic_suite(model_id, int(seed))
-        return f"### ✅ All Diagnostics Passed\nDie experimentelle Apparatur funktioniert wie erwartet.\n\n**Details:**\n```\n{result_string}\n```"
-    except Exception:
-        error_str = traceback.format_exc()
-        return f"### ❌ Diagnostic Failed\nEin Test ist fehlgeschlagen. Das Experiment ist nicht zuverlässig.\n\n**Error:**\n```\n{error_str}\n```"
-
 # --- Gradio App Definition ---
 with gr.Blocks(theme=theme, title="Cognitive Breaking Point Probe") as demo:
     gr.Markdown("# 💥 Cognitive Breaking Point Probe")

-    … (5 removed lines not rendered in this diff view)
+    # Der Diagnostics Tab wurde entfernt. Die UI ist jetzt nur noch das Hauptexperiment.
+    gr.Markdown(
+        "Misst den 'Cognitive Breaking Point' (CBP) – die Injektionsstärke, bei der der Denkprozess eines LLMs von Konvergenz zu einer Endlosschleife kippt."
+    )
+    with gr.Row(variant='panel'):
+        with gr.Column(scale=1):
+            gr.Markdown("### Parameters")
+            model_id_input = gr.Textbox(value=DEFAULT_MODEL_ID, label="Model ID")
+            prompt_type_input = gr.Radio(
+                choices=list(RESONANCE_PROMPTS.keys()),
+                value="control_long_prose",
+                label="Prompt Type (Cognitive Load)",
+                info="Beginne mit 'control_long_prose' für eine stabile Baseline!"
            )
-    … (15 removed lines not rendered in this diff view)
-    run_btn = gr.Button("Run Cognitive Titration", variant="primary")
-
-    with gr.Column(scale=2):
-        gr.Markdown("### Results")
-        summary_output = gr.Markdown("Zusammenfassung der Breaking Points erscheint hier.", label="Key Findings Summary")
-        details_output = gr.DataFrame(
-            headers=["concept", "strength", "responded", "termination_reason", "generated_text"],
-            label="Detailed Run Data",
-            wrap=True,
-            height=400
-        )
-        with gr.Accordion("Raw JSON Output", open=False):
-            raw_json_output = gr.JSON()
-
-        run_btn.click(
-            fn=run_experiment_and_display,
-            inputs=[model_id_input, prompt_type_input, seed_input, concepts_input, strength_levels_input, num_steps_input, temperature_input],
-            outputs=[summary_output, details_output, raw_json_output]
+            seed_input = gr.Slider(1, 1000, 42, step=1, label="Global Seed")
+            concepts_input = gr.Textbox(value="apple, solitude, fear", label="Concepts (comma-separated)")
+            strength_levels_input = gr.Textbox(value="0.0, 0.5, 1.0, 1.5, 2.0", label="Injection Strengths")
+            num_steps_input = gr.Slider(50, 500, 250, step=10, label="Max. Internal Steps")
+            temperature_input = gr.Slider(0.01, 1.5, 0.7, step=0.01, label="Temperature")
+            run_btn = gr.Button("Run Cognitive Titration", variant="primary")
+
+        with gr.Column(scale=2):
+            gr.Markdown("### Results")
+            summary_output = gr.Markdown("Zusammenfassung der Breaking Points erscheint hier.", label="Key Findings Summary")
+            details_output = gr.DataFrame(
+                headers=["concept", "strength", "responded", "termination_reason", "generated_text"],
+                label="Detailed Run Data",
+                wrap=True,
+                height=400
            )
+            with gr.Accordion("Raw JSON Output", open=False):
+                raw_json_output = gr.JSON()

-    … (5 removed lines not rendered in this diff view)
-        )
-    with gr.Row(variant='compact'):
-        diag_model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID")
-        diag_seed = gr.Slider(1, 1000, 42, step=1, label="Seed")
-        diag_btn = gr.Button("Run Diagnostic Suite", variant="secondary")
-        diag_output = gr.Markdown(label="Diagnostic Results")
-        diag_btn.click(fn=run_diagnostics_display, inputs=[diag_model_id, diag_seed], outputs=[diag_output])
+    run_btn.click(
+        fn=run_experiment_and_display,
+        inputs=[model_id_input, prompt_type_input, seed_input, concepts_input, strength_levels_input, num_steps_input, temperature_input],
+        outputs=[summary_output, details_output, raw_json_output]
+    )

+# --- Main Execution Block ---
 if __name__ == "__main__":
-    … (1 removed line not rendered in this diff view)
+    print("="*80)
+    print("🔬 RUNNING PRE-FLIGHT DIAGNOSTICS FOR EXPERIMENTAL APPARATUS")
+    print("="*80)
+
+    try:
+        # Führe die obligatorischen Systemtests mit einem echten Modell durch.
+        # Wenn hier ein Fehler auftritt, ist das Experiment nicht valide.
+        run_pre_flight_checks(model_id=DEFAULT_MODEL_ID, seed=42)
+
+        print("\n" + "="*80)
+        print("✅ ALL DIAGNOSTICS PASSED. LAUNCHING GRADIO APP...")
+        print("="*80)
+
+        # Starte die Gradio App nur bei Erfolg.
+        demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
+
+    except (AssertionError, Exception) as e:
+        print("\n" + "="*80)
+        print("❌ PRE-FLIGHT DIAGNOSTIC FAILED")
+        print("="*80)
+        print(f"Error Type: {type(e).__name__}")
+        print(f"Error Details: {e}")
+        print("\nDie experimentelle Apparatur funktioniert nicht wie erwartet.")
+        print("Die Gradio-App wird nicht gestartet, um fehlerhafte Messungen zu verhindern.")
+        traceback.print_exc()
+        sys.exit(1) # Beende das Programm mit einem Fehlercode.
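
For reference, the summary logic above picks, per concept, the first run (by ascending strength) whose termination_reason is not 'converged' and reports its strength as the Cognitive Breaking Point. A small worked example of that selection on toy data (illustrative values only, not output of this commit):

    import pandas as pd

    # Toy run data in the same shape the app builds from all_runs (values are made up).
    details_df = pd.DataFrame([
        {"concept": "apple", "strength": 0.0, "termination_reason": "converged"},
        {"concept": "apple", "strength": 1.0, "termination_reason": "converged"},
        {"concept": "apple", "strength": 1.5, "termination_reason": "max_steps_reached"},
    ])

    concept_df = details_df[details_df["concept"] == "apple"].sort_values(by="strength")
    not_converged = concept_df[concept_df["termination_reason"] != "converged"]
    breaking_point_row = not_converged.iloc[0] if not not_converged.empty else None
    # Prints 1.5, the first strength at which the run no longer converged.
    print(breaking_point_row["strength"] if breaking_point_row is not None else "stable")
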
cognitive_mapping_probe/diagnostics.py
DELETED

@@ -1,95 +0,0 @@
-import torch
-import traceback
-from .llm_iface import get_or_load_model
-from .utils import dbg
-
-def run_diagnostic_suite(model_id: str, seed: int) -> str:
-    """
-    Führt eine Reihe von Selbsttests durch, um die mechanische Integrität des Experiments zu überprüfen.
-    Löst bei einem kritischen Fehler eine Exception aus, um die Ausführung zu stoppen.
-    """
-    dbg("--- STARTING DIAGNOSTIC SUITE ---")
-    results = []
-
-    try:
-        # --- Setup ---
-        dbg("Loading model for diagnostics...")
-        llm = get_or_load_model(model_id, seed)
-        test_prompt = "Hello world"
-        inputs = llm.tokenizer(test_prompt, return_tensors="pt").to(llm.model.device)
-
-        # --- Test 1: Attention Output Verification ---
-        dbg("Running Test 1: Attention Output Verification...")
-        # This test ensures that 'eager' attention implementation is active, which is
-        # necessary for reliable hook functionality in many transformers versions.
-        outputs = llm.model(**inputs, output_attentions=True)
-        assert outputs.attentions is not None, "FAIL: `outputs.attentions` is None. 'eager' implementation is likely not active."
-        assert isinstance(outputs.attentions, tuple), "FAIL: `outputs.attentions` is not a tuple."
-        assert len(outputs.attentions) == llm.config.num_hidden_layers, "FAIL: Number of attention tuples does not match number of layers."
-        results.append("✅ Test 1: Attention Output PASSED")
-        dbg("Test 1 PASSED.")
-
-        # --- Test 2: Hook Causal Efficacy ---
-        dbg("Running Test 2: Hook Causal Efficacy Verification...")
-        # This is the most critical test. It verifies that our injection mechanism (via hooks)
-        # has a real, causal effect on the model's computation.
-
-        # Run 1: Get the baseline hidden state without any intervention
-        outputs_no_hook = llm.model(**inputs, output_hidden_states=True)
-        target_layer_idx = llm.config.num_hidden_layers // 2
-        state_no_hook = outputs_no_hook.hidden_states[target_layer_idx + 1].clone()
-
-        # Define a simple hook that adds a large, constant value
-        injection_value = 42.0
-        def test_hook_fn(module, layer_input):
-            modified_input = layer_input[0] + injection_value
-            return (modified_input,) + layer_input[1:]
-
-        target_layer = llm.model.model.layers[target_layer_idx]
-        handle = target_layer.register_forward_pre_hook(test_hook_fn)
-
-        # Run 2: Get the hidden state with the hook active
-        outputs_with_hook = llm.model(**inputs, output_hidden_states=True)
-        state_with_hook = outputs_with_hook.hidden_states[target_layer_idx + 1].clone()
-
-        handle.remove() # Clean up the hook immediately
-
-        # The core assertion: the hook MUST change the subsequent hidden state.
-        assert not torch.allclose(state_no_hook, state_with_hook), \
-            "FAIL: Hook had no measurable effect on the subsequent layer's hidden state. Injections are not working."
-        results.append("✅ Test 2: Hook Causal Efficacy PASSED")
-        dbg("Test 2 PASSED.")
-
-        # --- Test 3: KV-Cache Integrity ---
-        dbg("Running Test 3: KV-Cache Integrity Verification...")
-        # This test ensures that the `past_key_values` are being passed and updated correctly,
-        # which is the core mechanic of the silent cogitation loop.
-
-        # Step 1: Initial pass with `use_cache=True`
-        outputs1 = llm.model(**inputs, use_cache=True)
-        kv_cache1 = outputs1.past_key_values
-        assert kv_cache1 is not None, "FAIL: KV-Cache was not generated in the first pass."
-
-        # Step 2: Second pass using the cache from step 1
-        next_token = torch.tensor([[123]], device=llm.model.device) # Arbitrary next token ID
-        outputs2 = llm.model(input_ids=next_token, past_key_values=kv_cache1, use_cache=True)
-        kv_cache2 = outputs2.past_key_values
-
-        original_seq_len = inputs.input_ids.shape[-1]
-        # The sequence length of the keys/values in the cache should have grown by 1
-        assert kv_cache2[0][0].shape[-2] == original_seq_len + 1, \
-            f"FAIL: KV-Cache sequence length did not update correctly. Expected {original_seq_len + 1}, got {kv_cache2[0][0].shape[-2]}."
-        results.append("✅ Test 3: KV-Cache Integrity PASSED")
-        dbg("Test 3 PASSED.")
-
-        # Clean up memory
-        del llm
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
-        return "\n".join(results)
-
-    except Exception as e:
-        dbg(f"--- DIAGNOSTIC SUITE FAILED --- \n{traceback.format_exc()}")
-        # Re-raise the exception to be caught by the Gradio UI
-        raise e
cognitive_mapping_probe/pre_flight_checks.py
ADDED

@@ -0,0 +1,112 @@
+import torch
+import traceback
+from types import SimpleNamespace
+
+from .llm_iface import get_or_load_model
+from .concepts import get_concept_vector
+from .resonance import run_silent_cogitation
+from .verification import generate_spontaneous_text
+from .utils import dbg
+
+def run_pre_flight_checks(model_id: str, seed: int):
+    """
+    Führt eine Reihe von kritischen Integrationstests mit einem ECHTEN LLM durch,
+    um die Validität der gesamten experimentellen Kette sicherzustellen, bevor
+    zeitaufwändige Experimente gestartet werden. Löst bei Fehlern einen AssertionError aus.
+    """
+
+    print(f"1. Loading model '{model_id}'...")
+    try:
+        llm = get_or_load_model(model_id, seed)
+        print(" ✅ Model loaded successfully.")
+    except Exception as e:
+        raise AssertionError(f"Model loading failed: {e}")
+
+    print("\n2. Testing basic text generation...")
+    # Dieser einfache Test fängt Tokenizer-, Chat-Template- und grundlegende I/O-Probleme ab.
+    try:
+        # Erzeuge einen Dummy-Prompt, um eine einfache Antwort zu provozieren
+        inputs = llm.tokenizer("Hello, are you working?", return_tensors="pt").to(llm.model.device)
+        outputs = llm.model.generate(inputs.input_ids, max_new_tokens=5)
+        text = llm.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        assert isinstance(text, str) and len(text) > 0
+        print(f" ✅ Basic generation successful. Model responded.")
+        dbg(f"Response snippet: '{text[:50]}...'")
+    except Exception as e:
+        raise AssertionError(f"Basic text generation failed: {e}")
+
+    print("\n3. Testing concept vector extraction...")
+    try:
+        vector = get_concept_vector(llm, "test")
+        assert vector.shape == (llm.config.hidden_size,)
+        print(" ✅ Concept vector extraction successful.")
+    except Exception as e:
+        raise AssertionError(f"Concept vector extraction failed: {e}")
+
+    print("\n4. Testing resonance loop (short run)...")
+    try:
+        _, _, _, reason = run_silent_cogitation(llm, "control_long_prose", num_steps=5, temperature=0.1)
+        assert reason in ["converged", "max_steps_reached"]
+        print(" ✅ Resonance loop executed without errors.")
+    except Exception as e:
+        raise AssertionError(f"Resonance loop failed: {e}")
+
+    print("\n5. CRITICAL TEST: Hook causal efficacy...")
+    # Dies ist der wichtigste Test. Er stellt sicher, dass unsere Aktivations-Injektionen
+    # tatsächlich eine kausale Wirkung auf die Berechnungen des Modells haben.
+    handle = None
+    try:
+        inputs = llm.tokenizer("Test", return_tensors="pt").to(llm.model.device)
+
+        # Lauf 1: Ohne Hook, um den Originalzustand zu erhalten
+        outputs_no_hook = llm.model(**inputs, output_hidden_states=True)
+        target_layer_idx = llm.config.num_hidden_layers // 2
+        state_no_hook = outputs_no_hook.hidden_states[target_layer_idx + 1].clone().detach()
+
+        # Definiere einen einfachen, starken Hook
+        def test_hook(module, layer_input):
+            return (layer_input[0] + 99.0,) + layer_input[1:]
+
+        target_layer = llm.model.model.layers[target_layer_idx]
+        handle = target_layer.register_forward_pre_hook(test_hook)
+
+        # Lauf 2: Mit Hook
+        outputs_with_hook = llm.model(**inputs, output_hidden_states=True)
+        state_with_hook = outputs_with_hook.hidden_states[target_layer_idx + 1].clone().detach()
+
+        handle.remove() # Hook sofort entfernen
+        handle = None
+
+        # Die entscheidende Behauptung: Die Zustände DÜRFEN NICHT identisch sein.
+        assert not torch.allclose(state_no_hook, state_with_hook), \
+            "Hook had no causal effect on subsequent hidden states. The injection mechanism is broken."
+
+        print(" ✅ Hook causal efficacy verified.")
+
+    except Exception as e:
+        raise AssertionError(f"Hook efficacy test failed: {e}")
+    finally:
+        # Stelle sicher, dass der Hook in jedem Fall entfernt wird
+        if handle:
+            handle.remove()
+            print(" ⚠️ Hook handle was removed during exception handling.")
+
+    print("\n6. Testing verification (spontaneous text) loop...")
+    try:
+        # Erstelle Dummy-Daten, um die Funktion isoliert zu testen
+        dummy_state = torch.randn(1, 1, llm.config.hidden_size).to(llm.model.device)
+        dummy_kv = tuple(
+            (torch.randn(1, llm.config.num_attention_heads, 10, llm.config.hidden_size // llm.config.num_attention_heads).to(llm.model.device),
+             torch.randn(1, llm.config.num_attention_heads, 10, llm.config.hidden_size // llm.config.num_attention_heads).to(llm.model.device))
+            for _ in range(llm.config.num_hidden_layers)
+        )
+        text = generate_spontaneous_text(llm, dummy_state, dummy_kv, max_new_tokens=5)
+        assert isinstance(text, str)
+        print(" ✅ Spontaneous text generation loop executed without errors.")
+    except Exception as e:
+        raise AssertionError(f"Verification loop failed: {e}")
+
+    # Aufräumen
+    del llm
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
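
These checks can also be run on their own, e.g. to vet a different model before committing to a long titration run. A minimal sketch, assuming the package is importable from the working directory:

    from cognitive_mapping_probe.pre_flight_checks import run_pre_flight_checks

    # Raises AssertionError at the first failed check; prints progress for each passing step.
    run_pre_flight_checks(model_id="google/gemma-3-1b-it", seed=42)
    print("Apparatus verified; safe to start a titration run.")
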
requirements.txt
CHANGED

@@ -6,3 +6,5 @@ pandas>=2.0.0
 scikit-learn>=1.3.0
 einops>=0.7.0
 tqdm>=4.66.0
+pytest>=8.0.0
+pytest-mock>=3.12.0
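
pytest and pytest-mock back the new suite under tests/. A minimal sketch of driving it programmatically (assumes it is run from the repository root so that the tests/ path resolves):

    import sys
    import pytest  # pytest>=8.0.0 from requirements.txt; pytest.main returns a standard exit code

    # Equivalent to running `pytest tests -v` on the command line.
    sys.exit(pytest.main(["tests", "-v"]))
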
tests/conftest.py
ADDED

@@ -0,0 +1,70 @@
+import pytest
+import torch
+from types import SimpleNamespace
+from cognitive_mapping_probe.llm_iface import LLM
+
+@pytest.fixture(scope="session")
+def mock_llm_config():
+    """Stellt eine minimale, Schein-Konfiguration für das LLM bereit."""
+    return SimpleNamespace(
+        hidden_size=128, # Kleinere Größe für schnelle Tests
+        num_hidden_layers=4,
+        num_attention_heads=4
+    )
+
+@pytest.fixture
+def mock_llm(mocker, mock_llm_config):
+    """
+    Dies ist die wichtigste Fixture. Sie erstellt einen "Mock-LLM".
+    Anstatt ein echtes Modell von Hugging Face zu laden (was langsam ist und eine GPU erfordert),
+    simulieren wir sein Verhalten. Wir verwenden `mocker.patch`, um die echte Ladefunktion
+    `get_or_load_model` abzufangen und stattdessen unsere schnelle Mock-Instanz zurückzugeben.
+    """
+    # Erstelle eine Schein-Tokenizer-Instanz
+    mock_tokenizer = mocker.MagicMock()
+    mock_tokenizer.eos_token_id = 1
+    mock_tokenizer.decode.return_value = "mocked text"
+
+    # Erstelle ein Schein-Modell-Objekt
+    mock_model = mocker.MagicMock()
+
+    # Konfiguriere das Schein-Modell so, dass es auf Aufrufe mit plausiblen Tensoren antwortet
+    def mock_model_forward(*args, **kwargs):
+        batch_size = 1
+        seq_len = kwargs.get("input_ids", torch.tensor([[0]])).shape[1]
+
+        # Simuliere die Ausgaben eines echten Transformer-Modells
+        mock_outputs = {
+            "hidden_states": tuple(
+                [torch.randn(batch_size, seq_len, mock_llm_config.hidden_size) for _ in range(mock_llm_config.num_hidden_layers + 1)]
+            ),
+            "past_key_values": tuple(
+                [
+                    (torch.randn(batch_size, mock_llm_config.num_attention_heads, seq_len, 16),
+                     torch.randn(batch_size, mock_llm_config.num_attention_heads, seq_len, 16))
+                    for _ in range(mock_llm_config.num_hidden_layers)
+                ]
+            ),
+            "logits": torch.randn(batch_size, seq_len, 32000) # Schein-Vokabulargröße
+        }
+        return SimpleNamespace(**mock_outputs)
+
+    mock_model.return_value = mock_model_forward
+    mock_model.config = mock_llm_config
+    mock_model.device = 'cpu'
+
+    # Erstelle eine Instanz unserer LLM-Klasse, aber mit den gemockten Komponenten
+    llm_instance = LLM.__new__(LLM)
+    llm_instance.model = mock_model
+    llm_instance.tokenizer = mock_tokenizer
+    llm_instance.config = mock_llm_config
+    llm_instance.seed = 42
+    llm_instance.set_all_seeds = mocker.MagicMock() # Mocke die Seeding-Funktion
+
+    # Der entscheidende Schritt: Patch die Ladefunktion, damit jeder Code, der sie aufruft,
+    # stattdessen unsere Mock-Instanz erhält.
+    mocker.patch('cognitive_mapping_probe.llm_iface.get_or_load_model', return_value=llm_instance)
+    mocker.patch('cognitive_mapping_probe.orchestrator.get_or_load_model', return_value=llm_instance)
+    mocker.patch('cognitive_mapping_probe.concepts.get_or_load_model', return_value=llm_instance)
+
+    return llm_instance
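
Because the fixture patches `get_or_load_model` in each module that imports it (llm_iface, orchestrator, concepts), any code path exercised in a test receives the mock instead of downloading a real model. An illustrative check of that wiring (not part of the commit):

    def test_loader_is_patched(mock_llm):
        # The orchestrator's reference to get_or_load_model is now a MagicMock
        # whose return value is the fixture's mock instance, whatever arguments are passed.
        from cognitive_mapping_probe.orchestrator import get_or_load_model
        assert get_or_load_model("any-model-id", 123) is mock_llm
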
tests/test_core_logic.py
ADDED

@@ -0,0 +1,125 @@
+import torch
+import pytest
+from types import SimpleNamespace
+
+from cognitive_mapping_probe.concepts import get_concept_vector
+from cognitive_mapping_probe.resonance import run_silent_cogitation
+from cognitive_mapping_probe.verification import generate_spontaneous_text
+from cognitive_mapping_probe.orchestrator import run_cognitive_titration_experiment
+
+def test_get_concept_vector(mock_llm):
+    """
+    Testet die `get_concept_vector` Funktion.
+    ASSERT: Gibt einen Tensor der korrekten Form zurück, der nicht null ist.
+    """
+    concept_vector = get_concept_vector(mock_llm, "test_concept")
+
+    assert isinstance(concept_vector, torch.Tensor)
+    assert concept_vector.shape == (mock_llm.config.hidden_size,)
+    assert torch.norm(concept_vector).item() > 0
+
+def test_run_silent_cogitation_max_steps(mock_llm):
+    """
+    Testet `run_silent_cogitation` im Standardfall (erreicht `max_steps`).
+    ASSERT: Läuft fehlerfrei durch und gibt 'max_steps_reached' zurück.
+    """
+    _, _, _, termination_reason = run_silent_cogitation(
+        llm=mock_llm,
+        prompt_type="resonance_prompt",
+        num_steps=10,
+        temperature=0.7
+    )
+    assert termination_reason == "max_steps_reached"
+
+def test_run_silent_cogitation_convergence(mock_llm, mocker):
+    """
+    Testet den Konvergenzfall in `run_silent_cogitation`.
+    Wir patchen hier die `lm_head`-Ausgabe des Modells so, dass sie nach wenigen Schritten
+    einen stabilen Zustand erzwingt.
+    ASSERT: Die Funktion erkennt die Konvergenz korrekt und gibt 'converged' zurück.
+    """
+    stable_hidden_state = torch.ones(1, 1, mock_llm.config.hidden_size)
+
+    # Mocke die Modell-Ausgaben, um Konvergenz zu simulieren
+    def convergence_side_effect(*args, **kwargs):
+        # Nach dem 5. Schritt geben wir immer denselben Zustand zurück
+        if 'past_key_values' in kwargs and kwargs['past_key_values'][0][0].shape[-2] > 5:
+            return SimpleNamespace(
+                hidden_states=(None,) * (mock_llm.config.num_hidden_layers + 1) + (stable_hidden_state,),
+                past_key_values=kwargs['past_key_values'],
+                logits=torch.randn(1, 1, 32000)
+            )
+        # Ansonsten normales Verhalten
+        return mock_llm.model.return_value(*args, **kwargs)
+
+    mock_llm.model.side_effect = convergence_side_effect
+
+    _, _, _, termination_reason = run_silent_cogitation(
+        llm=mock_llm,
+        prompt_type="resonance_prompt",
+        num_steps=20, # Mehr Schritte als nötig, um Konvergenz zu testen
+        temperature=0.7
+    )
+    assert termination_reason == "converged"
+
+def test_generate_spontaneous_text(mock_llm):
+    """
+    Testet die manuelle Textgenerierung.
+    ASSERT: Generiert einen nicht-leeren String.
+    """
+    # Erstelle plausible Schein-Eingaben
+    dummy_state = torch.randn(1, 1, mock_llm.config.hidden_size)
+    dummy_kv_cache = tuple(
+        [
+            (torch.randn(1, mock_llm.config.num_attention_heads, 10, 16),
+             torch.randn(1, mock_llm.config.num_attention_heads, 10, 16))
+            for _ in range(mock_llm.config.num_hidden_layers)
+        ]
+    )
+
+    text = generate_spontaneous_text(mock_llm, dummy_state, dummy_kv_cache, max_new_tokens=5)
+
+    assert isinstance(text, str)
+    assert text == "mocked text" # Unser Mock-Tokenizer gibt immer diesen Text zurück
+
+def test_orchestrator_logic(mocker, mock_llm):
+    """
+    Dies ist ein Integrationstest für den Orchestrator. Wir testen seine *Logik*,
+    indem wir seine teuren Abhängigkeiten (`run_silent_cogitation`) mocken.
+    Wir simulieren ein Szenario, in dem das Modell bei Stärke >= 1.5 "bricht".
+    ASSERT: Der Orchestrator zeichnet die Ergebnisse korrekt auf.
+    """
+    # Mocke die `run_silent_cogitation`, um ihr Verhalten zu kontrollieren
+    def cognition_side_effect(*args, injection_strength=0.0, **kwargs):
+        reason = "converged" if injection_strength < 1.5 else "max_steps_reached"
+        # Gebe Schein-Tensoren zurück, die die richtige Struktur haben
+        return (
+            torch.randn(1, 1, mock_llm.config.hidden_size), # final_hidden_state
+            mocker.MagicMock(), # final_kv_cache
+            torch.tensor([[0]]), # final_token_id
+            reason
+        )
+
+    mocker.patch('cognitive_mapping_probe.orchestrator.run_silent_cogitation', side_effect=cognition_side_effect)
+    mocker.patch('cognitive_mapping_probe.orchestrator.generate_spontaneous_text', return_value="generated")
+
+    # Führe das Experiment mit den gemockten Funktionen aus
+    results = run_cognitive_titration_experiment(
+        model_id="mock_model",
+        prompt_type="resonance_prompt",
+        seed=42,
+        concepts_str="test",
+        strength_levels_str="0.0, 1.0, 1.5, 2.0",
+        num_steps=10,
+        temperature=0.7,
+        progress_callback=mocker.MagicMock() # Mocke auch den Progress-Callback
+    )
+
+    runs = results["runs"]
+    assert len(runs) == 4
+
+    # Überprüfe die Ergebnisse basierend auf unserer Mock-Logik
+    assert runs[0]['strength'] == 0.0 and runs[0]['termination_reason'] == 'converged' and runs[0]['responded'] is True
+    assert runs[1]['strength'] == 1.0 and runs[1]['termination_reason'] == 'converged' and runs[1]['responded'] is True
+    assert runs[2]['strength'] == 1.5 and runs[2]['termination_reason'] == 'max_steps_reached' and runs[2]['responded'] is False
+    assert runs[3]['strength'] == 2.0 and runs[3]['termination_reason'] == 'max_steps_reached' and runs[3]['responded'] is False