#!/usr/bin/env python3
"""
GAIA Runner - Minimal LLM judge with function calling
Implements 0-5 scoring system with unit/format awareness.
"""

import json
import re
import time
from typing import Tuple, Dict, Any
from ck_pro.core import CognitiveKernel
from ck_pro.agents.model import LLM


def judge_answer(kernel: CognitiveKernel, question: str, answer: str, ground_truth: str) -> Tuple[int, str]:
    """
    Judge a model answer against the ground truth using an LLM with function calling.
    
    Args:
        kernel: CognitiveKernel instance (for accessing model config)
        question: Original task question
        answer: Model's answer to evaluate
        ground_truth: Expected correct answer
        
    Returns:
        Tuple of (score: int 0-5, reason: str)
    """
    # Handle edge cases
    if not ground_truth or not ground_truth.strip():
        return 0, 'empty-ground-truth'
    
    if not answer or not str(answer).strip():
        return 0, 'empty-answer'
    
    # Create LLM instance using same config as kernel
    cfg = kernel.settings.ck.model
    judge_llm = LLM(
        call_target=cfg.call_target,
        api_key=cfg.api_key,
        model=cfg.model,
        extract_body=cfg.extract_body.copy()  # Start with base config
    )
    
    # Prepare judge prompt
    system_prompt = (
        "You are a strict evaluator. Use the provided function `grade(score:int, reason:str)` "
        "to score the model answer against the ground truth. Consider units, conversions, "
        "and format requirements carefully."
    )
    
    user_prompt = f"""Task: {question}

Expected Answer (ground truth): {ground_truth}
Model Answer: {answer}

Guidelines:
- Score from 0 to 5 (integers only), where 5 = fully correct and compliant; 0 = wrong/irrelevant.
- Pay special attention to units, numerical conversions, and precision (e.g., "17" does not match "17,000 hours" when the question asks for hours, even if a source states the figure as "17 thousand").
- Enforce format requirements explicitly stated in the query (unit, casing, output schema, etc.).
- Penalize partial or ambiguous answers accordingly.

Use the provided function `grade(score:int, reason:str)` to return the result; do NOT output free text."""

    # Function calling schema for grading
    function_schema = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "grade",
                    "description": "Return a 0-5 integer score and a brief justification.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "score": {"type": "integer", "minimum": 0, "maximum": 5},
                            "reason": {"type": "string"}
                        },
                        "required": ["score", "reason"]
                    }
                }
            }
        ],
        "tool_choice": {"type": "function", "function": {"name": "grade"}}
    }
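    # Note: the parsing below assumes the LLM client returns the tool call's
    # arguments as a JSON string, e.g. '{"score": 4, "reason": "units match"}'.
    # If the client returned the full chat-completion envelope instead, the
    # json.loads() step would need to unwrap message.tool_calls first.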
    
    # Prepare messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    
    try:
        # Call LLM with function calling
        response = judge_llm(messages, extract_body=function_schema)
        
        # Parse the response (should be JSON from function call)
        try:
            result = json.loads(response)
            score = int(result.get('score', 0))
            reason = str(result.get('reason', '')).strip()
            
            # Validate score range
            score = max(0, min(5, score))
            
            # Ensure reason is not empty
            if not reason:
                reason = 'llm-judge-no-reason'
                
            return score, reason
            
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            # Fallback: try to extract score from text response
            return _fallback_parse_score(response), f'parse-error: {str(e)}'
            
    except Exception as e:
        # Last resort: exact match fallback
        if str(answer).strip().lower() == str(ground_truth).strip().lower():
            return 5, 'fallback-exact-match'
        return 0, f'judge-failed: {str(e)}'


def _fallback_parse_score(response: str) -> int:
    """
    Fallback parser to extract score from text response
    
    Args:
        response: Raw LLM response text
        
    Returns:
        Extracted score (0-5)
    """
    
    # Try to find score patterns
    patterns = [
        r'"score":\s*(\d+)',
        r'score:\s*(\d+)',
        r'Score:\s*(\d+)',
        r'(\d+)/5',
        r'(\d+)\s*out\s*of\s*5'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, response)
        if match:
            try:
                score = int(match.group(1))
                return max(0, min(5, score))
            except ValueError:
                continue
    
    # No score found
    return 0
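
# Illustrative inputs for the fallback parser (hypothetical responses):
#   _fallback_parse_score('{"score": 4, "reason": "close"}')  -> 4
#   _fallback_parse_score('Score: 3, partially correct')      -> 3
#   _fallback_parse_score('I rate this 7/5')                  -> 5 (clamped)
#   _fallback_parse_score('no numeric score here')            -> 0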


def run_single_task(kernel: CognitiveKernel, task: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run a single task through the complete pipeline
    
    Args:
        kernel: CognitiveKernel instance
        task: Task dictionary from data_loader
        
    Returns:
        Result dictionary with all fields
    """
    start_time = time.time()
    
    try:
        # Execute reasoning via CognitiveKernel
        result = kernel.reason(task['question'])
        
        if not result.success:
            # Record the failure without judging the missing answer
            return {
                **task,
                'answer': None,
                'success': False,
                'error': f'kernel-failed: {getattr(result, "error", "unknown")}',
                'execution_time': time.time() - start_time,
                'reasoning_steps': 0,
                'score': 0,
                'judge_reason': 'execution-failed'
            }
        
        # Judge the answer
        score, judge_reason = judge_answer(
            kernel, 
            task['question'], 
            result.answer, 
            task['ground_truth']
        )
        
        return {
            **task,
            'answer': result.answer,
            'success': True,
            'error': None,
            'execution_time': time.time() - start_time,
            'reasoning_steps': result.reasoning_steps,
            'score': score,
            'judge_reason': judge_reason
        }
        
    except Exception as e:
        return {
            **task,
            'answer': None,
            'success': False,
            'error': f'unexpected-error: {str(e)}',
            'execution_time': time.time() - start_time,
            'reasoning_steps': 0,
            'score': 0,
            'judge_reason': 'execution-failed'
        }
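

# Usage sketch (illustrative). Assumptions: CognitiveKernel() can be
# constructed with default settings, and tasks are dicts carrying at least
# 'question' and 'ground_truth' (the keys consumed by run_single_task above);
# the task values shown below are hypothetical.
if __name__ == "__main__":
    # Self-contained smoke test of the fallback parser (no LLM calls needed).
    assert _fallback_parse_score('{"score": 4, "reason": "ok"}') == 4
    assert _fallback_parse_score('Score: 3') == 3
    assert _fallback_parse_score('nothing useful') == 0

    # Hypothetical end-to-end run; requires a configured CognitiveKernel.
    # kernel = CognitiveKernel()
    # task = {'question': 'What is the capital of France?', 'ground_truth': 'Paris'}
    # result = run_single_task(kernel, task)
    # print(json.dumps({k: result[k] for k in ('answer', 'score', 'judge_reason')}, indent=2))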