#!/usr/bin/env python3
"""
GAIA Runner - Minimal LLM judge with function calling.
Implements a 0-5 scoring system with unit/format awareness.
"""
import json
import time
from typing import Tuple, Dict, Any

from ck_pro.core import CognitiveKernel
from ck_pro.agents.model import LLM

def judge_answer(kernel: CognitiveKernel, question: str, answer: str, ground_truth: str) -> Tuple[int, str]:
    """
    Judge a model answer against the ground truth using an LLM with function calling.

    Args:
        kernel: CognitiveKernel instance (for accessing model config)
        question: Original task question
        answer: Model's answer to evaluate
        ground_truth: Expected correct answer

    Returns:
        Tuple of (score: int 0-5, reason: str)
    """
    # Handle edge cases
    if not ground_truth or not ground_truth.strip():
        return 0, 'empty-ground-truth'
    if not answer or not str(answer).strip():
        return 0, 'empty-answer'

    # Create an LLM instance using the same config as the kernel
    cfg = kernel.settings.ck.model
    judge_llm = LLM(
        call_target=cfg.call_target,
        api_key=cfg.api_key,
        model=cfg.model,
        extract_body=cfg.extract_body.copy()  # Start with the base config
    )

    # Prepare the judge prompt
    system_prompt = (
        "You are a strict evaluator. Use the provided function `grade(score:int, reason:str)` "
        "to score the model answer against the ground truth. Consider units, conversions, "
        "and format requirements carefully."
    )
    user_prompt = f"""Task: {question}
Expected Answer (ground truth): {ground_truth}
Model Answer: {answer}
Guidelines:
- Score from 0 to 5 (integers only), where 5 = fully correct and compliant; 0 = wrong/irrelevant.
- Pay special attention to units, numerical conversions, and precision (e.g., 17 thousand hours ≠ 17000 hours if context implies unit mismatch).
- Enforce format requirements explicitly stated in the query (unit, casing, output schema, etc.).
- Penalize partial or ambiguous answers accordingly.
Use the provided function `grade(score:int, reason:str)` to return the result; do NOT output free text."""

    # Function-calling schema for grading
    function_schema = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "grade",
                    "description": "Return a 0-5 integer score and a brief justification.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "score": {"type": "integer", "minimum": 0, "maximum": 5},
                            "reason": {"type": "string"}
                        },
                        "required": ["score", "reason"]
                    }
                }
            }
        ],
        "tool_choice": {"type": "function", "function": {"name": "grade"}}
    }

    # Prepare messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        # Call the LLM with function calling
        response = judge_llm(messages, extract_body=function_schema)

        # Parse the response (should be JSON from the function call)
        try:
            result = json.loads(response)
            score = int(result.get('score', 0))
            reason = str(result.get('reason', '')).strip()

            # Clamp the score to the valid 0-5 range
            score = max(0, min(5, score))

            # Ensure the reason is not empty
            if not reason:
                reason = 'llm-judge-no-reason'

            return score, reason
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            # Fallback: try to extract a score from the text response
            return _fallback_parse_score(response), f'parse-error: {str(e)}'
    except Exception as e:
        # Last resort: exact-match fallback
        if str(answer).strip().lower() == str(ground_truth).strip().lower():
            return 5, 'fallback-exact-match'
        return 0, f'judge-failed: {str(e)}'
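
# Illustrative (assumed) shape of the arguments returned by the forced `grade`
# tool call, which judge_answer parses with json.loads; the exact wire format
# depends on how LLM.__call__ surfaces tool-call arguments:
#   {"score": 4, "reason": "Correct value but missing the requested unit."}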

def _fallback_parse_score(response: str) -> int:
    """
    Fallback parser to extract a score from a free-text response.

    Args:
        response: Raw LLM response text

    Returns:
        Extracted score (0-5)
    """
    import re

    # Try common score patterns
    patterns = [
        r'"score":\s*(\d+)',
        r'score:\s*(\d+)',
        r'Score:\s*(\d+)',
        r'(\d+)/5',
        r'(\d+)\s*out\s*of\s*5'
    ]
    for pattern in patterns:
        match = re.search(pattern, response)
        if match:
            try:
                score = int(match.group(1))
                return max(0, min(5, score))
            except ValueError:
                continue

    # No score found
    return 0
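
# Illustrative fallback behaviour on assumed inputs (not exhaustive):
#   _fallback_parse_score('Score: 4, mostly correct')  -> 4
#   _fallback_parse_score('I would give this 3/5')     -> 3
#   _fallback_parse_score('no numeric verdict here')   -> 0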

def run_single_task(kernel: CognitiveKernel, task: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run a single task through the complete pipeline.

    Args:
        kernel: CognitiveKernel instance
        task: Task dictionary from data_loader

    Returns:
        Result dictionary with all fields
    """
    start_time = time.time()
    try:
        # Execute reasoning via CognitiveKernel
        result = kernel.reason(task['question'])

        if not result.success:
            # Fail fast on execution errors
            return {
                **task,
                'answer': None,
                'success': False,
                'error': f'kernel-failed: {getattr(result, "error", "unknown")}',
                'execution_time': time.time() - start_time,
                'reasoning_steps': 0,
                'score': 0,
                'judge_reason': 'execution-failed'
            }

        # Judge the answer
        score, judge_reason = judge_answer(
            kernel,
            task['question'],
            result.answer,
            task['ground_truth']
        )

        return {
            **task,
            'answer': result.answer,
            'success': True,
            'error': None,
            'execution_time': time.time() - start_time,
            'reasoning_steps': result.reasoning_steps,
            'score': score,
            'judge_reason': judge_reason
        }
    except Exception as e:
        return {
            **task,
            'answer': None,
            'success': False,
            'error': f'unexpected-error: {str(e)}',
            'execution_time': time.time() - start_time,
            'reasoning_steps': 0,
            'score': 0,
            'judge_reason': 'execution-failed'
        }
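

# Minimal usage sketch. CognitiveKernel construction is project-specific and the
# no-argument call below is only a placeholder; task fields beyond 'question'
# and 'ground_truth' (e.g. 'task_id') are hypothetical.
if __name__ == "__main__":
    kernel = CognitiveKernel()  # placeholder: adjust to your ck_pro configuration
    demo_task = {
        "task_id": "demo-001",
        "question": "What is 2 + 2?",
        "ground_truth": "4",
    }
    print(json.dumps(run_single_task(kernel, demo_task), indent=2, default=str))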