#!/usr/bin/env python3
"""
GAIA Runner - Minimal LLM judge with function calling
Implements 0-5 scoring system with unit/format awareness.
"""
import json
import time
from typing import Tuple, Dict, Any
from ck_pro.core import CognitiveKernel
from ck_pro.agents.model import LLM


def judge_answer(kernel: CognitiveKernel, question: str, answer: str, ground_truth: str) -> Tuple[int, str]:
    """
    Judge model answer against ground truth using LLM with function calling

    Args:
        kernel: CognitiveKernel instance (for accessing model config)
        question: Original task question
        answer: Model's answer to evaluate
        ground_truth: Expected correct answer

    Returns:
        Tuple of (score: int 0-5, reason: str)
    """
    # Handle edge cases
    if not ground_truth or not ground_truth.strip():
        return 0, 'empty-ground-truth'
    if not answer or not str(answer).strip():
        return 0, 'empty-answer'

    # Create LLM instance using same config as kernel
    cfg = kernel.settings.ck.model
    judge_llm = LLM(
        call_target=cfg.call_target,
        api_key=cfg.api_key,
        model=cfg.model,
        extract_body=cfg.extract_body.copy()  # Start with base config
    )

    # Prepare judge prompt
    system_prompt = (
        "You are a strict evaluator. Use the provided function `grade(score:int, reason:str)` "
        "to score the model answer against the ground truth. Consider units, conversions, "
        "and format requirements carefully."
    )
    user_prompt = f"""Task: {question}
Expected Answer (ground truth): {ground_truth}
Model Answer: {answer}
Guidelines:
- Score from 0 to 5 (integers only), where 5 = fully correct and compliant; 0 = wrong/irrelevant.
- Pay special attention to units, numerical conversions, and precision (e.g., 17 thousand hours ≠ 17000 hours if context implies unit mismatch).
- Enforce format requirements explicitly stated in the query (unit, casing, output schema, etc.).
- Penalize partial or ambiguous answers accordingly.
Use the provided function `grade(score:int, reason:str)` to return the result; do NOT output free text."""

    # Function calling schema for grading
    function_schema = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "grade",
                    "description": "Return a 0-5 integer score and a brief justification.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "score": {"type": "integer", "minimum": 0, "maximum": 5},
                            "reason": {"type": "string"}
                        },
                        "required": ["score", "reason"]
                    }
                }
            }
        ],
        "tool_choice": {"type": "function", "function": {"name": "grade"}}
    }

    # Prepare messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        # Call LLM with function calling
        response = judge_llm(messages, extract_body=function_schema)

        # Parse the response (should be JSON from function call)
        try:
            result = json.loads(response)
            score = int(result.get('score', 0))
            reason = str(result.get('reason', '')).strip()

            # Validate score range
            score = max(0, min(5, score))

            # Ensure reason is not empty
            if not reason:
                reason = 'llm-judge-no-reason'

            return score, reason
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            # Fallback: try to extract score from text response
            return _fallback_parse_score(response), f'parse-error: {str(e)}'
    except Exception as e:
        # Last resort: exact match fallback
        if str(answer).strip().lower() == str(ground_truth).strip().lower():
            return 5, 'fallback-exact-match'
        return 0, f'judge-failed: {str(e)}'
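

# Example of the tool-call payload the judge LLM is expected to return for the
# `grade` function (illustrative values only, not produced by this module):
#
#   {"score": 4, "reason": "Correct figure, but the requested unit is missing."}
#
# judge_answer() parses this JSON, clamps the score to 0-5, and falls back to
# _fallback_parse_score() when the response is not valid JSON.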


def _fallback_parse_score(response: str) -> int:
    """
    Fallback parser to extract score from text response

    Args:
        response: Raw LLM response text

    Returns:
        Extracted score (0-5)
    """
    import re

    # Try to find score patterns
    patterns = [
        r'"score":\s*(\d+)',
        r'score:\s*(\d+)',
        r'Score:\s*(\d+)',
        r'(\d+)/5',
        r'(\d+)\s*out\s*of\s*5'
    ]

    for pattern in patterns:
        match = re.search(pattern, response)
        if match:
            try:
                score = int(match.group(1))
                return max(0, min(5, score))
            except ValueError:
                continue

    # No score found
    return 0
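

# Quick illustration of the fallback parser (results follow directly from the
# regex patterns above; these calls are not executed by the runner):
#
#   _fallback_parse_score('{"score": 3, "reason": "partial"}')  ->  3
#   _fallback_parse_score('I would rate this 4/5')              ->  4
#   _fallback_parse_score('no numeric verdict here')            ->  0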


def run_single_task(kernel: CognitiveKernel, task: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run a single task through the complete pipeline

    Args:
        kernel: CognitiveKernel instance
        task: Task dictionary from data_loader

    Returns:
        Result dictionary with all fields
    """
    start_time = time.time()

    try:
        # Execute reasoning via CognitiveKernel
        result = kernel.reason(task['question'])

        if not result.success:
            # Fail fast on execution errors
            return {
                **task,
                'answer': None,
                'success': False,
                'error': f'kernel-failed: {getattr(result, "error", "unknown")}',
                'execution_time': time.time() - start_time,
                'reasoning_steps': 0,
                'score': 0,
                'judge_reason': 'execution-failed'
            }

        # Judge the answer
        score, judge_reason = judge_answer(
            kernel,
            task['question'],
            result.answer,
            task['ground_truth']
        )

        return {
            **task,
            'answer': result.answer,
            'success': True,
            'error': None,
            'execution_time': time.time() - start_time,
            'reasoning_steps': result.reasoning_steps,
            'score': score,
            'judge_reason': judge_reason
        }
    except Exception as e:
        return {
            **task,
            'answer': None,
            'success': False,
            'error': f'unexpected-error: {str(e)}',
            'execution_time': time.time() - start_time,
            'reasoning_steps': 0,
            'score': 0,
            'judge_reason': 'execution-failed'
        }
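

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (assumptions: CognitiveKernel() is constructible
# with default settings, and a task only needs 'question' and 'ground_truth'
# keys; in the real runner the kernel and tasks come from the surrounding
# harness and data_loader).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    kernel = CognitiveKernel()  # hypothetical default construction
    sample_task = {
        "question": "What is the capital of France?",
        "ground_truth": "Paris",
    }
    outcome = run_single_task(kernel, sample_task)
    # Print only the judged fields for a quick sanity check
    print(json.dumps(
        {k: outcome.get(k) for k in ("answer", "score", "judge_reason", "execution_time")},
        indent=2,
        default=str,
    ))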