#!/usr/bin/env python3
"""
GAIA Runner - Minimal LLM judge with function calling
Implements 0-5 scoring system with unit/format awareness.
"""

import json
import re
import time
from typing import Tuple, Dict, Any
from ck_pro.core import CognitiveKernel
from ck_pro.agents.model import LLM


def judge_answer(kernel: CognitiveKernel, question: str, answer: str, ground_truth: str) -> Tuple[int, str]:
    """
    Judge a model answer against the ground truth using an LLM with function calling.
    
    Args:
        kernel: CognitiveKernel instance (for accessing model config)
        question: Original task question
        answer: Model's answer to evaluate
        ground_truth: Expected correct answer
        
    Returns:
        Tuple of (score: int 0-5, reason: str)
    """
    # Handle edge cases
    if not ground_truth or not ground_truth.strip():
        return 0, 'empty-ground-truth'
    
    if not answer or not str(answer).strip():
        return 0, 'empty-answer'
    
    # Create LLM instance using same config as kernel
    cfg = kernel.settings.ck.model
    judge_llm = LLM(
        call_target=cfg.call_target,
        api_key=cfg.api_key,
        model=cfg.model,
        extract_body=cfg.extract_body.copy()  # Start with base config
    )
    
    # Prepare judge prompt
    system_prompt = (
        "You are a strict evaluator. Use the provided function `grade(score:int, reason:str)` "
        "to score the model answer against the ground truth. Consider units, conversions, "
        "and format requirements carefully."
    )
    
    user_prompt = f"""Task: {question}

Expected Answer (ground truth): {ground_truth}
Model Answer: {answer}

Guidelines:
- Score from 0 to 5 (integers only), where 5 = fully correct and compliant; 0 = wrong/irrelevant.
- Pay special attention to units, numerical conversions, and precision (e.g., "17" does not match "17,000 hours" when the question asks for hours, even if a source states the figure as "17 thousand").
- Enforce format requirements explicitly stated in the query (unit, casing, output schema, etc.).
- Penalize partial or ambiguous answers accordingly.

Use the provided function `grade(score:int, reason:str)` to return the result; do NOT output free text."""

    # Function calling schema for grading
    function_schema = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "grade",
                    "description": "Return a 0-5 integer score and a brief justification.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "score": {"type": "integer", "minimum": 0, "maximum": 5},
                            "reason": {"type": "string"}
                        },
                        "required": ["score", "reason"]
                    }
                }
            }
        ],
        "tool_choice": {"type": "function", "function": {"name": "grade"}}
    }
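    # Note: the parsing below assumes the LLM client returns the tool call's
    # arguments as a JSON string, e.g. '{"score": 4, "reason": "units match"}'.
    # If the client returned the full chat-completion envelope instead, the
    # json.loads() step would need to unwrap message.tool_calls first.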
    
    # Prepare messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    
    try:
        # Call LLM with function calling
        response = judge_llm(messages, extract_body=function_schema)
        
        # Parse the response (should be JSON from function call)
        try:
            result = json.loads(response)
            score = int(result.get('score', 0))
            reason = str(result.get('reason', '')).strip()
            
            # Validate score range
            score = max(0, min(5, score))
            
            # Ensure reason is not empty
            if not reason:
                reason = 'llm-judge-no-reason'
                
            return score, reason
            
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            # Fallback: try to extract score from text response
            return _fallback_parse_score(response), f'parse-error: {str(e)}'
            
    except Exception as e:
        # Last resort: exact match fallback
        if str(answer).strip().lower() == str(ground_truth).strip().lower():
            return 5, 'fallback-exact-match'
        return 0, f'judge-failed: {str(e)}'


def _fallback_parse_score(response: str) -> int:
    """
    Fallback parser to extract score from text response
    
    Args:
        response: Raw LLM response text
        
    Returns:
        Extracted score (0-5)
    """
    
    # Try to find score patterns
    patterns = [
        r'"score":\s*(\d+)',
        r'score:\s*(\d+)',
        r'Score:\s*(\d+)',
        r'(\d+)/5',
        r'(\d+)\s*out\s*of\s*5'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, response)
        if match:
            try:
                score = int(match.group(1))
                return max(0, min(5, score))
            except ValueError:
                continue
    
    # No score found
    return 0
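
# Illustrative inputs for the fallback parser (hypothetical responses):
#   _fallback_parse_score('{"score": 4, "reason": "close"}')  -> 4
#   _fallback_parse_score('Score: 3, partially correct')      -> 3
#   _fallback_parse_score('I rate this 7/5')                  -> 5 (clamped)
#   _fallback_parse_score('no numeric score here')            -> 0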


def run_single_task(kernel: CognitiveKernel, task: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run a single task through the complete pipeline
    
    Args:
        kernel: CognitiveKernel instance
        task: Task dictionary from data_loader
        
    Returns:
        Result dictionary with all fields
    """
    start_time = time.time()
    
    try:
        # Execute reasoning via CognitiveKernel
        result = kernel.reason(task['question'])
        
        if not result.success:
            # Record the failure without judging the missing answer
            return {
                **task,
                'answer': None,
                'success': False,
                'error': f'kernel-failed: {getattr(result, "error", "unknown")}',
                'execution_time': time.time() - start_time,
                'reasoning_steps': 0,
                'score': 0,
                'judge_reason': 'execution-failed'
            }
        
        # Judge the answer
        score, judge_reason = judge_answer(
            kernel, 
            task['question'], 
            result.answer, 
            task['ground_truth']
        )
        
        return {
            **task,
            'answer': result.answer,
            'success': True,
            'error': None,
            'execution_time': time.time() - start_time,
            'reasoning_steps': result.reasoning_steps,
            'score': score,
            'judge_reason': judge_reason
        }
        
    except Exception as e:
        return {
            **task,
            'answer': None,
            'success': False,
            'error': f'unexpected-error: {str(e)}',
            'execution_time': time.time() - start_time,
            'reasoning_steps': 0,
            'score': 0,
            'judge_reason': 'execution-failed'
        }
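

# Usage sketch (illustrative). Assumptions: CognitiveKernel() can be
# constructed with default settings, and tasks are dicts carrying at least
# 'question' and 'ground_truth' (the keys consumed by run_single_task above);
# the task values shown below are hypothetical.
if __name__ == "__main__":
    # Self-contained smoke test of the fallback parser (no LLM calls needed).
    assert _fallback_parse_score('{"score": 4, "reason": "ok"}') == 4
    assert _fallback_parse_score('Score: 3') == 3
    assert _fallback_parse_score('nothing useful') == 0

    # Hypothetical end-to-end run; requires a configured CognitiveKernel.
    # kernel = CognitiveKernel()
    # task = {'question': 'What is the capital of France?', 'ground_truth': 'Paris'}
    # result = run_single_task(kernel, task)
    # print(json.dumps({k: result[k] for k in ('answer', 'score', 'judge_reason')}, indent=2))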