#!/usr/bin/env python3
# NOTICE: This file is adapted from Tencent's CognitiveKernel-Pro (https://github.com/Tencent/CognitiveKernel-Pro).
# Modifications in this fork (2025) are for academic research and educational use only; no commercial use.
# Original rights belong to the original authors and Tencent; see upstream license for details.
"""
GAIA Simple Validator - Minimal CLI for GAIA evaluation

Pipeline: filter → run via CognitiveKernel → LLM judge → write results
"""

import argparse
import json
import os
import sys
import time
from datetime import datetime
from typing import List, Dict, Any

# Robust imports with fallback to repository root
try:
    from ck_pro.core import CognitiveKernel
    from ck_pro.config.settings import Settings
except ImportError:
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
    from ck_pro.core import CognitiveKernel
    from ck_pro.config.settings import Settings

from gaia.data_loader import load_tasks, filter_tasks, get_task_stats
from gaia.runner import run_single_task


def main():
    """Main CLI entry point"""
    parser = argparse.ArgumentParser(
        description='GAIA Simple Validator - Minimal evaluation pipeline',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run all tasks without file attachments
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml

  # Run level 2 tasks only, limit to 50
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml --level 2 --count 50

  # Specify output file
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml --output results.jsonl
"""
    )
    parser.add_argument(
        '--data',
        required=True,
        help='Path to GAIA metadata.jsonl file'
    )
    parser.add_argument(
        '--config',
        required=False,
        help='Path to TOML configuration file (optional; environment variables supported)'
    )
    parser.add_argument(
        '--level',
        default='all',
        choices=['1', '2', '3', 'all'],
        help='Filter by difficulty level (default: all)'
    )
    parser.add_argument(
        '--count',
        type=int,
        default=0,
        help='Maximum number of tasks to run (0 = no limit)'
    )
    parser.add_argument(
        '--output',
        help='Output JSONL file path (default: output/results_YYYYMMDD_HHMMSS.jsonl)'
    )

    args = parser.parse_args()
    # Load and filter tasks
    print(f"Loading tasks from {args.data}...")
    try:
        all_tasks = load_tasks(args.data)
        print(f"Loaded {len(all_tasks)} total tasks")

        # Show initial stats
        initial_stats = get_task_stats(all_tasks)
        print(f"Initial stats: {initial_stats}")

        # Filter tasks
        tasks = filter_tasks(all_tasks, level=args.level, limit=args.count)
        print(f"After filtering: {len(tasks)} tasks (level={args.level}, limit={args.count})")

        if not tasks:
            print("No tasks to process after filtering. Exiting.")
            sys.exit(0)
    except Exception as e:
        print(f"Error loading data: {e}")
        sys.exit(1)
    # Initialize CognitiveKernel (support env-only when no --config provided)
    try:
        if args.config and os.path.exists(args.config):
            print(f"Initializing CognitiveKernel from config: {args.config}")
            settings = Settings.load(args.config)
        else:
            print("Initializing CognitiveKernel (no config file); using environment variables if set, otherwise defaults")
            settings = Settings.load(args.config or "config.toml")
        kernel = CognitiveKernel(settings)
        print("CognitiveKernel initialized successfully")
    except Exception as e:
        print(f"Error initializing CognitiveKernel: {e}")
        sys.exit(1)
    # Determine output path
    output_path = args.output
    if not output_path:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        os.makedirs('output', exist_ok=True)
        output_path = os.path.join('output', f'results_{timestamp}.jsonl')
    print(f"Results will be written to: {output_path}")
    # Process tasks
    results = []
    failed_count = 0

    print(f"\nProcessing {len(tasks)} tasks...")
    print("=" * 60)

    for i, task in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] Processing task: {task['task_id']}")
        try:
            result = run_single_task(kernel, task)
            results.append(result)

            # Check for execution failure (fail-fast requirement)
            if not result['success']:
                print(f"FAILED: {result['error']}")
                failed_count += 1
                # Fail fast on first kernel failure
                print(f"\nFail-fast triggered: Task {task['task_id']} failed execution")
                print(f"Error: {result['error']}")
                print("Exiting immediately as per fail-fast policy")
                sys.exit(1)
            else:
                print(f"SUCCESS: Score {result['score']}/5 - {result['judge_reason']}")
        except KeyboardInterrupt:
            print("\nInterrupted by user")
            break
        except Exception as e:
            print(f"UNEXPECTED ERROR: {e}")
            failed_count += 1
            # Still fail fast on unexpected errors
            sys.exit(1)
    # Write results
    print(f"\nWriting {len(results)} results to {output_path}")
    try:
        # Ensure parent directory exists (handles --output with nested paths)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print("Results written successfully")
    except Exception as e:
        print(f"Error writing results: {e}")
        sys.exit(1)
    # Summary statistics
    if results:
        successful = [r for r in results if r['success']]
        scores = [r['score'] for r in successful]

        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total tasks processed: {len(results)}")
        print(f"Successful executions: {len(successful)}")
        print(f"Failed executions: {failed_count}")

        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"Average score: {avg_score:.2f}/5")
            print("Score distribution:")
            for score in range(6):
                count = scores.count(score)
                if count > 0:
                    print(f"  Score {score}: {count} tasks ({count/len(scores)*100:.1f}%)")

        print(f"\nResults saved to: {output_path}")
        print("Evaluation completed successfully")


if __name__ == '__main__':
    main()