#!/usr/bin/env python3
"""
Logged Clean Test - Test all questions with proper logging and no overrides
"""

import os
import sys
import json
import time
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
from tests.test_logging_utils import test_logger
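
# Assumption: the .env file loaded above supplies whatever credentials
# GAIASolver(use_kluster=True, ...) expects (e.g. a Kluster.ai API key); the exact
# variable names are defined in main.py, not in this script.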


def load_validation_answers():
    """Load correct answers from GAIA validation metadata"""
    answers = {}
    try:
        validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_path, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        answers[task_id] = final_answer
    except Exception as e:
        print(f"⚠️ Could not load validation data: {e}")
    return answers
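
# For reference, each non-empty line of gaia_validation_metadata.jsonl is expected to be a
# JSON object carrying at least the two fields read above (placeholder values shown):
#   {"task_id": "<task-id>", "Final answer": "<expected answer>", ...}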


def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
    """Validate our answer against the correct answer"""
    if task_id not in validation_answers:
        return None

    expected = str(validation_answers[task_id]).strip()
    our_clean = str(our_answer).strip()

    # Exact match
    if our_clean.lower() == expected.lower():
        return {"status": "CORRECT", "expected": expected, "our": our_clean}

    # Check if our answer contains the expected answer
    if expected.lower() in our_clean.lower():
        return {"status": "PARTIAL", "expected": expected, "our": our_clean}

    return {"status": "INCORRECT", "expected": expected, "our": our_clean}
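
# Illustrative behaviour (hypothetical values), following the matching rules above:
#   validate_answer("t1", "Paris", {"t1": "Paris"})          -> {"status": "CORRECT", ...}
#   validate_answer("t1", "Paris, France", {"t1": "Paris"})  -> {"status": "PARTIAL", ...}   (expected contained in ours)
#   validate_answer("t1", "Lyon", {"t1": "Paris"})           -> {"status": "INCORRECT", ...}
#   validate_answer("unknown", "Lyon", {})                   -> None                          (no validation data)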


def test_single_question(question_data, validation_answers, model="qwen3-235b"):
    """Test a single question without any overrides - WITH LOGGING"""
    task_id = question_data.get('task_id', 'unknown')

    # Use the same logging approach as test_specific_question.py
    with test_logger("clean_batch_question", task_id):
        try:
            print(f"🧪 Testing question: {task_id}")
            print("=" * 60)

            # Initialize solver and classifier
            print(f"🚀 Initializing GAIA Solver with Kluster.ai {model}...")
            solver = GAIASolver(use_kluster=True, kluster_model=model)
            print("🧠 Initializing Question Classifier...")
            classifier = QuestionClassifier()

            # Display question details
            print(f"✅ Found question!")
            print(f"📝 Question: {question_data.get('question', 'N/A')}")
            print(f"🏷️ Level: {question_data.get('Level', 'Unknown')}")
            print(f"📎 Has file: {'Yes' if question_data.get('file_name') else 'No'}")
            if question_data.get('file_name'):
                print(f"📎 File: {question_data.get('file_name')}")

            # Classify the question
            print(f"\n🧠 QUESTION CLASSIFICATION:")
            print("-" * 40)
            question_text = question_data.get('question', '')
            file_name = question_data.get('file_name', '')
            classification = classifier.classify_question(question_text, file_name)

            print(f"🎯 Primary Agent: {classification['primary_agent']}")
            if classification['secondary_agents']:
                print(f"🤝 Secondary Agents: {', '.join(classification['secondary_agents'])}")
            print(f"📊 Complexity: {classification['complexity']}/5")
            print(f"🎲 Confidence: {classification['confidence']:.3f}")
            print(f"🔧 Tools Needed: {', '.join(classification['tools_needed'][:3])}")
            if len(classification['tools_needed']) > 3:
                print(f"   (+{len(classification['tools_needed'])-3} more tools)")
            print(f"💭 Reasoning: {classification['reasoning']}")

            # Solve the question (NO OVERRIDES - pure LLM reasoning)
            print(f"\n🤖 Solving question...")
            print(f"🎯 Question type: {classification['primary_agent']}")
            print(f"🔄 Processing... (NO OVERRIDES - Pure LLM + Tools)")

            start_time = time.time()
            answer = solver.solve_question(question_data)
            end_time = time.time()
            duration = end_time - start_time

            print(f"✅ Completed in {duration:.1f} seconds")

            # Validate answer
            print(f"\n🔍 ANSWER VALIDATION:")
            print("-" * 40)
            validation_result = validate_answer(task_id, answer, validation_answers)

            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Our Answer: {validation_result['our']}")
                print(f"Status: {validation_result['status']}")
                if validation_result['status'] == 'CORRECT':
                    print(f"🎉 PERFECT MATCH!")
                elif validation_result['status'] == 'PARTIAL':
                    print(f"🟡 PARTIAL MATCH - contains correct answer")
                else:
                    print(f"❌ INCORRECT - answers don't match")
            else:
                print(f"⚠️ No validation data available for question {task_id}")

            print(f"\n📋 FINAL RESULTS:")
            print("=" * 60)
            print(f"Task ID: {task_id}")
            print(f"Question Type: {classification['primary_agent']}")
            print(f"Classification Confidence: {classification['confidence']:.3f}")
            print(f"Our Answer: {answer}")
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Validation Status: {validation_result['status']}")
            print(f"Duration: {duration:.1f}s")
            print(f"🚫 NO OVERRIDES APPLIED - Pure LLM reasoning")

            result = {
                'task_id': task_id,
                'question_type': classification['primary_agent'],
                'complexity': classification['complexity'],
                'confidence': classification['confidence'],
                'our_answer': str(answer),
                'expected_answer': validation_result['expected'] if validation_result else 'N/A',
                'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
                'duration': duration,
                'question_preview': question_data.get('question', '')[:50] + "..."
            }

            status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
            print(f"\n{status_icon} FINAL STATUS: {result['status']}")

            return result

        except Exception as e:
            print(f"❌ Error testing question: {e}")
            import traceback
            traceback.print_exc()
            return {
                'task_id': task_id,
                'question_type': 'error',
                'complexity': 0,
                'confidence': 0.0,
                'our_answer': '',
                'expected_answer': validation_answers.get(task_id, 'N/A'),
                'status': 'ERROR',
                'duration': 0.0,
                'error': str(e),
                'question_preview': question_data.get('question', '')[:50] + "..."
            }
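
# Illustrative standalone usage (hypothetical data; assumes the loader and validation
# file used elsewhere in this script are reachable from this environment):
#   loader = GAIAQuestionLoaderWeb()
#   result = test_single_question(loader.questions[0], load_validation_answers())
#   print(result['status'], f"{result['duration']:.1f}s")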


def run_logged_clean_test():
    """Run logged clean test on all questions"""
    print("🧪 LOGGED CLEAN TEST - NO OVERRIDES")
    print("=" * 60)
    print("🎯 Goal: Measure real accuracy with full logging")
    print("🚫 No hardcoded answers or overrides")
    print("🤖 Pure LLM + Tools reasoning only")
    print("📝 Full detailed logs will be created")
    print()

    # Load questions and validation data
    print("📋 Loading GAIA questions...")
    loader = GAIAQuestionLoaderWeb()
    all_questions = loader.questions
    validation_answers = load_validation_answers()

    print(f"✅ Loaded {len(all_questions)} questions")
    print(f"✅ Loaded {len(validation_answers)} validation answers")

    # Show question preview
    print(f"\n📋 Questions to test:")
    for i, q in enumerate(all_questions[:3]):  # Show first 3
        task_id = q.get('task_id', 'unknown')
        question_preview = q.get('question', '')[:40] + "..."
        level = q.get('Level', 'Unknown')
        expected = validation_answers.get(task_id, 'N/A')
        has_file = "📎" if q.get('file_name') else "📝"
        print(f"  {i+1}. {task_id[:8]}... | L{level} | {has_file} | Expected: {expected}")
        print(f"     {question_preview}")
    if len(all_questions) > 3:
        print(f"  ... and {len(all_questions) - 3} more questions")

    print(f"\n🚀 Starting logged clean test...")
    print(f"📝 Each question will create a detailed log file")
    print(f"⏱️ Estimated time: ~{len(all_questions) * 2} minutes")

    # Process first 3 questions for demonstration (you can change this)
    test_questions = all_questions[:3]  # Test first 3 questions

    start_time = time.time()
    results = []

    for i, question_data in enumerate(test_questions):
        print(f"\n" + "=" * 80)
        print(f"📊 PROGRESS: {i+1}/{len(test_questions)}")
        print(f"🔄 Processing question {question_data.get('task_id', 'unknown')[:8]}...")

        result = test_single_question(question_data, validation_answers)
        results.append(result)

        # Show progress
        completed = i + 1
        correct_so_far = len([r for r in results if r['status'] == 'CORRECT'])
        current_accuracy = correct_so_far / completed * 100
        print(f"📈 Current accuracy: {current_accuracy:.1f}% ({correct_so_far}/{completed})")

    end_time = time.time()
    total_duration = end_time - start_time

    # Final analysis
    print(f"\n" + "=" * 80)
    print(f"🏆 LOGGED CLEAN TEST RESULTS")
    print("=" * 80)

    # Calculate metrics
    total_questions = len(results)
    correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
    partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
    incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
    errors = len([r for r in results if r['status'] == 'ERROR'])

    accuracy_rate = correct_answers / total_questions * 100
    success_rate = (correct_answers + partial_answers) / total_questions * 100

    print(f"⏱️ Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
    print(f"✅ **HONEST ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
    print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")
    print(f"⚡ Avg per Question: {total_duration/total_questions:.1f}s")

    print(f"\n📊 DETAILED BREAKDOWN:")
    print(f"   ✅ CORRECT: {correct_answers} ({correct_answers/total_questions:.1%})")
    print(f"   🟡 PARTIAL: {partial_answers} ({partial_answers/total_questions:.1%})")
    print(f"   ❌ INCORRECT: {incorrect_answers} ({incorrect_answers/total_questions:.1%})")
    print(f"   💥 ERROR: {errors} ({errors/total_questions:.1%})")

    # Question-by-question results
    print(f"\n📋 DETAILED QUESTION RESULTS:")
    for i, result in enumerate(results):
        status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
        print(f"  {i+1}. {status_icon} {result['task_id'][:8]}... | {result['question_type']:12} | {result['status']:9} | {result['duration']:5.1f}s")
        print(f"      Expected: {result['expected_answer']}")
        print(f"      Got: {result['our_answer']}")
        if 'error' in result:
            print(f"      Error: {result['error']}")

    # Save results
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    os.makedirs("logs", exist_ok=True)  # Ensure the logs/ directory exists before writing
    results_file = f"logs/logged_clean_test_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            'test_metadata': {
                'timestamp': timestamp,
                'test_type': 'logged_clean_test_no_overrides',
                'total_questions': total_questions,
                'duration_seconds': total_duration,
                'model': 'qwen3-235b',
                'note': 'Pure LLM reasoning with full logging'
            },
            'metrics': {
                'accuracy_rate': accuracy_rate,
                'success_rate': success_rate,
                'correct_answers': correct_answers,
                'partial_answers': partial_answers,
                'incorrect_answers': incorrect_answers,
                'errors': errors
            },
            'detailed_results': results
        }, f, indent=2)

    print(f"\n📁 Results summary saved to: {results_file}")
    print(f"📋 Individual question logs saved to: logs/clean_batch_question_<id>_*.log")

    # Final assessment
    print(f"\n🎯 HONEST ASSESSMENT:")
    print(f"🚫 NO CHEATING - Pure LLM reasoning only")
    print(f"🏆 **Real System Accuracy: {accuracy_rate:.1f}%**")

    if accuracy_rate >= 70:
        print(f"🎉 EXCELLENT: Achieves 70%+ target!")
    elif accuracy_rate >= 50:
        print(f"🔧 GOOD: Solid performance, room for improvement")
    elif accuracy_rate >= 30:
        print(f"⚠️ MODERATE: Needs significant improvements")
    else:
        print(f"🚨 POOR: Requires major system overhaul")

    print(f"\n📋 Check the log files for detailed execution traces!")

    return accuracy_rate, results


if __name__ == "__main__":
    accuracy, results = run_logged_clean_test()
    print(f"\n🎉 Logged clean test completed!")
    print(f"🏆 **HONEST ACCURACY: {accuracy:.1f}%**")
    print(f"📁 Full logs available in logs/ directory")