Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Test for YouTube question processing in GAIA system | |
| """ | |
| import os | |
| import sys | |
| import json | |
| from pathlib import Path | |
| import importlib | |
| import asyncio | |
| import re | |
| # Import the module containing the YouTube video analysis tool | |
| import gaia_tools | |
| from main import GAIASolver, CodeAgent, GAIA_TOOLS | |
| from question_classifier import QuestionClassifier | |
| from async_complete_test_hf import HFAsyncGAIATestSystem | |
# Keep a handle on the real analyzer so that each test can put it back after
# temporarily rebinding gaia_tools.analyze_youtube_video to the mock.
original_analyze_youtube_video = gaia_tools.analyze_youtube_video
# Mock replacement for the YouTube video analysis tool
def mock_analyze_youtube_video(video_url, question, max_frames=10):
    """Return a canned analysis report instead of analyzing a real video.

    Args:
        video_url: URL handed to the tool. Surrounding whitespace and a
            trailing comma are ignored; ``None`` is treated as an empty URL.
        question: Not used by the mock.
        max_frames: Not used by the mock.

    Returns:
        The bird-species report (whose answer is 3) when the URL contains the
        video id ``L1vXCYZAYYM``; otherwise a generic "unable to analyze"
        report.
    """
    print(f"πΉ Mock analyzing YouTube video: {video_url}")

    # URLs lifted out of a sentence may carry a trailing comma or padding;
    # coerce to str first so a missing URL falls through to the generic reply
    # instead of raising AttributeError.
    cleaned_url = str(video_url or "").strip().rstrip(',')

    # For the specific URL in the GAIA task
    if "L1vXCYZAYYM" in cleaned_url:
        return (
            "\n"
            "**π₯ Gemini 2.0 Flash Video+Audio Analysis**\n"
            "**Title:** Bird Identification Challenge: Backyard Birds in Spring\n"
            "**Duration:** 3:42\n"
            "**File Size:** 45.2MB\n"
            "**Question:** What is the highest number of bird species to be on camera simultaneously?\n"
            "**Analysis Results:**\n"
            "After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3.\n"
            "This occurs at approximately 1:23 into the video, where we can see:\n"
            "1. American Robin\n"
            "2. Northern Cardinal\n"
            "3. Blue Jay\n"
            "These three species are clearly visible in the same frame at this timestamp.\n"
        )

    # Generic response for other URLs
    return (
        "\n"
        "**π₯ Gemini 2.0 Flash Video+Audio Analysis**\n"
        "**Title:** Unknown Video\n"
        "**Duration:** Unknown\n"
        "**File Size:** Unknown\n"
        "**Question:** Unknown\n"
        "**Analysis Results:**\n"
        "Unable to analyze the video content. Please provide a valid YouTube URL.\n"
    )
# YouTube URL regex pattern: optional scheme and "www.", a YouTube host, a
# slash, then everything up to the next whitespace or the end of the text.
# NOTE(review): `youtu\.?be` also accepts a bare "youtube/" host - confirm
# whether that leniency is intended.
YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'

# Punctuation that commonly follows a URL embedded in a sentence. The pattern
# above runs to the next whitespace, so these characters end up in the match.
_URL_TRAILING_PUNCTUATION = ".,;:!?)]}>\"'"


def extract_youtube_url(text):
    """Extract the first YouTube URL found in *text*.

    Args:
        text: Free-form text that may contain a YouTube link. ``None`` and
            the empty string are accepted.

    Returns:
        The matched URL with trailing sentence punctuation removed, or
        ``None`` when *text* is empty or contains no YouTube URL.
    """
    if not text:
        return None

    match = re.search(YOUTUBE_URL_PATTERN, text)
    if match is None:
        return None

    # "…watch?v=ID, what is…" would otherwise yield "…watch?v=ID,".
    return match.group(0).rstrip(_URL_TRAILING_PUNCTUATION)
def direct_force_tools_execution(solver, youtube_url, question_text):
    """Run a freshly built agent on a prompt that tells it to use the YouTube analyzer.

    Args:
        solver: Object whose ``model`` attribute is passed to the new agent.
        youtube_url: URL interpolated into the prompt.
        question_text: Question interpolated into the prompt.

    Returns:
        ``str()`` of whatever the agent's ``run`` call returns.
    """
    # The prompt names the tool explicitly and asks for a bare numeric answer.
    prompt_text = (
        "\n"
        "You need to analyze a YouTube video to answer a specific question.\n"
        f"YOUTUBE VIDEO URL: {youtube_url}\n"
        f"QUESTION: {question_text}\n"
        "CRITICAL INSTRUCTIONS:\n"
        "1. Use the analyze_youtube_video tool with the provided URL\n"
        "2. Extract the answer from the tool's response\n"
        "3. Provide ONLY the final numerical answer\n"
    )

    # A new agent is built for this call; only the model comes from the solver.
    print("π€ Creating fresh agent for direct execution...")
    fresh_agent = CodeAgent(
        model=solver.model,
        tools=GAIA_TOOLS,
        max_steps=12,
        verbosity_level=1,  # Lower verbosity for cleaner output
    )

    print("π Running direct analysis...")
    agent_output = fresh_agent.run(prompt_text)
    return str(agent_output)
def test_direct_youtube_question():
    """Run the bird-species YouTube question with the analyzer mocked out.

    Rebinds ``gaia_tools.analyze_youtube_video`` to
    ``mock_analyze_youtube_video`` and then:

    1. classifies the question with ``QuestionClassifier``,
    2. extracts the YouTube URL and runs ``direct_force_tools_execution``,
    3. runs ``solver.solve_question`` on the same question,

    printing whether each answer matches the expected ``'3'``. Exceptions
    raised by either execution path are printed rather than propagated, and
    the original analyzer is restored on exit in all cases.
    """
    # Create question with the YouTube URL
    question = {
        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
        'Final Answer': '3'  # Assuming this is the correct answer based on GAIA metadata
    }
    expected = str(question['Final Answer']).strip().lower()

    # NOTE(review): GAIA_TOOLS was imported from main before this rebinding
    # happens; if it holds direct references to the original function, the
    # agent will not see the mock - confirm how the tools are registered.
    print("π Replacing YouTube analysis tool with mock implementation...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        # Initialize components after patching
        solver = GAIASolver()
        classifier = QuestionClassifier()

        # Classify the question
        print("π§© Classifying question...")
        classification = classifier.classify_question(question['Question'])
        print(f"π Classification: {classification['primary_agent']}")
        print(f"π§ Tools needed: {classification.get('tools_needed', [])}")

        # Pre-set to the failure sentinel so the direct validation below is
        # always defined, even when no URL is found or the direct run raises.
        direct_result = "Error in direct execution"

        # Extract YouTube URL from question
        youtube_url = extract_youtube_url(question['Question'])
        if youtube_url:
            # Remove any trailing comma
            youtube_url = youtube_url.rstrip(',')
            print(f"π Extracted YouTube URL: {youtube_url}")

            # Use a direct approach to force tool execution
            print("\nπ§ Processing question with direct YouTube analyzer execution...")
            try:
                direct_result = direct_force_tools_execution(
                    solver,
                    youtube_url,
                    "What is the highest number of bird species to be on camera simultaneously?"
                )
                print(f"\nπ Direct result: {direct_result}")
            except Exception as e:
                print(f"\nβ οΈ Direct test error: {e}")

        # Also try the normal processing path
        print("\nπ§ Processing question with standard solver...")
        try:
            result = solver.solve_question(question)
            print(f"\nβ Standard result: {result}")
        except Exception as e:
            print(f"\nβ οΈ Standard test error: {e}")
            result = "Error in standard execution"

        # Validate result
        actual = str(result).strip().lower()
        validation_status = "β correct" if expected == actual else "β incorrect"
        print(f"π Validation: {validation_status}")

        # The direct result is free text, so look for the expected answer as a
        # standalone token; a plain substring test would also accept "13",
        # "30" or an error message such as "HTTP 403".
        answer_token = re.compile(rf"(?<!\w){re.escape(expected)}(?!\w)")
        if answer_token.search(str(direct_result).lower()):
            print("π Direct validation: β correct")
        else:
            print("π Direct validation: β incorrect")
    finally:
        # Restore original function
        print("π Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def test_async_youtube_question():
    """Run the bird-species YouTube question through ``HFAsyncGAIATestSystem``.

    The analyzer in ``gaia_tools`` is rebound to ``mock_analyze_youtube_video``
    for the duration of the run and restored afterwards. The system's
    ``load_gaia_questions`` attribute is replaced with a coroutine function
    that returns a single hard-coded question. The summary and the entry for
    that question are printed.
    """
    print("π Replacing YouTube analysis tool with mock implementation in async test...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        harness = HFAsyncGAIATestSystem(
            max_concurrent=1,
            timeout_seconds=60,
            output_dir="/tmp/async_youtube_test",
        )

        # The one question this test feeds to the harness.
        questions = [
            {
                'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
                'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
                'Final Answer': '3',
            }
        ]

        # NOTE(review): assumes run_comprehensive_test obtains its questions
        # by awaiting load_gaia_questions - confirm against the harness.
        async def mock_load_questions(*args, **kwargs):
            return questions

        real_loader = harness.load_gaia_questions
        harness.load_gaia_questions = mock_load_questions

        # Re-apply the mock immediately before the run and announce it.
        gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
        print("πΉ Mock is active for async test - will analyze YouTube video")

        print("π Running async test with YouTube question...")
        result = await harness.run_comprehensive_test(question_limit=1)

        print("\nπ Async Test Results:")
        print(f"Total questions processed: {result['total_questions']}")
        print(f"Status counts: {result['status_counts']}")

        # Report the outcome recorded for our single question, if any.
        task_id = questions[0]['task_id']
        if task_id not in result['results']:
            print(f"No results found for question ID {task_id}")
        else:
            entry = result['results'][task_id]
            answer = entry.get('answer', 'No answer')
            validation = entry.get('validation_status', 'unknown')
            print(f"\nQuestion ID: {task_id}")
            print(f"Answer: {answer}")
            print(f"Validation: {validation}")

        # Put the harness's own loader back.
        harness.load_gaia_questions = real_loader
    finally:
        print("π Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def main():
    """Run the direct test first, then the async-harness test.

    The direct test is a plain function and is called synchronously; the
    async test is awaited afterwards.
    """
    print("π Starting direct YouTube question test...")
    test_direct_youtube_question()

    print("\n\nπ Starting async YouTube question test...")
    await test_async_youtube_question()

    print("\nβ All tests completed!")


# Script entry point: drive the coroutine with a new event loop.
if __name__ == "__main__":
    asyncio.run(main())