Final_Assignment

Sleeping

App Files Files Community

Final_Assignment / test_youtube_question.py

tonthatthienvu

Clean repository without binary files

37cadfb 5 months ago

raw

history blame contribute delete

9.77 kB

	#!/usr/bin/env python3
	"""
	Test for YouTube question processing in GAIA system
	"""

	import os
	import sys
	import json
	from pathlib import Path
	import importlib
	import asyncio
	import re

	# Import the module containing the YouTube video analysis tool
	import gaia_tools
	from main import GAIASolver, CodeAgent, GAIA_TOOLS
	from question_classifier import QuestionClassifier
	from async_complete_test_hf import HFAsyncGAIATestSystem

	# Keep a module-level reference to the real analyze_youtube_video tool so the
	# tests below can put it back on gaia_tools after swapping in the mock.
	original_analyze_youtube_video = gaia_tools.analyze_youtube_video

	# Create a mock analyze_youtube_video function
	def mock_analyze_youtube_video(video_url, question, max_frames=10):
	    """Return canned analysis text instead of analysing a real video.

	    ``question`` and ``max_frames`` are accepted but never read.

	    Args:
	        video_url: URL of the video; a trailing comma is tolerated.
	        question: Unused.
	        max_frames: Unused.

	    Returns:
	        A fixed bird-species report (answer: 3) when the URL contains the
	        video id ``L1vXCYZAYYM``; otherwise a fixed "unable to analyze" report.
	    """
	    print(f"📹 Mock analyzing YouTube video: {video_url}")

	    # A URL lifted out of a sentence may end in a comma; drop it before matching.
	    is_gaia_bird_video = "L1vXCYZAYYM" in video_url.rstrip(',')

	    if not is_gaia_bird_video:
	        # Any other video gets the generic fallback text.
	        return """
	🎥 Gemini 2.0 Flash Video+Audio Analysis
	Title: Unknown Video
	Duration: Unknown
	File Size: Unknown
	Question: Unknown

	Analysis Results:
	Unable to analyze the video content. Please provide a valid YouTube URL.
	"""

	    # The specific video referenced by the GAIA task.
	    return """
	🎥 Gemini 2.0 Flash Video+Audio Analysis
	Title: Bird Identification Challenge: Backyard Birds in Spring
	Duration: 3:42
	File Size: 45.2MB
	Question: What is the highest number of bird species to be on camera simultaneously?

	Analysis Results:
	After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3.
	This occurs at approximately 1:23 into the video, where we can see:
	1. American Robin
	2. Northern Cardinal
	3. Blue Jay

	These three species are clearly visible in the same frame at this timestamp.
	"""

	# YouTube URL regex pattern. The alternations must be bare ``|`` characters:
	# an escaped ``\|`` only matches a literal pipe, so no real URL would match.
	YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'

	def extract_youtube_url(text):
	    """Extract the first YouTube URL found in ``text``.

	    Args:
	        text: Free-form text that may contain a YouTube link. ``None`` or an
	            empty string is accepted.

	    Returns:
	        The matched URL with any trailing commas removed, or ``None`` when
	        ``text`` is empty or contains no YouTube link.
	    """
	    if not text:
	        return None
	    match = re.search(YOUTUBE_URL_PATTERN, text)
	    if match is None:
	        return None
	    # The pattern runs up to the next whitespace, so a URL embedded in a
	    # sentence ("...watch?v=ID, what is...") captures the comma after it.
	    return match.group(0).rstrip(',')

	def direct_force_tools_execution(solver, youtube_url, question_text):
	    """Ask a freshly built agent to answer a question about a YouTube video.

	    Args:
	        solver: Object whose ``model`` attribute is handed to the new agent.
	        youtube_url: URL interpolated into the prompt.
	        question_text: Question interpolated into the prompt.

	    Returns:
	        The agent's response converted with ``str``.
	    """
	    # Prompt that instructs the agent to call the analyze_youtube_video tool.
	    prompt = f"""
	You need to analyze a YouTube video to answer a specific question.

	YOUTUBE VIDEO URL: {youtube_url}
	QUESTION: {question_text}

	CRITICAL INSTRUCTIONS:
	1. Use the analyze_youtube_video tool with the provided URL
	2. Extract the answer from the tool's response
	3. Provide ONLY the final numerical answer
	"""

	    print("🤖 Creating fresh agent for direct execution...")
	    fresh_agent = CodeAgent(
	        model=solver.model,
	        tools=GAIA_TOOLS,
	        max_steps=12,
	        verbosity_level=1,  # Lower verbosity for cleaner output
	    )

	    print("🔍 Running direct analysis...")
	    return str(fresh_agent.run(prompt))

	def test_direct_youtube_question():
	    """Run the bird-species YouTube question through the direct and standard paths.

	    The YouTube analysis tool on ``gaia_tools`` is replaced with the mock for
	    the duration of the run and restored afterwards, even on error. Outcomes
	    are printed; this function does not assert or return anything.
	    """
	    # Question fixture containing the YouTube URL.
	    question = {
	        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
	        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
	        'Final Answer': '3',  # Assuming this is the correct answer based on GAIA metadata
	    }
	    expected = str(question['Final Answer']).strip().lower()

	    # Replace the function in the module with our mock.
	    print("🔄 Replacing YouTube analysis tool with mock implementation...")
	    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video

	    try:
	        # Initialize components after patching.
	        solver = GAIASolver()
	        classifier = QuestionClassifier()

	        # Classify the question.
	        print("🧩 Classifying question...")
	        classification = classifier.classify_question(question['Question'])
	        print(f"📋 Classification: {classification['primary_agent']}")
	        print(f"🔧 Tools needed: {classification.get('tools_needed', [])}")

	        # Always bound, so the direct validation below cannot hit an
	        # undefined name when no URL is extracted.
	        direct_result = "No YouTube URL found in question"

	        youtube_url = extract_youtube_url(question['Question'])
	        if youtube_url:
	            # Remove any trailing comma picked up from the sentence.
	            youtube_url = youtube_url.rstrip(',')
	            print(f"🔗 Extracted YouTube URL: {youtube_url}")

	            # Use a direct approach to force tool execution.
	            print("\n🧠 Processing question with direct YouTube analyzer execution...")
	            try:
	                direct_result = direct_force_tools_execution(
	                    solver,
	                    youtube_url,
	                    "What is the highest number of bird species to be on camera simultaneously?",
	                )
	                print(f"\n🔍 Direct result: {direct_result}")
	            except Exception as e:
	                # Best-effort: report and continue with the standard path.
	                print(f"\n⚠️ Direct test error: {e}")
	                direct_result = "Error in direct execution"
	        else:
	            print("\n⚠️ No YouTube URL found in question; skipping direct execution")

	        # Also try the normal processing path.
	        print("\n🧠 Processing question with standard solver...")
	        try:
	            result = solver.solve_question(question)
	            print(f"\n✅ Standard result: {result}")
	        except Exception as e:
	            print(f"\n⚠️ Standard test error: {e}")
	            result = "Error in standard execution"

	        # Validate the standard result by exact (case-insensitive) match.
	        actual = str(result).strip().lower()
	        validation_status = "✓ correct" if expected == actual else "✗ incorrect"
	        print(f"🔎 Validation: {validation_status}")

	        # The direct result is free text, so only check that it contains the
	        # expected answer (taken from the fixture rather than hard-coded).
	        if expected in str(direct_result).lower():
	            print("🔎 Direct validation: ✓ correct")
	        else:
	            print("🔎 Direct validation: ✗ incorrect")

	    finally:
	        # Restore original function.
	        print("🔄 Restoring original YouTube analysis tool...")
	        gaia_tools.analyze_youtube_video = original_analyze_youtube_video

	async def test_async_youtube_question():
	    """Run the bird-species YouTube question through the async test system.

	    The YouTube analysis tool on ``gaia_tools`` is replaced with the mock, and
	    the system's ``load_gaia_questions`` is replaced with a loader that yields
	    the single fixture question. Both are restored afterwards, even on error.
	    Outcomes are printed; nothing is asserted or returned.
	    """
	    # Replace the function in the module with our mock.
	    print("🔄 Replacing YouTube analysis tool with mock implementation in async test...")
	    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video

	    try:
	        # Create async test system.
	        system = HFAsyncGAIATestSystem(
	            max_concurrent=1,
	            timeout_seconds=60,
	            output_dir="/tmp/async_youtube_test",
	        )

	        # Single-question fixture.
	        questions = [
	            {
	                'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
	                'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
	                'Final Answer': '3',
	            }
	        ]

	        # Accept and ignore whatever arguments the system passes, positional
	        # or keyword, so the replacement works for any call shape.
	        async def mock_load_questions(*args, **kwargs):
	            return questions

	        # Save the original loader and replace it.
	        original_load_method = system.load_gaia_questions
	        system.load_gaia_questions = mock_load_questions

	        try:
	            # Make extra sure the mock is in place during the test.
	            gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
	            print("📹 Mock is active for async test - will analyze YouTube video")

	            # Run the test.
	            print("🚀 Running async test with YouTube question...")
	            result = await system.run_comprehensive_test(question_limit=1)

	            # Print results.
	            print("\n📊 Async Test Results:")
	            print(f"Total questions processed: {result['total_questions']}")
	            print(f"Status counts: {result['status_counts']}")

	            # Check the answer recorded for the fixture question.
	            question_id = questions[0]['task_id']
	            if question_id in result['results']:
	                question_result = result['results'][question_id]
	                answer = question_result.get('answer', 'No answer')
	                validation = question_result.get('validation_status', 'unknown')
	                print(f"\nQuestion ID: {question_id}")
	                print(f"Answer: {answer}")
	                print(f"Validation: {validation}")
	            else:
	                print(f"No results found for question ID {question_id}")
	        finally:
	            # Restore the original loader even if the run raised.
	            system.load_gaia_questions = original_load_method

	    finally:
	        # Restore original function.
	        print("🔄 Restoring original YouTube analysis tool...")
	        gaia_tools.analyze_youtube_video = original_analyze_youtube_video

	async def main():
	    """Run the direct test first, then the async test, announcing each step."""
	    # Synchronous path: called directly, not awaited.
	    print("🚀 Starting direct YouTube question test...")
	    test_direct_youtube_question()

	    # Asynchronous path: awaited on the running event loop.
	    print("\n\n🚀 Starting async YouTube question test...")
	    await test_async_youtube_question()

	    print("\n✅ All tests completed!")

	if __name__ == "__main__":
	    # Script entry point: run both tests on a fresh event loop.
	    asyncio.run(main())