Final_Assignment_Template / test_local.py
Denis Davydov
enhanced web search
a5c9e62
#!/usr/bin/env python3
"""
Test script for validating agent performance on GAIA questions.
When run directly, it tests the complete pipeline on a predefined question without submitting;
a test that fetches one random question is also available.
"""
import time
from utils import fetch_random_question, format_gaia_answer
from agent import smart_agent
def test_predefined_gaia_question(*, agent=None) -> bool:
    """Run the agent on a fixed 1928 Olympics question and check the answer format.

    The question asks for an IOC country code, so the answer is expected to be
    a very short string. The IOC-pattern check and the web-search check only
    print warnings; the pass/fail verdict depends on the answer being
    non-empty and at most 5 characters long after stripping whitespace.

    Args:
        agent: Optional callable taking ``(question, task_id)`` and returning
            an ``(answer, reasoning_trace)`` pair. Defaults to the
            module-level ``smart_agent``.

    Returns:
        True if the agent produced a non-empty answer of at most 5 characters,
        False otherwise (including when the agent raised an exception).
    """
    # `re` is not among the file-level imports, so it is imported locally.
    import re

    print("πŸ§ͺ Testing predefined GAIA question (1928 Olympics)")
    print("=" * 60)

    # Predefined question that requires web search
    question = (
        "What country had the least number of athletes at the 1928 Summer Olympics? "
        "If there's a tie for a number of athletes, return the first in alphabetical order. "
        "Give the IOC country code as your answer."
    )
    task_id = "predefined_test"

    print(f"❓ Question: {question}")
    print()

    # Run the agent
    print("πŸ€– Running smart agent on the predefined question...")
    try:
        # Resolved inside the try so a missing/broken agent is reported
        # through the same error path as a failing agent call.
        run_agent = smart_agent if agent is None else agent
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted during the run.
        start_time = time.perf_counter()
        answer, reasoning_trace = run_agent(question, task_id)
        processing_time = time.perf_counter() - start_time
        print(f"βœ… Agent completed in {processing_time:.2f} seconds")
        print()
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    # A missing (None) trace is treated as empty instead of crashing len().
    trace_text = reasoning_trace or ""

    # Display results
    print("πŸ“Š AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"πŸ“ Reasoning Length: {len(trace_text)} characters")
    print(f"⏱️ Processing Time: {processing_time:.2f}s")
    print()

    # Show reasoning trace preview
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    reasoning_preview = trace_text[:400] + "..." if len(trace_text) > 400 else trace_text
    print(reasoning_preview)
    print()

    # Validate answer format for GAIA
    print("βœ… GAIA FORMAT VALIDATION")
    print("-" * 40)

    # Check if answer is not empty
    stripped_answer = answer.strip() if answer else ""
    if stripped_answer:
        print("βœ… Answer is not empty")
    else:
        print("❌ Answer is empty or None")
        return False

    # Check if answer looks like IOC country code (2-3 uppercase letters)
    if re.match(r'^[A-Z]{2,3}$', stripped_answer):
        print(f"βœ… Answer '{answer}' matches IOC country code format")
    else:
        print(f"⚠️ Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

    # Check if web search was used; "search" also matches "web_search".
    if "search" in trace_text.lower():
        print("βœ… Agent appears to have used web search")
    else:
        print("⚠️ No clear evidence of web search usage")

    # Check answer length (should be short for country code)
    is_short = len(stripped_answer) <= 5
    if is_short:
        print("βœ… Answer length is appropriate for country code")
    else:
        print("⚠️ Answer seems too long for a country code")
    print()

    # Final validation (the answer is already known to be non-empty here)
    print("🏁 FINAL VALIDATION")
    print("-" * 40)
    if is_short:
        print("βœ… PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
        return True

    print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
    return False
def test_random_gaia_question(*, fetch_question=None, agent=None) -> bool:
    """Fetch one question, run the agent on it and validate the result.

    The function prints a step-by-step report: the fetched question, the
    agent's answer and timing, a preview of the reasoning trace, answer
    sanity checks and a preview of the submission entry. Nothing is
    submitted.

    Args:
        fetch_question: Optional zero-argument callable returning a mapping
            with ``"task_id"`` and ``"question"`` keys. Defaults to the
            module-level ``fetch_random_question``.
        agent: Optional callable taking ``(question, task_id)`` and returning
            an ``(answer, reasoning_trace)`` pair. Defaults to the
            module-level ``smart_agent``.

    Returns:
        True if a question was fetched, the agent produced a non-empty answer
        and both required submission fields (``task_id``, ``model_answer``)
        are non-empty; False otherwise, including when fetching or running
        the agent raised an exception.
    """
    print("πŸ”§ GAIA Random Question Test")
    print("=" * 60)

    # Step 1: Fetch a random question
    print("πŸ“‘ Fetching random question from GAIA API...")
    try:
        # Resolved inside the try so an unavailable fetcher is reported
        # through the same error path as a failing fetch.
        fetch = fetch_random_question if fetch_question is None else fetch_question
        question_data = fetch()
        if not question_data:
            print("❌ Failed to fetch random question")
            return False

        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
        if not question_text:
            print("❌ No question text in response")
            return False

        print("βœ… Successfully fetched question")
        print(f"πŸ“‹ Task ID: {task_id}")
        print(f"❓ Question: {question_text}")
        print()
    except Exception as e:
        print(f"❌ Error fetching question: {e}")
        return False

    # Step 2: Run the agent
    print("πŸ€– Running smart agent on the question...")
    try:
        run_agent = smart_agent if agent is None else agent
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted during the run.
        start_time = time.perf_counter()
        answer, reasoning_trace = run_agent(question_text, task_id)
        processing_time = time.perf_counter() - start_time
        print(f"βœ… Agent completed in {processing_time:.2f} seconds")
        print()
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    # The trace is optional (see the submission preview below), so a missing
    # (None) trace is treated as empty instead of crashing len().
    trace_text = reasoning_trace or ""

    # Step 3: Display results
    print("πŸ“Š AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"πŸ“ Reasoning Length: {len(trace_text)} characters")
    print(f"⏱️ Processing Time: {processing_time:.2f}s")
    print()

    # Step 4: Show reasoning trace preview
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    reasoning_preview = trace_text[:300] + "..." if len(trace_text) > 300 else trace_text
    print(reasoning_preview)
    print()

    # Step 5: Validate answer format
    print("βœ… ANSWER VALIDATION")
    print("-" * 40)

    # Check if answer is not empty
    if answer and answer.strip():
        print("βœ… Answer is not empty")
    else:
        print("❌ Answer is empty or None")
        return False

    # Check if answer contains error messages (warning only)
    upper_answer = answer.upper()
    if "ERROR" in upper_answer or "FAILED" in upper_answer:
        print("⚠️ Answer contains error message")
    else:
        print("βœ… Answer appears to be valid (no error messages)")

    # Check answer length (reasonable bounds, warning only)
    if len(answer) > 1000:
        print("⚠️ Answer is very long (>1000 chars) - might need review")
    else:
        print("βœ… Answer length is reasonable")
    print()

    # Step 6: Show submission format
    print("πŸ“‘ SUBMISSION FORMAT PREVIEW")
    print("-" * 40)
    submission_entry = {
        "task_id": task_id,
        "model_answer": answer,
        "reasoning_trace": reasoning_trace,
    }

    # Validate required fields
    required_fields = ["task_id", "model_answer"]
    all_valid = True
    for field in required_fields:
        value = submission_entry.get(field)
        if value:
            # Convert once: the API may deliver a non-string task_id, which
            # cannot be sliced directly.
            text = str(value)
            suffix = "..." if len(text) > 50 else ""
            print(f"βœ… {field}: '{text[:50]}{suffix}'")
        else:
            print(f"❌ Missing or empty {field}")
            all_valid = False

    # Check optional fields
    if submission_entry["reasoning_trace"]:
        print(f"βœ… reasoning_trace: Present ({len(submission_entry['reasoning_trace'])} chars)")
    else:
        print("ℹ️ reasoning_trace: Not present (optional)")
    print()

    # Step 7: Final validation (the answer is already known to be non-empty)
    print("🏁 FINAL VALIDATION")
    print("-" * 40)
    if all_valid:
        print("βœ… ALL CHECKS PASSED - Agent is ready for submission!")
        print("πŸš€ You can now run the full evaluation with confidence.")
        return True

    print("❌ SOME CHECKS FAILED - Please review the issues above.")
    return False
if __name__ == "__main__":
    # Script entry point: run only the predefined-question test and print a
    # short summary framed by divider lines.
    divider = "=" * 60

    print("πŸ§ͺ Testing agent with predefined GAIA question...")
    print("This test validates web search functionality and answer formatting.")
    print()

    # Test the predefined 1928 Olympics question
    success = test_predefined_gaia_question()

    print("\n" + divider)
    if not success:
        print("⚠️ Predefined test revealed issues that need to be addressed.")
    else:
        print("πŸŽ‰ Predefined test completed successfully! Agent produces well-defined answers.")
        print("πŸ’‘ You can also run test_random_gaia_question() for additional testing.")
    print(divider)