Final_Assignment_Template

Sleeping

Final_Assignment_Template / test_local.py

Denis Davydov

enhanced web search

a5c9e62 about 2 months ago

8.3 kB

	#!/usr/bin/env python3
	"""
	Test script for validating agent performance on a random GAIA question.
	Fetches one random question and tests the complete pipeline without submitting.
	"""

	import time
	from utils import fetch_random_question, format_gaia_answer
	from agent import smart_agent

	def test_predefined_gaia_question():
	    """Run the agent on a fixed 1928 Olympics question and check the answer format.

	    Calls ``smart_agent`` with a hard-coded question and task id, then prints
	    the answer, a preview of the reasoning trace and a series of format checks.

	    Returns:
	        bool: True when the stripped answer is non-empty and at most 5
	        characters long. False when the agent raises, or when the answer is
	        empty or longer than that. The IOC-pattern and web-search checks only
	        print warnings; they do not influence the returned value.
	    """
	    import re  # kept local: the file's import block does not provide ``re``

	    print("🧪 Testing predefined GAIA question (1928 Olympics)")
	    print("="*60)

	    # Predefined question that requires web search
	    question = "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer."
	    task_id = "predefined_test"

	    print(f"❓ Question: {question}")
	    print()

	    # Run the agent
	    print("🤖 Running smart agent on the predefined question...")
	    try:
	        start_time = time.time()
	        answer, reasoning_trace = smart_agent(question, task_id)
	        processing_time = time.time() - start_time
	        print(f"✅ Agent completed in {processing_time:.2f} seconds")
	        print()
	    except Exception as e:
	        print(f"❌ Error running agent: {e}")
	        return False

	    # Normalise once so a None / non-string result cannot crash the checks
	    # below, which all run outside the try block.
	    answer_text = "" if answer is None else str(answer).strip()
	    trace_text = "" if reasoning_trace is None else str(reasoning_trace)

	    # Display results
	    print("📊 AGENT RESULTS")
	    print("-" * 40)
	    print(f"🎯 Formatted Answer: '{answer}'")
	    print(f"📝 Reasoning Length: {len(trace_text)} characters")
	    print(f"⏱️ Processing Time: {processing_time:.2f}s")
	    print()

	    # Show reasoning trace preview (first 400 characters)
	    print("🧠 REASONING TRACE PREVIEW")
	    print("-" * 40)
	    if len(trace_text) > 400:
	        print(trace_text[:400] + "...")
	    else:
	        print(trace_text)
	    print()

	    # Validate answer format for GAIA
	    print("✅ GAIA FORMAT VALIDATION")
	    print("-" * 40)

	    # An empty answer is the only hard failure in this section.
	    if not answer_text:
	        print("❌ Answer is empty or None")
	        return False
	    print("✅ Answer is not empty")

	    # Check if answer looks like IOC country code (2-3 uppercase letters)
	    if re.match(r'^[A-Z]{2,3}$', answer_text):
	        print(f"✅ Answer '{answer}' matches IOC country code format")
	    else:
	        print(f"⚠️ Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

	    # Check if web search was used. "web_search" contains "search", so a
	    # single substring test covers both spellings.
	    if "search" in trace_text.lower():
	        print("✅ Agent appears to have used web search")
	    else:
	        print("⚠️ No clear evidence of web search usage")

	    # Check answer length (should be short for country code)
	    length_ok = len(answer_text) <= 5
	    if length_ok:
	        print("✅ Answer length is appropriate for country code")
	    else:
	        print("⚠️ Answer seems too long for a country code")

	    print()

	    # Final validation: the answer is known to be non-empty at this point,
	    # so the verdict depends on the length check alone.
	    print("🏁 FINAL VALIDATION")
	    print("-" * 40)

	    if length_ok:
	        print("✅ PREDEFINED TEST PASSED - Answer format suitable for GAIA")
	        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
	        return True

	    print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
	    return False

	def test_random_gaia_question():
	    """Fetch one random GAIA question, run the agent on it and check the result.

	    Steps: fetch a question with ``fetch_random_question``, call
	    ``smart_agent`` on it, print the answer and a preview of the reasoning
	    trace, run basic sanity checks on the answer, and preview the entry that
	    would be submitted. Nothing is submitted.

	    Returns:
	        bool: True when the answer is non-empty and both required submission
	        fields (``task_id``, ``model_answer``) are present and non-empty.
	        False when fetching fails, the question text is missing, the agent
	        raises, the answer is empty, or a required field is missing.
	    """
	    print("🔧 GAIA Random Question Test")
	    print("="*60)

	    # Step 1: Fetch a random question
	    print("📡 Fetching random question from GAIA API...")
	    try:
	        question_data = fetch_random_question()
	        if not question_data:
	            print("❌ Failed to fetch random question")
	            return False

	        task_id = question_data.get("task_id", "unknown")
	        question_text = question_data.get("question", "")

	        if not question_text:
	            print("❌ No question text in response")
	            return False

	        print("✅ Successfully fetched question")
	        print(f"📋 Task ID: {task_id}")
	        print(f"❓ Question: {question_text}")
	        print()
	    except Exception as e:
	        print(f"❌ Error fetching question: {e}")
	        return False

	    # Step 2: Run the agent
	    print("🤖 Running smart agent on the question...")
	    try:
	        start_time = time.time()
	        answer, reasoning_trace = smart_agent(question_text, task_id)
	        processing_time = time.time() - start_time
	        print(f"✅ Agent completed in {processing_time:.2f} seconds")
	        print()
	    except Exception as e:
	        print(f"❌ Error running agent: {e}")
	        return False

	    # Normalise once so a None / non-string result cannot crash the checks
	    # below, which all run outside the try blocks. The reasoning trace is
	    # treated as optional further down, so it must not be assumed present.
	    answer_str = "" if answer is None else str(answer)
	    trace_text = "" if reasoning_trace is None else str(reasoning_trace)

	    # Step 3: Display results
	    print("📊 AGENT RESULTS")
	    print("-" * 40)
	    print(f"🎯 Formatted Answer: '{answer}'")
	    print(f"📝 Reasoning Length: {len(trace_text)} characters")
	    print(f"⏱️ Processing Time: {processing_time:.2f}s")
	    print()

	    # Step 4: Show reasoning trace preview (first 300 characters)
	    print("🧠 REASONING TRACE PREVIEW")
	    print("-" * 40)
	    if len(trace_text) > 300:
	        print(trace_text[:300] + "...")
	    else:
	        print(trace_text)
	    print()

	    # Step 5: Validate answer format
	    print("✅ ANSWER VALIDATION")
	    print("-" * 40)

	    # An empty answer is the only hard failure in this section.
	    if not answer_str.strip():
	        print("❌ Answer is empty or None")
	        return False
	    print("✅ Answer is not empty")

	    # Check if answer contains error messages
	    answer_upper = answer_str.upper()
	    if "ERROR" in answer_upper or "FAILED" in answer_upper:
	        print("⚠️ Answer contains error message")
	    else:
	        print("✅ Answer appears to be valid (no error messages)")

	    # Check answer length (reasonable bounds)
	    if len(answer_str) > 1000:
	        print("⚠️ Answer is very long (>1000 chars) - might need review")
	    else:
	        print("✅ Answer length is reasonable")

	    print()

	    # Step 6: Show submission format
	    print("📡 SUBMISSION FORMAT PREVIEW")
	    print("-" * 40)

	    submission_entry = {
	        "task_id": task_id,
	        "model_answer": answer,
	        "reasoning_trace": reasoning_trace,
	    }

	    # Validate required fields
	    all_valid = True
	    for field in ("task_id", "model_answer"):
	        value = submission_entry.get(field)
	        if value:
	            # Convert before slicing: the value (e.g. a numeric task id) is
	            # not guaranteed to be a string.
	            shown = str(value)
	            suffix = "..." if len(shown) > 50 else ""
	            print(f"✅ {field}: '{shown[:50]}{suffix}'")
	        else:
	            print(f"❌ Missing or empty {field}")
	            all_valid = False

	    # Check optional fields
	    if submission_entry.get("reasoning_trace"):
	        print(f"✅ reasoning_trace: Present ({len(trace_text)} chars)")
	    else:
	        print("ℹ️ reasoning_trace: Not present (optional)")

	    print()

	    # Step 7: Final validation. The answer is known to be non-empty here, so
	    # the verdict depends on the required-field check alone.
	    print("🏁 FINAL VALIDATION")
	    print("-" * 40)

	    if all_valid:
	        print("✅ ALL CHECKS PASSED - Agent is ready for submission!")
	        print("🚀 You can now run the full evaluation with confidence.")
	        return True

	    print("❌ SOME CHECKS FAILED - Please review the issues above.")
	    return False

	if __name__ == "__main__":
	print("🧪 Testing agent with predefined GAIA question...")
	print("This test validates web search functionality and answer formatting.")
	print()

	# Test the predefined 1928 Olympics question
	success = test_predefined_gaia_question()

	print("\n" + "="*60)
	if success:
	print("🎉 Predefined test completed successfully! Agent produces well-defined answers.")
	print("💡 You can also run test_random_gaia_question() for additional testing.")
	else:
	print("⚠️ Predefined test revealed issues that need to be addressed.")
	print("="*60)