#!/usr/bin/env python3
"""
Test script for validating agent performance on GAIA questions.

Runs the agent either on a predefined question (the default when executed as
a script) or on one random question fetched from the GAIA API, and checks the
complete pipeline without submitting anything.
"""

import re
import sys
import time

from utils import fetch_random_question, format_gaia_answer
from agent import smart_agent

# Horizontal rules used to frame the console report.
_BANNER_RULE = "=" * 60
_SECTION_RULE = "-" * 40

# Shape accepted by the predefined test for an IOC country code.
_IOC_CODE_RE = re.compile(r"[A-Z]{2,3}")


def _as_text(value):
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def _run_agent(question, task_id):
    """Run ``smart_agent`` on *question* and time the call.

    Returns:
        ``(answer, reasoning_trace, seconds)`` on success, where
        ``reasoning_trace`` is always a string, or ``None`` if the agent
        raised (the error is printed, not propagated).
    """
    try:
        # perf_counter is monotonic, so the interval cannot go negative if
        # the system clock is adjusted while the agent is running.
        started = time.perf_counter()
        answer, reasoning_trace = smart_agent(question, task_id)
        processing_time = time.perf_counter() - started
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and fail the test.
        print(f"❌ Error running agent: {e}")
        return None

    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()
    return answer, _as_text(reasoning_trace), processing_time


def _print_results(answer, reasoning_trace, processing_time, preview_chars):
    """Print the answer summary and the first *preview_chars* of the trace."""
    print("📊 AGENT RESULTS")
    print(_SECTION_RULE)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(reasoning_trace)} characters")
    print(f"⏱️ Processing Time: {processing_time:.2f}s")
    print()

    print("🧠 REASONING TRACE PREVIEW")
    print(_SECTION_RULE)
    if len(reasoning_trace) > preview_chars:
        print(reasoning_trace[:preview_chars] + "...")
    else:
        print(reasoning_trace)
    print()


def test_predefined_gaia_question():
    """Test the agent with a predefined GAIA question.

    Verifies that the agent returns a non-empty, short answer and reports
    (as warnings only) whether it looks like an IOC country code and whether
    the reasoning trace mentions a search.

    Returns:
        ``True`` if the answer is non-empty and at most 5 characters long
        after stripping, ``False`` otherwise or if the agent raised.
    """
    print("🧪 Testing predefined GAIA question (1928 Olympics)")
    print(_BANNER_RULE)

    # Predefined question that requires web search
    question = (
        "What country had the least number of athletes at the 1928 Summer "
        "Olympics? If there's a tie for a number of athletes, return the "
        "first in alphabetical order. Give the IOC country code as your "
        "answer."
    )
    task_id = "predefined_test"

    print(f"❓ Question: {question}")
    print()

    print("🤖 Running smart agent on the predefined question...")
    outcome = _run_agent(question, task_id)
    if outcome is None:
        return False
    answer, reasoning_trace, processing_time = outcome

    _print_results(answer, reasoning_trace, processing_time, preview_chars=400)

    print("✅ GAIA FORMAT VALIDATION")
    print(_SECTION_RULE)

    # All checks below work on the stripped text so that a None or
    # non-string answer is reported instead of raising.
    answer_text = _as_text(answer).strip()
    if not answer_text:
        print("❌ Answer is empty or None")
        return False
    print("✅ Answer is not empty")

    # Warning only: the format check does not affect the test result.
    if _IOC_CODE_RE.fullmatch(answer_text):
        print(f"✅ Answer '{answer}' matches IOC country code format")
    else:
        print(
            f"⚠️ Answer '{answer}' may not be in correct IOC format "
            "(should be 2-3 uppercase letters)"
        )

    # Warning only: "search" also covers the tool name "web_search".
    if "search" in reasoning_trace.lower():
        print("✅ Agent appears to have used web search")
    else:
        print("⚠️ No clear evidence of web search usage")

    is_short = len(answer_text) <= 5
    if is_short:
        print("✅ Answer length is appropriate for country code")
    else:
        print("⚠️ Answer seems too long for a country code")

    print()

    print("🏁 FINAL VALIDATION")
    print(_SECTION_RULE)

    # Non-emptiness was already enforced above, so length decides the result.
    if is_short:
        print("✅ PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
        return True

    print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
    return False


def test_random_gaia_question():
    """Test the agent with a random GAIA question and validate the pipeline.

    Fetches one question with ``fetch_random_question``, runs the agent on
    it, prints the results and a preview of the submission entry, and checks
    that the required submission fields are populated. Nothing is submitted.

    Returns:
        ``True`` if a question was fetched, the agent produced a non-empty
        answer and all required submission fields are non-empty; ``False``
        otherwise.
    """
    print("🔧 GAIA Random Question Test")
    print(_BANNER_RULE)

    # Step 1: Fetch a random question
    print("📡 Fetching random question from GAIA API...")
    try:
        question_data = fetch_random_question()
        if not question_data:
            print("❌ Failed to fetch random question")
            return False

        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
        if not question_text:
            print("❌ No question text in response")
            return False

        print("✅ Successfully fetched question")
        print(f"📋 Task ID: {task_id}")
        print(f"❓ Question: {question_text}")
        print()
    except Exception as e:
        # Covers both transport errors and an unexpected response shape.
        print(f"❌ Error fetching question: {e}")
        return False

    # Step 2: Run the agent
    print("🤖 Running smart agent on the question...")
    outcome = _run_agent(question_text, task_id)
    if outcome is None:
        return False
    answer, reasoning_trace, processing_time = outcome

    # Steps 3 and 4: Display results and a reasoning trace preview
    _print_results(answer, reasoning_trace, processing_time, preview_chars=300)

    # Step 5: Validate answer format
    print("✅ ANSWER VALIDATION")
    print(_SECTION_RULE)

    answer_text = _as_text(answer)
    if not answer_text.strip():
        print("❌ Answer is empty or None")
        return False
    print("✅ Answer is not empty")

    # Warning only: an error-looking answer does not fail the test.
    shouted = answer_text.upper()
    if "ERROR" in shouted or "FAILED" in shouted:
        print("⚠️ Answer contains error message")
    else:
        print("✅ Answer appears to be valid (no error messages)")

    if len(answer_text) > 1000:
        print("⚠️ Answer is very long (>1000 chars) - might need review")
    else:
        print("✅ Answer length is reasonable")

    print()

    # Step 6: Show submission format
    print("📡 SUBMISSION FORMAT PREVIEW")
    print(_SECTION_RULE)

    submission_entry = {
        "task_id": task_id,
        "model_answer": answer,
        "reasoning_trace": reasoning_trace,
    }

    # Validate required fields
    all_valid = True
    for field in ("task_id", "model_answer"):
        # Convert once so slicing and length agree even for non-string
        # values such as a numeric task id.
        text = _as_text(submission_entry[field])
        if text:
            ellipsis = "..." if len(text) > 50 else ""
            print(f"✅ {field}: '{text[:50]}{ellipsis}'")
        else:
            print(f"❌ Missing or empty {field}")
            all_valid = False

    # Check optional fields
    if submission_entry["reasoning_trace"]:
        trace_length = len(submission_entry["reasoning_trace"])
        print(f"✅ reasoning_trace: Present ({trace_length} chars)")
    else:
        print("ℹ️ reasoning_trace: Not present (optional)")

    print()

    # Step 7: Final validation
    print("🏁 FINAL VALIDATION")
    print(_SECTION_RULE)

    # The answer itself was already verified as non-empty in step 5.
    if all_valid:
        print("✅ ALL CHECKS PASSED - Agent is ready for submission!")
        print("🚀 You can now run the full evaluation with confidence.")
        return True

    print("❌ SOME CHECKS FAILED - Please review the issues above.")
    return False


def main():
    """Run the predefined-question test and print a summary.

    Returns:
        ``True`` if the predefined test passed, ``False`` otherwise.
    """
    print("🧪 Testing agent with predefined GAIA question...")
    print("This test validates web search functionality and answer formatting.")
    print()

    # Test the predefined 1928 Olympics question
    success = test_predefined_gaia_question()

    print("\n" + _BANNER_RULE)
    if success:
        print("🎉 Predefined test completed successfully! Agent produces well-defined answers.")
        print("💡 You can also run test_random_gaia_question() for additional testing.")
    else:
        print("⚠️ Predefined test revealed issues that need to be addressed.")
    print(_BANNER_RULE)
    return success


if __name__ == "__main__":
    # Non-zero exit status lets shells and CI detect a failed run.
    sys.exit(0 if main() else 1)