|
|
|
""" |
|
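Test script for validating agent performance on GAIA questions.

Provides two checks that run the complete agent pipeline without submitting
anything: one on a predefined 1928 Olympics question (the default when this
file is run as a script) and one on a random question fetched from the GAIA API.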
|
""" |
|
|
|
import time |
|
from utils import fetch_random_question, format_gaia_answer |
|
from agent import smart_agent |
|
|
|
def test_predefined_gaia_question():
    """Run the agent on a fixed 1928 Olympics question and check the answer format.

    Calls ``smart_agent(question, task_id)``, whose result is unpacked as an
    ``(answer, reasoning_trace)`` pair, prints a human-readable report to
    stdout, and validates that the answer looks like a short country code.

    Returns:
        bool: ``True`` when the agent ran without raising and produced a
        non-empty answer of at most five characters (after stripping
        whitespace); ``False`` otherwise. The IOC-pattern and web-search
        checks only print warnings and never fail the test.
    """
    # Function-local import (as in the original) so the file's import block
    # does not have to change.
    import re

    print("🧪 Testing predefined GAIA question (1928 Olympics)")
    print("=" * 60)

    question = (
        "What country had the least number of athletes at the 1928 Summer "
        "Olympics? If there's a tie for a number of athletes, return the "
        "first in alphabetical order. Give the IOC country code as your answer."
    )
    task_id = "predefined_test"

    print(f"❓ Question: {question}")
    print()

    print("🤖 Running smart agent on the predefined question...")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    try:
        answer, reasoning_trace = smart_agent(question, task_id)
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and fail the test
        # rather than crash with a traceback.
        print(f"❌ Error running agent: {e}")
        return False
    processing_time = time.perf_counter() - start_time
    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()

    # A missing trace must not crash the len()/slice/lower() calls below.
    reasoning_trace = reasoning_trace or ""

    print("📊 AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(reasoning_trace)} characters")
    print(f"⏱️ Processing Time: {processing_time:.2f}s")
    print()

    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(reasoning_trace) > 400:
        reasoning_preview = reasoning_trace[:400] + "..."
    else:
        reasoning_preview = reasoning_trace
    print(reasoning_preview)
    print()

    print("✅ GAIA FORMAT VALIDATION")
    print("-" * 40)

    # Check 1: the answer must be non-empty; everything below relies on it.
    if not answer or not answer.strip():
        print("❌ Answer is empty or None")
        return False
    print("✅ Answer is not empty")
    stripped_answer = answer.strip()

    # Check 2 (warning only): 2-3 uppercase letters, as an IOC code would be.
    if re.fullmatch(r"[A-Z]{2,3}", stripped_answer):
        print(f"✅ Answer '{answer}' matches IOC country code format")
    else:
        print(f"⚠️ Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

    # Check 3 (warning only): "search" also covers the "web_search" spelling.
    if "search" in reasoning_trace.lower():
        print("✅ Agent appears to have used web search")
    else:
        print("⚠️ No clear evidence of web search usage")

    # Check 4: this length bound is the actual pass/fail criterion.
    answer_is_short = len(stripped_answer) <= 5
    if answer_is_short:
        print("✅ Answer length is appropriate for country code")
    else:
        print("⚠️ Answer seems too long for a country code")

    print()

    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if answer_is_short:
        print("✅ PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
        return True

    print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
    return False
|
|
|
def test_random_gaia_question():
    """Run the agent on a randomly fetched GAIA question and validate the result.

    Fetches a question via ``fetch_random_question()`` (read with ``.get`` for
    the ``"task_id"`` and ``"question"`` keys), runs
    ``smart_agent(question_text, task_id)``, whose result is unpacked as an
    ``(answer, reasoning_trace)`` pair, and prints a report to stdout that
    includes a preview of the submission entry. Nothing is submitted.

    Returns:
        bool: ``True`` when the question was fetched, the agent ran without
        raising, the answer is non-empty, and both required submission fields
        (``task_id`` and ``model_answer``) are non-empty; ``False`` otherwise.
        The error-keyword and answer-length checks only print warnings.
    """
    print("🔧 GAIA Random Question Test")
    print("=" * 60)

    print("📡 Fetching random question from GAIA API...")
    try:
        question_data = fetch_random_question()
        if not question_data:
            print("❌ Failed to fetch random question")
            return False

        # Kept inside the try so an unexpected payload type is reported as a
        # fetch error instead of escaping as a traceback.
        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
    except Exception as e:
        print(f"❌ Error fetching question: {e}")
        return False

    if not question_text:
        print("❌ No question text in response")
        return False

    print("✅ Successfully fetched question")
    print(f"📝 Task ID: {task_id}")
    print(f"❓ Question: {question_text}")
    print()

    print("🤖 Running smart agent on the question...")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    try:
        answer, reasoning_trace = smart_agent(question_text, task_id)
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False
    processing_time = time.perf_counter() - start_time
    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()

    # The trace is treated as optional further down, so a missing one must
    # not crash the len()/slice calls here either.
    reasoning_trace = reasoning_trace or ""

    print("📊 AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(reasoning_trace)} characters")
    print(f"⏱️ Processing Time: {processing_time:.2f}s")
    print()

    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(reasoning_trace) > 300:
        reasoning_preview = reasoning_trace[:300] + "..."
    else:
        reasoning_preview = reasoning_trace
    print(reasoning_preview)
    print()

    print("✅ ANSWER VALIDATION")
    print("-" * 40)

    # Check 1: the answer must be non-empty; everything below relies on it.
    if not answer or not answer.strip():
        print("❌ Answer is empty or None")
        return False
    print("✅ Answer is not empty")

    # Check 2 (warning only): the answer text itself mentions an error.
    upper_answer = answer.upper()
    if "ERROR" in upper_answer or "FAILED" in upper_answer:
        print("⚠️ Answer contains error message")
    else:
        print("✅ Answer appears to be valid (no error messages)")

    # Check 3 (warning only): suspiciously long answers.
    if len(answer) > 1000:
        print("⚠️ Answer is very long (>1000 chars) - might need review")
    else:
        print("✅ Answer length is reasonable")

    print()

    print("📡 SUBMISSION FORMAT PREVIEW")
    print("-" * 40)

    submission_entry = {
        "task_id": task_id,
        "model_answer": answer,
        "reasoning_trace": reasoning_trace,
    }

    required_fields = ("task_id", "model_answer")
    all_valid = True

    for field in required_fields:
        value = submission_entry[field]
        if value:
            # Convert once so slicing and the length test agree even when the
            # field is not a string (e.g. a numeric task id).
            text = str(value)
            suffix = "..." if len(text) > 50 else ""
            print(f"✅ {field}: '{text[:50]}{suffix}'")
        else:
            print(f"❌ Missing or empty {field}")
            all_valid = False

    if reasoning_trace:
        print(f"✅ reasoning_trace: Present ({len(reasoning_trace)} chars)")
    else:
        print("ℹ️ reasoning_trace: Not present (optional)")

    print()

    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    # The answer was already verified non-empty above, so only the required
    # field check remains.
    if all_valid:
        print("✅ ALL CHECKS PASSED - Agent is ready for submission!")
        print("🚀 You can now run the full evaluation with confidence.")
        return True

    print("❌ SOME CHECKS FAILED - Please review the issues above.")
    return False
|
|
|
# Script entry point: runs only the predefined-question test and prints a
# summary banner. The random-question test is left for manual invocation.
if __name__ == "__main__":
    print("🧪 Testing agent with predefined GAIA question...")
    print("This test validates web search functionality and answer formatting.")
    print()

    success = test_predefined_gaia_question()

    print("\n" + "=" * 60)
    if success:
        print("🎉 Predefined test completed successfully! Agent produces well-defined answers.")
        print("💡 You can also run test_random_gaia_question() for additional testing.")
    else:
        print("⚠️ Predefined test revealed issues that need to be addressed.")
    print("=" * 60)
|
|