#!/usr/bin/env python3
"""
Test script for validating agent performance on GAIA questions.
Run as a script, it tests a predefined question (1928 Olympics); test_random_gaia_question()
fetches one random question and tests the complete pipeline without submitting.
"""

import time
from utils import fetch_random_question, format_gaia_answer
from agent import smart_agent

def test_predefined_gaia_question():
    """Run the agent on a fixed GAIA-style question and check the answer format.

    The question (1928 Summer Olympics, fewest athletes) is meant to require a
    web search and to yield an IOC country code. Progress and the outcome of
    every check are printed to stdout.

    Returns:
        bool: True when the answer, stripped of whitespace, is non-empty and
        at most 5 characters long. False when the agent raises or the answer
        is empty/None or longer than that. The IOC-format and web-search
        checks only print warnings and do not affect the result.
    """
    import re  # needed only for the IOC-format check below

    print("🧪 Testing predefined GAIA question (1928 Olympics)")
    print("="*60)

    # Predefined question that requires web search
    question = "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer."
    task_id = "predefined_test"

    print(f"❓ Question: {question}")
    print()

    # Run the agent; any failure inside it is reported, not propagated.
    print("🤖 Running smart agent on the predefined question...")
    try:
        # perf_counter is monotonic, so the duration cannot be skewed by
        # system clock adjustments during a long agent run.
        start_time = time.perf_counter()
        answer, reasoning_trace = smart_agent(question, task_id)
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()

    # The agent's return values are outside this script's control: tolerate a
    # missing trace or a non-string answer instead of crashing on
    # len()/.strip()/.lower() further down.
    trace_text = "" if reasoning_trace is None else str(reasoning_trace)
    answer_text = "" if answer is None else str(answer)
    candidate = answer_text.strip()

    # Display results
    print("📊 AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(trace_text)} characters")
    print(f"⏱️  Processing Time: {processing_time:.2f}s")
    print()

    # Show reasoning trace preview (first 400 characters)
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(trace_text) > 400:
        print(trace_text[:400] + "...")
    else:
        print(trace_text)
    print()

    # Validate answer format for GAIA
    print("✅ GAIA FORMAT VALIDATION")
    print("-" * 40)

    # An empty answer is a hard failure; nothing else is worth checking.
    if not candidate:
        print("❌ Answer is empty or None")
        return False
    print("✅ Answer is not empty")

    # Check if answer looks like IOC country code (2-3 uppercase letters).
    # Advisory only: a mismatch prints a warning but does not fail the test.
    if re.fullmatch(r'[A-Z]{2,3}', candidate):
        print(f"✅ Answer '{answer}' matches IOC country code format")
    else:
        print(f"⚠️  Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

    # Heuristic: any mention of "search" (which also covers "web_search") in
    # the trace is taken as evidence that the search tool was used.
    if "search" in trace_text.lower():
        print("✅ Agent appears to have used web search")
    else:
        print("⚠️  No clear evidence of web search usage")

    # Check answer length (should be short for country code)
    length_ok = len(candidate) <= 5
    if length_ok:
        print("✅ Answer length is appropriate for country code")
    else:
        print("⚠️  Answer seems too long for a country code")

    print()

    # Final validation: the answer is known to be non-empty here, so the
    # verdict depends on the length bound alone.
    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if length_ok:
        print("✅ PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
        return True

    print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
    return False

def test_random_gaia_question():
    """Fetch one random GAIA question, run the agent on it, and validate the result.

    Nothing is submitted: the function only builds and inspects the entry
    that would be sent. Progress and the outcome of every check are printed
    to stdout.

    Returns:
        bool: True when a question was fetched, the agent ran without raising,
        the answer is non-empty, and both required submission fields
        ("task_id", "model_answer") are non-empty. False otherwise. The
        error-keyword and answer-length checks only print warnings.
    """
    print("🔧 GAIA Random Question Test")
    print("="*60)

    # Step 1: Fetch a random question
    print("📡 Fetching random question from GAIA API...")
    try:
        question_data = fetch_random_question()
        if not question_data:
            print("❌ Failed to fetch random question")
            return False

        # Kept inside the try so an unexpected response type (no .get) is
        # reported as a fetch error rather than crashing the script.
        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
    except Exception as e:
        print(f"❌ Error fetching question: {e}")
        return False

    if not question_text:
        print("❌ No question text in response")
        return False

    print("✅ Successfully fetched question")
    print(f"📋 Task ID: {task_id}")
    print(f"❓ Question: {question_text}")
    print()

    # Step 2: Run the agent; any failure inside it is reported, not propagated.
    print("🤖 Running smart agent on the question...")
    try:
        # perf_counter is monotonic, so the duration cannot be skewed by
        # system clock adjustments during a long agent run.
        start_time = time.perf_counter()
        answer, reasoning_trace = smart_agent(question_text, task_id)
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()

    # The agent's return values are outside this script's control: tolerate a
    # missing trace or a non-string answer instead of crashing on
    # len()/.strip()/.upper() further down.
    trace_text = "" if reasoning_trace is None else str(reasoning_trace)
    answer_text = "" if answer is None else str(answer)

    # Step 3: Display results
    print("📊 AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(trace_text)} characters")
    print(f"⏱️  Processing Time: {processing_time:.2f}s")
    print()

    # Step 4: Show reasoning trace preview (first 300 characters)
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(trace_text) > 300:
        print(trace_text[:300] + "...")
    else:
        print(trace_text)
    print()

    # Step 5: Validate answer format
    print("✅ ANSWER VALIDATION")
    print("-" * 40)

    # An empty answer is a hard failure; nothing else is worth checking.
    if not answer_text.strip():
        print("❌ Answer is empty or None")
        return False
    print("✅ Answer is not empty")

    # Check if answer contains error messages (advisory only)
    answer_upper = answer_text.upper()
    if "ERROR" in answer_upper or "FAILED" in answer_upper:
        print("⚠️  Answer contains error message")
    else:
        print("✅ Answer appears to be valid (no error messages)")

    # Check answer length (reasonable bounds, advisory only)
    if len(answer_text) > 1000:
        print("⚠️  Answer is very long (>1000 chars) - might need review")
    else:
        print("✅ Answer length is reasonable")

    print()

    # Step 6: Show submission format
    print("📡 SUBMISSION FORMAT PREVIEW")
    print("-" * 40)

    submission_entry = {
        "task_id": task_id,
        "model_answer": answer_text,
        "reasoning_trace": trace_text
    }

    # Validate required fields
    all_valid = True
    for field in ("task_id", "model_answer"):
        value = submission_entry[field]
        if value:
            # task_id comes straight from the API response and may not be a
            # string, so convert before slicing for the 50-character preview.
            text = str(value)
            ellipsis = "..." if len(text) > 50 else ""
            print(f"✅ {field}: '{text[:50]}{ellipsis}'")
        else:
            print(f"❌ Missing or empty {field}")
            all_valid = False

    # Check optional fields
    if trace_text:
        print(f"✅ reasoning_trace: Present ({len(trace_text)} chars)")
    else:
        print("ℹ️  reasoning_trace: Not present (optional)")

    print()

    # Step 7: Final validation. The answer is known to be non-empty here, so
    # the verdict depends on the required-field check alone.
    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if all_valid:
        print("✅ ALL CHECKS PASSED - Agent is ready for submission!")
        print("🚀 You can now run the full evaluation with confidence.")
        return True

    print("❌ SOME CHECKS FAILED - Please review the issues above.")
    return False

if __name__ == "__main__":
    # Script entry point: runs only the predefined-question test and prints a
    # summary banner. test_random_gaia_question() is not invoked here.
    print("🧪 Testing agent with predefined GAIA question...")
    print("This test validates web search functionality and answer formatting.")
    print()

    # Test the predefined 1928 Olympics question
    success = test_predefined_gaia_question()

    print("\n" + "="*60)
    if success:
        print("🎉 Predefined test completed successfully! Agent produces well-defined answers.")
        print("💡 You can also run test_random_gaia_question() for additional testing.")
    else:
        print("⚠️  Predefined test revealed issues that need to be addressed.")
    print("="*60)

    # A failed test must yield a non-zero exit status so that shells and CI
    # pipelines can detect it; a successful run exits normally with status 0.
    if not success:
        raise SystemExit(1)