""" | |
GAIA-Style Test Questions for End-to-End Validation | |
Based on actual GAIA evaluation scenarios and question patterns. | |
This module contains test questions that mirror the complexity and style | |
of questions used in the GAIA evaluation, organized by category and difficulty. | |
""" | |
import pytest | |
import sys | |
import os | |
import tempfile | |
import json | |
from pathlib import Path | |
from typing import Dict, List, Any, Optional | |
# Add the deployment-ready directory to the path | |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) | |
from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent | |
class GAIAStyleTestQuestions:
    """Collection of GAIA-style test questions for comprehensive evaluation.

    Each question entry is a plain dict with the keys:
        id              -- unique identifier, e.g. 'math_001'
        question        -- the prompt sent to the agent
        expected_answer -- expected answer string, or ``None`` for open-ended
                           research questions with no single fixed answer
        category        -- fine-grained topic label used for filtering
        tools_required  -- tools the agent is expected to exercise
        difficulty      -- 'easy' | 'medium' | 'hard'
    File-based entries additionally carry 'file_content' with the text that
    should be written to a temporary attachment file.
    """

    def __init__(self):
        """Create the agent under test and build the per-theme question lists."""
        self.agent = FixedGAIAAgent()

        # Mathematical and computational questions
        self.mathematical_questions = [
            {
                'id': 'math_001',
                'question': 'What is 25 * 17?',
                'expected_answer': '425',
                'category': 'basic_math',
                'tools_required': ['calculator'],
                'difficulty': 'easy'
            },
            {
                'id': 'math_002',
                'question': 'Calculate the factorial of 7',
                'expected_answer': '5040',
                'category': 'advanced_math',
                'tools_required': ['python'],
                'difficulty': 'medium'
            },
            {
                'id': 'math_003',
                'question': 'What is the square root of 144?',
                'expected_answer': '12',
                'category': 'basic_math',
                'tools_required': ['calculator'],
                'difficulty': 'easy'
            },
            {
                'id': 'math_004',
                'question': 'Calculate 2^10',
                'expected_answer': '1024',
                'category': 'basic_math',
                'tools_required': ['calculator'],
                'difficulty': 'easy'
            }
        ]

        # Knowledge and research questions
        self.knowledge_questions = [
            {
                'id': 'know_001',
                'question': 'What is the capital of France?',
                'expected_answer': 'Paris',
                'category': 'geography',
                'tools_required': ['wikipedia'],
                'difficulty': 'easy'
            },
            {
                'id': 'know_002',
                'question': 'In what year was the Eiffel Tower completed?',
                'expected_answer': '1889',
                'category': 'history',
                'tools_required': ['wikipedia'],
                'difficulty': 'medium'
            },
            {
                'id': 'know_003',
                'question': 'How many studio albums were published by Mercedes Sosa between 2000 and 2009?',
                'expected_answer': None,  # Open-ended: requires live research
                'category': 'music_research',
                'tools_required': ['wikipedia', 'web_search'],
                'difficulty': 'hard'
            },
            {
                'id': 'know_004',
                'question': 'What is the highest number of bird species to be on camera simultaneously?',
                'expected_answer': None,  # Open-ended: requires live research
                'category': 'nature_research',
                'tools_required': ['web_search'],
                'difficulty': 'hard'
            }
        ]

        # File-based questions with attachments
        self.file_based_questions = [
            {
                'id': 'file_001',
                'question': 'What is the final numeric output from the attached Python code?',
                'expected_answer': '425',
                'category': 'code_execution',
                'tools_required': ['python', 'file'],
                'difficulty': 'medium',
                'file_content': self._create_python_code_file()
            },
            {
                'id': 'file_002',
                'question': 'What is the sum of all values in the "amount" column of the attached CSV file?',
                'expected_answer': '150',
                'category': 'data_analysis',
                'tools_required': ['python', 'file'],
                'difficulty': 'medium',
                'file_content': self._create_csv_data_file()
            },
            {
                'id': 'file_003',
                'question': 'What is the value of the "result" field in the attached JSON file?',
                'expected_answer': '256',
                'category': 'data_extraction',
                'tools_required': ['file'],
                'difficulty': 'easy',
                'file_content': self._create_json_data_file()
            }
        ]

        # Multimodal questions (images, audio, documents)
        self.multimodal_questions = [
            {
                'id': 'multi_001',
                'question': 'How many objects are visible in this image?',
                'expected_answer': '3',
                'category': 'image_analysis',
                'tools_required': ['multimodal'],
                'difficulty': 'medium',
                'file_content': self._create_image_description_file()
            },
            {
                'id': 'multi_002',
                'question': 'What is the main topic discussed in this document?',
                'expected_answer': 'artificial intelligence',
                'category': 'document_analysis',
                'tools_required': ['multimodal', 'file'],
                'difficulty': 'medium',
                'file_content': self._create_document_file()
            }
        ]

        # Complex multi-step questions
        self.complex_questions = [
            {
                'id': 'complex_001',
                'question': 'Calculate the square root of 144, then find information about that number in mathematics',
                'expected_answer': None,  # Open-ended composite answer
                'category': 'multi_step',
                'tools_required': ['calculator', 'wikipedia'],
                'difficulty': 'hard'
            },
            {
                'id': 'complex_002',
                'question': 'What is 25 * 17, and in what year was the Eiffel Tower completed?',
                'expected_answer': '425 and 1889',
                'category': 'multi_step',
                'tools_required': ['calculator', 'wikipedia'],
                'difficulty': 'hard'
            }
        ]

        # Chess and game-related questions
        self.chess_questions = [
            {
                'id': 'chess_001',
                'question': 'In chess, what is the minimum number of moves required for checkmate?',
                'expected_answer': '2',
                'category': 'games',
                'tools_required': ['wikipedia'],
                'difficulty': 'medium'
            },
            {
                'id': 'chess_002',
                'question': 'How many squares are on a standard chess board?',
                'expected_answer': '64',
                'category': 'games',
                'tools_required': ['calculator'],
                'difficulty': 'easy'
            }
        ]

        # Edge cases and error handling
        self.edge_case_questions = [
            {
                'id': 'edge_001',
                'question': '',
                'expected_answer': 'unknown',
                'category': 'edge_case',
                'tools_required': [],
                'difficulty': 'easy'
            },
            {
                'id': 'edge_002',
                'question': 'What is the square root of -1?',
                'expected_answer': None,  # Agent should handle gracefully
                'category': 'edge_case',
                'tools_required': ['calculator'],
                'difficulty': 'medium'
            },
            {
                'id': 'edge_003',
                'question': 'Calculate the factorial of -5',
                'expected_answer': None,  # Agent should handle gracefully
                'category': 'edge_case',
                'tools_required': ['python'],
                'difficulty': 'medium'
            }
        ]

    def get_all_questions(self) -> List[Dict[str, Any]]:
        """Return every test question from all themes as a single new list."""
        return [
            *self.mathematical_questions,
            *self.knowledge_questions,
            *self.file_based_questions,
            *self.multimodal_questions,
            *self.complex_questions,
            *self.chess_questions,
            *self.edge_case_questions,
        ]

    def get_questions_by_category(self, category: str) -> List[Dict[str, Any]]:
        """Return only the questions whose 'category' equals *category*."""
        return [entry for entry in self.get_all_questions()
                if entry['category'] == category]

    def get_questions_by_difficulty(self, difficulty: str) -> List[Dict[str, Any]]:
        """Return only the questions whose 'difficulty' equals *difficulty*."""
        return [entry for entry in self.get_all_questions()
                if entry['difficulty'] == difficulty]

    def get_questions_by_tools(self, tools: List[str]) -> List[Dict[str, Any]]:
        """Return questions requiring at least one of the given tools."""
        wanted = set(tools)
        # isdisjoint is False exactly when the question shares a tool with *tools*.
        return [entry for entry in self.get_all_questions()
                if not wanted.isdisjoint(entry['tools_required'])]

    def _create_python_code_file(self) -> str:
        """Return the source text of a small Python program (prints 425)."""
        source = """#!/usr/bin/env python3
# Test Python code for GAIA evaluation
def main():
    # Calculate 25 * 17
    result = 25 * 17
    print(f"The calculation result is: {result}")
    return result
if __name__ == "__main__":
    answer = main()
    print(f"Final answer: {answer}")
"""
        return source

    def _create_csv_data_file(self) -> str:
        """Return CSV text whose 'amount' column sums to 150."""
        rows = """name,amount,category
item1,25,A
item2,50,B
item3,75,A
"""
        return rows

    def _create_json_data_file(self) -> str:
        """Return a JSON document whose 'result' field is 256."""
        payload = {
            "calculation": "16^2",
            "result": 256,
            "metadata": {
                "timestamp": "2024-01-01T00:00:00Z",
                "version": "1.0"
            }
        }
        return json.dumps(payload, indent=2)

    def _create_image_description_file(self) -> str:
        """Return a textual image description listing exactly 3 objects."""
        scene = """Image Description:
This image contains 3 distinct objects:
1. A red car in the foreground
2. A blue house in the background
3. A green tree on the right side
The image is taken during daytime with clear visibility.
Total objects visible: 3
"""
        return scene

    def _create_document_file(self) -> str:
        """Return a short research-paper text about artificial intelligence."""
        paper = """Research Paper: Artificial Intelligence in Modern Computing
Abstract:
This paper discusses the role of artificial intelligence in modern computing systems.
We explore machine learning algorithms, neural networks, and their applications
in various industries.
Introduction:
Artificial intelligence (AI) has become a cornerstone of modern technology.
From autonomous vehicles to recommendation systems, AI is transforming
how we interact with technology.
Main Topics:
1. Machine Learning Fundamentals
2. Deep Learning and Neural Networks
3. Natural Language Processing
4. Computer Vision Applications
Conclusion:
The future of computing is closely tied to advances in artificial intelligence.
As AI continues to evolve, we can expect even more innovative applications
across all sectors of technology.
"""
        return paper
class TestGAIAStyleQuestions: | |
"""Test suite for GAIA-style questions.""" | |
def setup_method(self): | |
"""Set up test fixtures.""" | |
self.gaia_questions = GAIAStyleTestQuestions() | |
self.agent = self.gaia_questions.agent | |
# Test metrics | |
self.test_results = { | |
'total_questions': 0, | |
'correct_answers': 0, | |
'failed_questions': [], | |
'category_performance': {}, | |
'difficulty_performance': {} | |
} | |
def test_mathematical_questions(self): | |
"""Test mathematical questions.""" | |
questions = self.gaia_questions.mathematical_questions | |
self._run_question_category(questions, 'mathematical') | |
def test_knowledge_questions(self): | |
"""Test knowledge questions.""" | |
questions = self.gaia_questions.knowledge_questions | |
self._run_question_category(questions, 'knowledge') | |
def test_file_based_questions(self): | |
"""Test file-based questions.""" | |
questions = self.gaia_questions.file_based_questions | |
self._run_question_category_with_files(questions, 'file_based') | |
def test_multimodal_questions(self): | |
"""Test multimodal questions.""" | |
questions = self.gaia_questions.multimodal_questions | |
self._run_question_category_with_files(questions, 'multimodal') | |
def test_complex_questions(self): | |
"""Test complex multi-step questions.""" | |
questions = self.gaia_questions.complex_questions | |
self._run_question_category(questions, 'complex') | |
def test_chess_questions(self): | |
"""Test chess and game-related questions.""" | |
questions = self.gaia_questions.chess_questions | |
self._run_question_category(questions, 'chess') | |
def test_edge_case_questions(self): | |
"""Test edge cases and error handling.""" | |
questions = self.gaia_questions.edge_case_questions | |
self._run_question_category(questions, 'edge_cases') | |
def test_overall_performance(self): | |
"""Test overall system performance across all question types.""" | |
all_questions = self.gaia_questions.get_all_questions() | |
# Run a subset of questions for performance testing | |
test_questions = all_questions[:10] # Test first 10 questions | |
for question_data in test_questions: | |
self._test_single_question(question_data) | |
# Calculate performance metrics | |
if self.test_results['total_questions'] > 0: | |
accuracy = self.test_results['correct_answers'] / self.test_results['total_questions'] | |
print(f"\nπ Overall Performance Metrics:") | |
print(f"Total Questions: {self.test_results['total_questions']}") | |
print(f"Correct Answers: {self.test_results['correct_answers']}") | |
print(f"Accuracy: {accuracy:.2%}") | |
# Assert minimum accuracy requirement | |
assert accuracy >= 0.7, f"Accuracy {accuracy:.2%} below minimum threshold of 70%" | |
print("β Overall performance test passed!") | |
def _run_question_category(self, questions: List[Dict[str, Any]], category_name: str): | |
"""Run tests for a category of questions.""" | |
if not self.agent.available: | |
pytest.skip(f"Agent not available for {category_name} questions") | |
category_correct = 0 | |
category_total = 0 | |
for question_data in questions: | |
result = self._test_single_question(question_data) | |
category_total += 1 | |
if result: | |
category_correct += 1 | |
# Store category performance | |
if category_total > 0: | |
category_accuracy = category_correct / category_total | |
self.test_results['category_performance'][category_name] = { | |
'correct': category_correct, | |
'total': category_total, | |
'accuracy': category_accuracy | |
} | |
print(f"π {category_name.title()} Questions: {category_correct}/{category_total} ({category_accuracy:.2%})") | |
def _run_question_category_with_files(self, questions: List[Dict[str, Any]], category_name: str): | |
"""Run tests for a category of questions that require files.""" | |
if not self.agent.available: | |
pytest.skip(f"Agent not available for {category_name} questions") | |
category_correct = 0 | |
category_total = 0 | |
for question_data in questions: | |
# Create temporary file with content | |
if 'file_content' in question_data: | |
temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') | |
temp_file.write(question_data['file_content']) | |
temp_file.close() | |
try: | |
result = self._test_single_question_with_file(question_data, temp_file.name) | |
category_total += 1 | |
if result: | |
category_correct += 1 | |
finally: | |
# Clean up temporary file | |
try: | |
os.unlink(temp_file.name) | |
except OSError: | |
pass | |
else: | |
result = self._test_single_question(question_data) | |
category_total += 1 | |
if result: | |
category_correct += 1 | |
# Store category performance | |
if category_total > 0: | |
category_accuracy = category_correct / category_total | |
self.test_results['category_performance'][category_name] = { | |
'correct': category_correct, | |
'total': category_total, | |
'accuracy': category_accuracy | |
} | |
print(f"π {category_name.title()} Questions: {category_correct}/{category_total} ({category_accuracy:.2%})") | |
def _test_single_question(self, question_data: Dict[str, Any]) -> bool: | |
"""Test a single question and return success status.""" | |
question_id = question_data['id'] | |
question = question_data['question'] | |
expected = question_data.get('expected_answer') | |
self.test_results['total_questions'] += 1 | |
try: | |
# Get answer from agent | |
answer = self.agent(question) | |
# Validate answer | |
if expected is not None: | |
success = self._validate_answer(answer, expected, question_data.get('category', '')) | |
else: | |
# For questions without expected answers, just check that we got a reasonable response | |
success = answer is not None and answer != "unknown" and len(answer.strip()) > 0 | |
if success: | |
self.test_results['correct_answers'] += 1 | |
print(f"β {question_id}: {question} β {answer}") | |
return True | |
else: | |
self.test_results['failed_questions'].append({ | |
'id': question_id, | |
'question': question, | |
'expected': expected, | |
'actual': answer | |
}) | |
print(f"β {question_id}: {question} β Expected: {expected}, Got: {answer}") | |
return False | |
except Exception as e: | |
self.test_results['failed_questions'].append({ | |
'id': question_id, | |
'question': question, | |
'expected': expected, | |
'error': str(e) | |
}) | |
print(f"π₯ {question_id}: {question} β Error: {e}") | |
return False | |
def _test_single_question_with_file(self, question_data: Dict[str, Any], file_path: str) -> bool: | |
"""Test a single question with a file attachment.""" | |
question_id = question_data['id'] | |
question = question_data['question'] | |
expected = question_data.get('expected_answer') | |
self.test_results['total_questions'] += 1 | |
try: | |
# Get answer from agent with file | |
answer = self.agent(question, [file_path]) | |
# Validate answer | |
if expected is not None: | |
success = self._validate_answer(answer, expected, question_data.get('category', '')) | |
else: | |
# For questions without expected answers, just check that we got a reasonable response | |
success = answer is not None and answer != "unknown" and len(answer.strip()) > 0 | |
if success: | |
self.test_results['correct_answers'] += 1 | |
print(f"β {question_id}: {question} (with file) β {answer}") | |
return True | |
else: | |
self.test_results['failed_questions'].append({ | |
'id': question_id, | |
'question': question, | |
'expected': expected, | |
'actual': answer, | |
'file': file_path | |
}) | |
print(f"β {question_id}: {question} (with file) β Expected: {expected}, Got: {answer}") | |
return False | |
except Exception as e: | |
self.test_results['failed_questions'].append({ | |
'id': question_id, | |
'question': question, | |
'expected': expected, | |
'error': str(e), | |
'file': file_path | |
}) | |
print(f"π₯ {question_id}: {question} (with file) β Error: {e}") | |
return False | |
def _validate_answer(self, actual: str, expected: str, category: str) -> bool: | |
"""Validate an answer against expected result.""" | |
if not actual or actual == "unknown": | |
return False | |
# Clean up answers for comparison | |
actual_clean = actual.strip().lower() | |
expected_clean = expected.strip().lower() | |
# Exact match | |
if actual_clean == expected_clean: | |
return True | |
# For numeric answers, try numeric comparison | |
if category in ['basic_math', 'advanced_math', 'data_analysis', 'code_execution']: | |
try: | |
actual_num = float(actual.replace(',', '')) | |
expected_num = float(expected.replace(',', '')) | |
return abs(actual_num - expected_num) < 0.01 | |
except ValueError: | |
pass | |
# For text answers, allow partial matches | |
if category in ['geography', 'history', 'document_analysis']: | |
return expected_clean in actual_clean or actual_clean in expected_clean | |
return False | |
if __name__ == "__main__":
    # Allow running this module directly (outside a pytest invocation):
    # execute the GAIA-style question tests in this file with verbose
    # output (-v) and short tracebacks (--tb=short).
    pytest.main([__file__, "-v", "--tb=short"])