final / gaia_sample_tasks.py
yoshizen's picture
Upload 4 files
d227e0d verified
"""
GAIA Sample Tasks for Testing the AI Agent
This file contains sample tasks from the GAIA benchmark categories
to test the agent's capabilities across different skills.
"""
# Built-in sample prompts. Each entry is a dict with exactly three string
# keys: "category", "difficulty" and "task" (the prompt text for the agent).
# Entry order is preserved by the helper functions that read this list.
GAIA_SAMPLE_TASKS = [
    # --- reasoning -------------------------------------------------------
    {
        "category": "reasoning",
        "difficulty": "easy",
        "task": "If a train travels at 60 miles per hour, how far will it travel in 2.5 hours?",
    },
    {
        "category": "reasoning",
        "difficulty": "medium",
        "task": "A store is having a 30% off sale. If an item originally costs $85, what is the sale price? Additionally, if there's a 8% sales tax, what is the final price?",
    },
    {
        "category": "reasoning",
        "difficulty": "hard",
        "task": "In a class of 30 students, 40% are boys. If 3 more girls join the class, what percentage of the class will be boys?",
    },
    # --- web search / information retrieval ------------------------------
    {
        "category": "web_search",
        "difficulty": "easy",
        "task": "What is the capital of Japan and what is its population?",
    },
    {
        "category": "web_search",
        "difficulty": "medium",
        "task": "Who won the Nobel Prize in Physics in 2023? What was their contribution?",
    },
    {
        "category": "web_search",
        "difficulty": "hard",
        "task": "Compare and contrast the climate policies of the United States and the European Union. What are the key differences in their approaches to reducing carbon emissions?",
    },
    # --- multimodal (a real run would supply the image itself; here the
    #     prompt only carries an image URL) -------------------------------
    {
        "category": "multimodal",
        "difficulty": "easy",
        "task": "Analyze this image URL and describe what you see: https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa%2C_by_Leonardo_da_Vinci%2C_from_C2RMF_retouched.jpg/800px-Mona_Lisa%2C_by_Leonardo_da_Vinci%2C_from_C2RMF_retouched.jpg",
    },
    {
        "category": "multimodal",
        "difficulty": "medium",
        "task": "Look at this chart image and explain the trend: https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Global-surface-temperature.svg/1200px-Global-surface-temperature.svg.png",
    },
    # --- tool usage ------------------------------------------------------
    {
        "category": "tool_usage",
        "difficulty": "easy",
        "task": "Write a Python function to calculate the factorial of a number, then use it to find the factorial of 5.",
    },
    {
        "category": "tool_usage",
        "difficulty": "medium",
        "task": "Create a Python script that fetches the current weather for New York City using a weather API and displays the temperature, humidity, and weather conditions.",
    },
    {
        "category": "tool_usage",
        "difficulty": "hard",
        "task": "Write a Python script that analyzes a text file containing a list of numbers (one per line), calculates the mean, median, mode, and standard deviation, and creates a histogram visualization of the data.",
    },
    # --- combined skills -------------------------------------------------
    {
        "category": "combined",
        "difficulty": "medium",
        "task": "Research the top 3 electric vehicle manufacturers by market share. Create a Python script to visualize their market shares in a pie chart.",
    },
    {
        "category": "combined",
        "difficulty": "hard",
        "task": "Find information about global coffee production by country for the last year. Write a Python script to create a bar chart showing the top 5 coffee-producing countries and their production volumes.",
    },
]
def get_tasks_by_category(category, tasks=None):
    """Return the tasks whose ``"category"`` value equals *category*.

    Args:
        category: Category label to match exactly (compared with ``==``).
        tasks: Optional iterable of task dicts to filter. When omitted or
            ``None``, the module-level ``GAIA_SAMPLE_TASKS`` list is used.

    Returns:
        A new list holding the matching task dicts (the same dict objects,
        not copies) in their original order; empty when nothing matches.

    Raises:
        KeyError: If a task dict has no ``"category"`` key.
    """
    # ``is None`` rather than truthiness so an explicitly passed empty
    # list is filtered as-is instead of falling back to the built-in data.
    if tasks is None:
        tasks = GAIA_SAMPLE_TASKS
    return [task for task in tasks if task["category"] == category]
def get_tasks_by_difficulty(difficulty, tasks=None):
    """Return the tasks whose ``"difficulty"`` value equals *difficulty*.

    Args:
        difficulty: Difficulty label to match exactly (compared with ``==``).
        tasks: Optional iterable of task dicts to filter. When omitted or
            ``None``, the module-level ``GAIA_SAMPLE_TASKS`` list is used.

    Returns:
        A new list holding the matching task dicts (the same dict objects,
        not copies) in their original order; empty when nothing matches.

    Raises:
        KeyError: If a task dict has no ``"difficulty"`` key.
    """
    # ``is None`` rather than truthiness so an explicitly passed empty
    # list is filtered as-is instead of falling back to the built-in data.
    if tasks is None:
        tasks = GAIA_SAMPLE_TASKS
    return [task for task in tasks if task["difficulty"] == difficulty]
def get_all_task_queries(tasks=None):
    """Return the prompt text (the ``"task"`` value) of every task.

    Args:
        tasks: Optional iterable of task dicts to read. When omitted or
            ``None``, the module-level ``GAIA_SAMPLE_TASKS`` list is used.

    Returns:
        A new list of the ``"task"`` values, in the same order as the input.

    Raises:
        KeyError: If a task dict has no ``"task"`` key.
    """
    # ``is None`` rather than truthiness so an explicitly passed empty
    # list yields an empty result instead of the built-in data.
    if tasks is None:
        tasks = GAIA_SAMPLE_TASKS
    return [task["task"] for task in tasks]
def get_quick_test_tasks(tasks=None):
    """Return a small subset of prompts for quick testing: one per category.

    The subset is chosen by explicit (category, difficulty) pairs instead of
    by list position, so reordering or extending the task list cannot make
    this function silently pick different tasks.

    Args:
        tasks: Optional iterable of task dicts to select from. When omitted
            or ``None``, the module-level ``GAIA_SAMPLE_TASKS`` list is used.

    Returns:
        A list of five prompt strings (``"task"`` values), ordered as:
        reasoning/easy, web_search/easy, multimodal/easy,
        tool_usage/medium, combined/medium.

    Raises:
        LookupError: If no task matches one of the required pairs.
    """
    if tasks is None:
        tasks = GAIA_SAMPLE_TASKS

    # One representative task per category.
    wanted = (
        ("reasoning", "easy"),
        ("web_search", "easy"),
        ("multimodal", "easy"),
        ("tool_usage", "medium"),
        ("combined", "medium"),
    )

    # Index the FIRST task seen for each (category, difficulty) pair.
    first_by_key = {}
    for task in tasks:
        first_by_key.setdefault((task["category"], task["difficulty"]), task)

    queries = []
    for key in wanted:
        if key not in first_by_key:
            # Fail loudly rather than hand back an incomplete or wrong subset.
            raise LookupError(
                "no task with category=%r and difficulty=%r" % key
            )
        queries.append(first_by_key[key]["task"])
    return queries