File size: 4,495 Bytes
d227e0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
"""
GAIA Sample Tasks for Testing the AI Agent
This file contains sample tasks from the GAIA benchmark categories
to test the agent's capabilities across different skills.
"""
# Sample GAIA-style tasks used to exercise the agent.
#
# Each row below is (category, difficulty, task prompt) and is expanded into a
# dict with the keys "category", "difficulty" and "task". Row order matters:
# other helpers in this module pick entries by position.
GAIA_SAMPLE_TASKS = [
    {"category": category, "difficulty": difficulty, "task": task}
    for category, difficulty, task in (
        # Reasoning tasks
        ("reasoning", "easy", "If a train travels at 60 miles per hour, how far will it travel in 2.5 hours?"),
        ("reasoning", "medium", "A store is having a 30% off sale. If an item originally costs $85, what is the sale price? Additionally, if there's a 8% sales tax, what is the final price?"),
        ("reasoning", "hard", "In a class of 30 students, 40% are boys. If 3 more girls join the class, what percentage of the class will be boys?"),
        # Web search and information retrieval tasks
        ("web_search", "easy", "What is the capital of Japan and what is its population?"),
        ("web_search", "medium", "Who won the Nobel Prize in Physics in 2023? What was their contribution?"),
        ("web_search", "hard", "Compare and contrast the climate policies of the United States and the European Union. What are the key differences in their approaches to reducing carbon emissions?"),
        # Multimodal understanding tasks (would require image input in a real scenario)
        ("multimodal", "easy", "Analyze this image URL and describe what you see: https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa%2C_by_Leonardo_da_Vinci%2C_from_C2RMF_retouched.jpg/800px-Mona_Lisa%2C_by_Leonardo_da_Vinci%2C_from_C2RMF_retouched.jpg"),
        ("multimodal", "medium", "Look at this chart image and explain the trend: https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Global-surface-temperature.svg/1200px-Global-surface-temperature.svg.png"),
        # Tool usage tasks
        ("tool_usage", "easy", "Write a Python function to calculate the factorial of a number, then use it to find the factorial of 5."),
        ("tool_usage", "medium", "Create a Python script that fetches the current weather for New York City using a weather API and displays the temperature, humidity, and weather conditions."),
        ("tool_usage", "hard", "Write a Python script that analyzes a text file containing a list of numbers (one per line), calculates the mean, median, mode, and standard deviation, and creates a histogram visualization of the data."),
        # Combined skills tasks
        ("combined", "medium", "Research the top 3 electric vehicle manufacturers by market share. Create a Python script to visualize their market shares in a pie chart."),
        ("combined", "hard", "Find information about global coffee production by country for the last year. Write a Python script to create a bar chart showing the top 5 coffee-producing countries and their production volumes."),
    )
]
def get_tasks_by_category(category):
    """Return the sample tasks whose ``"category"`` equals *category*.

    Args:
        category: Value compared with ``==`` against each task's
            ``"category"`` entry.

    Returns:
        A new list holding the matching task dicts from
        ``GAIA_SAMPLE_TASKS`` in their original order; empty when no task
        matches.
    """
    def in_category(entry):
        return entry["category"] == category

    return list(filter(in_category, GAIA_SAMPLE_TASKS))
def get_tasks_by_difficulty(difficulty):
    """Return the sample tasks whose ``"difficulty"`` equals *difficulty*.

    Args:
        difficulty: Value compared with ``==`` against each task's
            ``"difficulty"`` entry.

    Returns:
        A new list holding the matching task dicts from
        ``GAIA_SAMPLE_TASKS`` in their original order; empty when no task
        matches.
    """
    selected = []
    for entry in GAIA_SAMPLE_TASKS:
        if entry["difficulty"] == difficulty:
            selected.append(entry)
    return selected
def get_all_task_queries():
    """Return the prompt text of every sample task.

    Returns:
        A new list with the ``"task"`` value of each entry in
        ``GAIA_SAMPLE_TASKS``, in the same order as the entries.
    """
    prompts = []
    for entry in GAIA_SAMPLE_TASKS:
        prompts.append(entry["task"])
    return prompts
def get_quick_test_tasks():
    """Return a small set of task prompts for a quick smoke test.

    One task is picked from each of the five categories: the easy task for
    ``reasoning``, ``web_search`` and ``multimodal``, and the medium task for
    ``tool_usage`` and ``combined``. No hard task is included.

    Tasks are selected by their ``(category, difficulty)`` pair rather than by
    list position, so adding or reordering entries in ``GAIA_SAMPLE_TASKS``
    cannot silently change which prompts are returned.

    Returns:
        A list with the ``"task"`` prompt of each selected entry, in the
        category order listed above.

    Raises:
        LookupError: If ``GAIA_SAMPLE_TASKS`` has no entry for one of the
            selected ``(category, difficulty)`` pairs.
    """
    wanted = (
        ("reasoning", "easy"),
        ("web_search", "easy"),
        ("multimodal", "easy"),
        ("tool_usage", "medium"),
        ("combined", "medium"),
    )

    # Remember only the first task seen for each (category, difficulty) pair,
    # so duplicates later in the list never displace the chosen entry.
    first_match = {}
    for entry in GAIA_SAMPLE_TASKS:
        first_match.setdefault((entry["category"], entry["difficulty"]), entry)

    missing = [pair for pair in wanted if pair not in first_match]
    if missing:
        raise LookupError(
            "GAIA_SAMPLE_TASKS has no task for: {}".format(missing)
        )

    return [first_match[pair]["task"] for pair in wanted]
|