"""
Assessment analyzer for the GAIA agent.

This module provides functionality to analyze assessment results,
categorize questions, calculate success rates, identify error patterns,
and generate recommendations for further improvements.
"""

import logging
from typing import Dict, Any, List, Optional
from collections import Counter, defaultdict

logger = logging.getLogger("gaia_agent.assessment_analyzer")

class AnalysisResult:
    """Class to store assessment analysis results."""
    
    def __init__(
        self,
        category: str,
        success_rate: float,
        successful_questions: Dict[str, Dict[str, Any]],
        failed_questions: Dict[str, Dict[str, Any]],
        error_patterns: Dict[str, int],
        recommendations: List[str]
    ):
        """
        Initialize the analysis result.
        
        Args:
            category: The category of questions analyzed
            success_rate: The success rate (0.0 to 1.0)
            successful_questions: Dictionary of successfully answered questions
            failed_questions: Dictionary of failed questions
            error_patterns: Dictionary of error patterns and their frequencies
            recommendations: List of recommendations for improvement
        """
        self.category = category
        self.success_rate = success_rate
        self.successful_questions = successful_questions
        self.failed_questions = failed_questions
        self.error_patterns = error_patterns
        self.recommendations = recommendations
    
    def __str__(self) -> str:
        """Return a string representation of the analysis result."""
        return (
            f"Analysis Result for {self.category} questions:\n"
            f"Success Rate: {self.success_rate:.2%}\n"
            f"Successful Questions: {len(self.successful_questions)}\n"
            f"Failed Questions: {len(self.failed_questions)}\n"
            f"Top Error Patterns: {dict(sorted(self.error_patterns.items(), key=lambda x: x[1], reverse=True)[:3])}\n"
            f"Recommendations: {self.recommendations}"
        )

def categorize_question(question_id: str, question_data: Dict[str, Any]) -> str:
    """
    Categorize a question based on its content and ID.
    
    Args:
        question_id: The ID of the question
        question_data: The question data
        
    Returns:
        The category of the question
    """
    question_text = question_data.get("question", "").lower()
    
    # Web search questions
    web_search_keywords = [
        "mercedes sosa", "wikipedia", "featured article", "malko competition",
        "yankees", "olympics", "universe today", "polish tv actor"
    ]
    
    # Multimodal questions
    multimodal_keywords = [
        "video", "youtube", "watch?v=", "audio", "listen", "chess position", "image"
    ]
    
    # File processing questions
    file_processing_keywords = [
        "excel", "spreadsheet", "python code", "execute", "run this code"
    ]
    
    # Academic paper questions
    academic_paper_keywords = [
        "arxiv", "paper", "scientific", "research", "publication", "journal"
    ]
    
    # Mathematical reasoning questions
    math_keywords = [
        "math", "calculate", "commutativity", "table", "operation", "subset"
    ]
    
    # Check for web search questions
    for keyword in web_search_keywords:
        if keyword in question_text:
            return "web_search"
    
    # Check for multimodal questions
    for keyword in multimodal_keywords:
        if keyword in question_text:
            return "multimodal"
    
    # Check for file processing questions
    for keyword in file_processing_keywords:
        if keyword in question_text:
            return "file_processing"
    
    # Check for academic paper questions
    for keyword in academic_paper_keywords:
        if keyword in question_text:
            return "academic_paper"
    
    # Check for mathematical reasoning questions
    for keyword in math_keywords:
        if keyword in question_text:
            return "mathematical_reasoning"
    
    # Default to "other" if no category matches
    return "other"

def identify_error_patterns(failed_questions: Dict[str, Dict[str, Any]]) -> Dict[str, int]:
    """
    Identify common error patterns in failed questions.
    
    Args:
        failed_questions: Dictionary of failed questions
        
    Returns:
        Dictionary of error patterns and their frequencies
    """
    error_patterns = Counter()
    
    for question_id, question_data in failed_questions.items():
        error = question_data.get("error", "")
        answer = question_data.get("answer", "")
        expected_answer = question_data.get("expected_answer", "")
        
        # Check for specific error types
        if error:
            # Check for timeout-related errors
            if any(term in error.lower() for term in ["timeout", "timed out", "time limit"]):
                error_patterns["timeout"] += 1
            # Check for API-related errors
            elif any(term in error.lower() for term in ["api", "rate limit", "quota"]):
                error_patterns["api_error"] += 1
            # Check for unhashable type errors
            elif "unhashable" in error.lower():
                error_patterns["unhashable_type"] += 1
            else:
                error_patterns["other_error"] += 1
        # Check for incorrect answers
        elif answer and expected_answer and answer != expected_answer:
            # Check for partial matches
            if any(word in answer.lower() for word in expected_answer.lower().split()):
                error_patterns["partial_match"] += 1
            else:
                error_patterns["completely_wrong_answer"] += 1
        else:
            error_patterns["unknown_failure"] += 1
    
    return error_patterns
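# Illustrative return value of identify_error_patterns (counts are hypothetical):
#     Counter({"timeout": 3, "partial_match": 2, "completely_wrong_answer": 1})
# Each failed question contributes to exactly one bucket: explicit errors are
# classified first, then incorrect answers, and anything else is "unknown_failure".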

def generate_recommendations(
    category: str,
    success_rate: float,
    error_patterns: Dict[str, int],
    failed_questions: Dict[str, Dict[str, Any]]
) -> List[str]:
    """
    Generate recommendations for improvement based on analysis.
    
    Args:
        category: The category of questions
        success_rate: The success rate
        error_patterns: Dictionary of error patterns and their frequencies
        failed_questions: Dictionary of failed questions
        
    Returns:
        List of recommendations
    """
    recommendations = []
    
    # General recommendations based on success rate
    if success_rate < 0.3:
        recommendations.append(f"Major improvements needed for {category} questions")
    elif success_rate < 0.7:
        recommendations.append(f"Moderate improvements needed for {category} questions")
    elif success_rate < 1.0:
        recommendations.append(f"Minor improvements needed for {category} questions")
    else:
        recommendations.append(f"All {category} questions are answered correctly")
    
    # Specific recommendations based on error patterns
    if error_patterns.get("timeout", 0) > 0:
        recommendations.append("Optimize search performance to prevent timeouts")
    
    if error_patterns.get("api_error", 0) > 0:
        recommendations.append("Improve error handling for API failures")
    
    if error_patterns.get("unhashable_type", 0) > 0:
        recommendations.append("Fix unhashable type errors in the agent implementation")
    
    if error_patterns.get("partial_match", 0) > 0:
        recommendations.append("Improve answer extraction precision")
    
    if error_patterns.get("completely_wrong_answer", 0) > 0:
        recommendations.append("Enhance search result relevance filtering")
    
    # Category-specific recommendations
    if category == "web_search":
        if success_rate < 1.0:
            recommendations.append("Implement specialized search for specific domains (Wikipedia, news, etc.)")
            recommendations.append("Improve result ranking based on query relevance")
            recommendations.append("Add better error handling for failed searches")
    
    elif category == "multimodal":
        if success_rate < 1.0:
            recommendations.append("Enhance video content extraction capabilities")
            recommendations.append("Improve audio transcription accuracy")
            recommendations.append("Add specialized image analysis for specific domains")
    
    elif category == "file_processing":
        if success_rate < 1.0:
            recommendations.append("Improve Excel file parsing and data extraction")
            recommendations.append("Enhance code execution sandbox for better isolation")
    
    elif category == "academic_paper":
        if success_rate < 1.0:
            recommendations.append("Improve arXiv paper content extraction")
            recommendations.append("Add support for more academic paper sources")
    
    elif category == "mathematical_reasoning":
        if success_rate < 1.0:
            recommendations.append("Enhance mathematical operation analysis")
            recommendations.append("Improve structured data parsing for tables and matrices")
    
    return recommendations
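# Illustrative call to generate_recommendations (hypothetical inputs):
#     generate_recommendations("web_search", 0.6, Counter({"timeout": 2}), failed)
# would combine the generic "Moderate improvements needed" message, the timeout-specific
# suggestion, and the three web_search category recommendations (success rate < 1.0).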

def analyze_assessment_results(
    assessment_results: Dict[str, Dict[str, Any]],
    category: Optional[str] = None
) -> AnalysisResult:
    """
    Analyze assessment results and generate insights.
    
    Args:
        assessment_results: Dictionary of assessment results
        category: Optional category to filter questions by
        
    Returns:
        AnalysisResult object with analysis results
    """
    # Categorize questions
    categorized_questions = defaultdict(dict)
    
    for question_id, question_data in assessment_results.items():
        question_category = categorize_question(question_id, question_data)
        categorized_questions[question_category][question_id] = question_data
    
    # If a specific category is requested, filter questions
    if category:
        questions_to_analyze = categorized_questions.get(category, {})
        category_name = category
    else:
        questions_to_analyze = assessment_results
        category_name = "all"
    
    # Separate successful and failed questions
    successful_questions = {}
    failed_questions = {}
    
    for question_id, question_data in questions_to_analyze.items():
        if question_data.get("correct", False):
            successful_questions[question_id] = question_data
        else:
            failed_questions[question_id] = question_data
    
    # Calculate success rate
    total_questions = len(questions_to_analyze)
    successful_count = len(successful_questions)
    
    success_rate = successful_count / total_questions if total_questions > 0 else 0.0
    
    # Identify error patterns
    error_patterns = identify_error_patterns(failed_questions)
    
    # Generate recommendations
    recommendations = generate_recommendations(
        category_name,
        success_rate,
        error_patterns,
        failed_questions
    )
    
    # Create and return analysis result
    return AnalysisResult(
        category=category_name,
        success_rate=success_rate,
        successful_questions=successful_questions,
        failed_questions=failed_questions,
        error_patterns=error_patterns,
        recommendations=recommendations
    )
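
if __name__ == "__main__":
    # Minimal usage sketch with hypothetical assessment data. The keys used below
    # ("question", "correct", "answer", "expected_answer", "error") are the ones the
    # functions above read; the question texts and IDs are illustrative only.
    logging.basicConfig(level=logging.INFO)

    sample_results = {
        "task_001": {
            "question": "How many studio albums did Mercedes Sosa release between 2000 and 2009?",
            "correct": True,
            "answer": "3",
            "expected_answer": "3",
        },
        "task_002": {
            "question": "What is the best move in the attached chess position image?",
            "correct": False,
            "answer": "",
            "expected_answer": "Rd5",
            "error": "Request timed out after 120 seconds",
        },
    }

    # Analyze everything, then drill into a single category.
    print(analyze_assessment_results(sample_results))
    print(analyze_assessment_results(sample_results, category="web_search"))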