Ashokdll commited on
Commit
6379cef
·
verified ·
1 Parent(s): 47415c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +805 -775
app.py CHANGED
@@ -1,845 +1,875 @@
1
- import requests
 
 
 
 
 
 
 
 
 
 
2
  import json
3
- import re
4
- import ast
5
- import operator
6
- from typing import Dict, List, Any, Optional
7
  import time
 
 
 
 
 
 
8
 
9
- class GAIAAgent:
10
- def __init__(self, api_base_url: str):
11
- self.api_base_url = api_base_url
12
- self.tools = self._initialize_tools()
13
- self.max_retries = 3
14
- self.timeout = 30
15
-
16
- def _initialize_tools(self):
17
- """Initialize all available tools"""
18
- return {
19
- 'web_search': WebSearchTool(),
20
- 'calculator': CalculatorTool(),
21
- 'file_processor': FileProcessorTool(self.api_base_url),
22
- 'text_analyzer': TextAnalyzerTool()
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- def solve_question(self, question_data: Dict) -> str:
26
- """Main pipeline to solve a GAIA question"""
27
- try:
28
- # Step 1: Analyze the question
29
- analysis = self._analyze_question(question_data)
30
- print(f"Question analysis: {analysis}")
31
-
32
- # Step 2: Create execution plan
33
- plan = self._create_execution_plan(analysis, question_data)
34
- print(f"Execution plan: {[step['action'] for step in plan]}")
35
-
36
- # Step 3: Execute plan
37
- results = self._execute_plan(plan, question_data)
38
- print(f"Execution results keys: {list(results.keys())}")
39
-
40
- # Step 4: Generate final answer
41
- final_answer = self._generate_final_answer(results, question_data)
42
-
43
- # Step 5: Format and validate answer
44
- formatted_answer = self._format_final_answer(final_answer)
45
-
46
- return formatted_answer
 
 
 
47
 
48
- except Exception as e:
49
- print(f"Error solving question: {e}")
50
- return "Unable to determine answer"
51
-
52
- def _analyze_question(self, question_data: Dict) -> Dict:
53
- """Analyze question to determine approach and required tools"""
54
- question = question_data.get('question', '')
55
- has_file = bool(question_data.get('file_name'))
56
-
57
- # Classify question type
58
- question_lower = question.lower()
59
-
60
- analysis = {
61
- 'needs_calculation': any(word in question_lower for word in
62
- ['calculate', 'compute', 'sum', 'total', 'average', 'count', 'multiply', 'divide']),
63
- 'needs_web_search': any(word in question_lower for word in
64
- ['who', 'what', 'when', 'where', 'find', 'search', 'latest', 'current']),
65
- 'needs_file_processing': has_file,
66
- 'is_factual_question': any(word in question_lower for word in
67
- ['who is', 'what is', 'when was', 'where is']),
68
- 'needs_analysis': any(word in question_lower for word in
69
- ['analyze', 'compare', 'determine', 'evaluate']),
70
- 'question_text': question,
71
- 'has_file': has_file,
72
- 'file_name': question_data.get('file_name', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  }
74
-
75
- return analysis
76
-
77
- def _create_execution_plan(self, analysis: Dict, question_data: Dict) -> List[Dict]:
78
- """Create step-by-step execution plan"""
79
- plan = []
80
-
81
- # Priority 1: Process files if they exist
82
- if analysis['needs_file_processing']:
83
- plan.append({
84
- 'action': 'process_file',
85
- 'tool': 'file_processor',
86
- 'priority': 1,
87
- 'params': {
88
- 'task_id': question_data.get('task_id'),
89
- 'file_name': question_data.get('file_name')
90
- }
91
- })
92
-
93
- # Priority 2: Web search for factual information
94
- if analysis['needs_web_search'] or analysis['is_factual_question']:
95
- plan.append({
96
- 'action': 'web_search',
97
- 'tool': 'web_search',
98
- 'priority': 2,
99
- 'params': {
100
- 'query': self._extract_search_query(analysis['question_text'])
101
- }
102
- })
103
-
104
- # Priority 3: Calculations
105
- if analysis['needs_calculation']:
106
- plan.append({
107
- 'action': 'calculate',
108
- 'tool': 'calculator',
109
- 'priority': 3,
110
- 'params': {}
111
- })
112
-
113
- # Priority 4: Text analysis
114
- plan.append({
115
- 'action': 'analyze_text',
116
- 'tool': 'text_analyzer',
117
- 'priority': 4,
118
- 'params': {
119
- 'text': analysis['question_text']
120
- }
121
- })
122
-
123
- return sorted(plan, key=lambda x: x['priority'])
124
-
125
- def _execute_plan(self, plan: List[Dict], question_data: Dict) -> Dict:
126
- """Execute the planned steps"""
127
- results = {}
128
-
129
- for step in plan:
130
- tool_name = step['tool']
131
- action = step['action']
132
 
133
- try:
134
- print(f"Executing: {action}")
 
 
135
 
136
- if action == 'process_file':
137
- results['file_data'] = self.tools[tool_name].process_file(
138
- step['params']['task_id'],
139
- step['params']['file_name']
140
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- elif action == 'web_search':
143
- results['search_data'] = self.tools[tool_name].search(
144
- step['params']['query']
145
- )
146
-
147
- elif action == 'calculate':
148
- # Extract numbers and operations from question and file data
149
- calculation_input = self._prepare_calculation_input(
150
- question_data, results
151
- )
152
- if calculation_input:
153
- results['calculation'] = self.tools[tool_name].calculate(
154
- calculation_input
155
- )
156
-
157
- elif action == 'analyze_text':
158
- results['text_analysis'] = self.tools[tool_name].analyze(
159
- step['params']['text'],
160
- context=results
161
- )
162
-
163
- except Exception as e:
164
- print(f"Error in {action}: {e}")
165
- results[f'{action}_error'] = str(e)
166
-
167
- return results
168
-
169
- def _extract_search_query(self, question: str) -> str:
170
- """Extract relevant search query from question"""
171
- # Remove question words and extract key terms
172
- question_words = ['what', 'who', 'when', 'where', 'how', 'why', 'is', 'are', 'was', 'were']
173
- words = question.lower().split()
174
-
175
- # Keep important words, remove common question words
176
- filtered_words = [word for word in words if word not in question_words and len(word) > 2]
177
-
178
- return ' '.join(filtered_words[:6]) # Limit to 6 words
179
-
180
- def _prepare_calculation_input(self, question_data: Dict, results: Dict) -> Optional[str]:
181
- """Prepare input for calculator based on question and available data"""
182
- question = question_data.get('question', '')
183
-
184
- # Extract numbers from question
185
- numbers = re.findall(r'\d+\.?\d*', question)
186
-
187
- # Look for mathematical operations
188
- if 'sum' in question.lower() or 'total' in question.lower():
189
- if numbers:
190
- return '+'.join(numbers)
191
- elif 'multiply' in question.lower() or 'product' in question.lower():
192
- if numbers:
193
- return '*'.join(numbers)
194
- elif 'average' in question.lower():
195
- if numbers:
196
- return f"({'+'.join(numbers)})/{len(numbers)}"
197
-
198
- # Check if file data contains numbers for calculation
199
- if 'file_data' in results and isinstance(results['file_data'], dict):
200
- file_numbers = results['file_data'].get('numbers', [])
201
- if file_numbers and ('sum' in question.lower() or 'total' in question.lower()):
202
- return '+'.join(map(str, file_numbers))
203
-
204
- return None
205
-
206
- def _generate_final_answer(self, results: Dict, question_data: Dict) -> str:
207
- """Generate final answer based on execution results"""
208
- question = question_data.get('question', '').lower()
209
-
210
- # Priority order for answer selection
211
- if 'calculation' in results and results['calculation'] is not None:
212
- return str(results['calculation'])
213
-
214
- if 'file_data' in results and isinstance(results['file_data'], dict):
215
- # Look for specific answer in file data
216
- if 'answer' in results['file_data']:
217
- return str(results['file_data']['answer'])
218
- elif 'summary' in results['file_data']:
219
- return str(results['file_data']['summary'])
220
-
221
- if 'search_data' in results and results['search_data']:
222
- # Extract answer from search results
223
- for result in results['search_data']:
224
- if isinstance(result, dict) and 'summary' in result:
225
- return result['summary']
226
-
227
- if 'text_analysis' in results:
228
- return str(results['text_analysis'])
229
-
230
- return "Unable to determine answer"
231
 
232
- def _format_final_answer(self, answer: str) -> str:
233
- """Format the final answer for exact match scoring"""
234
- if not answer:
235
- return "No answer found"
236
 
237
- # Convert to string and strip whitespace
238
- answer = str(answer).strip()
239
-
240
- # Remove common prefixes that might cause exact match failures
241
- prefixes_to_remove = [
242
- 'the answer is: ',
243
- 'answer: ',
244
- 'final answer: ',
245
- 'result: ',
246
- 'solution: '
247
- ]
248
-
249
- answer_lower = answer.lower()
250
- for prefix in prefixes_to_remove:
251
- if answer_lower.startswith(prefix):
252
- answer = answer[len(prefix):].strip()
253
- break
254
-
255
- # Handle numeric answers
256
- if self._is_numeric_answer(answer):
257
- return self._format_numeric_answer(answer)
258
-
259
- # Handle yes/no answers
260
- if answer.lower() in ['yes', 'no', 'true', 'false']:
261
- return answer.lower()
262
-
263
- # Return cleaned text answer
264
- return answer
265
-
266
- def _is_numeric_answer(self, answer: str) -> bool:
267
- """Check if answer is numeric"""
268
  try:
269
- float(answer)
270
- return True
271
- except ValueError:
272
- return False
273
-
274
- def _format_numeric_answer(self, answer: str) -> str:
275
- """Format numeric answers consistently"""
276
- try:
277
- num = float(answer)
278
- if num.is_integer():
279
- return str(int(num))
280
- else:
281
- # Round to 6 decimal places to avoid floating point issues
282
- return str(round(num, 6)).rstrip('0').rstrip('.')
283
- except ValueError:
284
- return answer
285
-
286
-
287
- class WebSearchTool:
288
- """Simple web search tool (implement with your preferred search API)"""
289
-
290
- def search(self, query: str, max_results: int = 3) -> List[Dict]:
291
- """Perform web search - implement with your preferred search service"""
292
- print(f"Web search: {query}")
293
-
294
- # Placeholder implementation
295
- # Replace with actual search API (DuckDuckGo, Google Custom Search, etc.)
296
- return [
297
- {
298
- 'title': f'Search result for: {query}',
299
- 'summary': f'Information about {query}',
300
- 'url': 'https://example.com'
301
- }
302
- ]
303
 
 
 
 
304
 
305
- class CalculatorTool:
306
- """Safe calculator for mathematical expressions"""
307
 
308
- def calculate(self, expression: str) -> Optional[float]:
309
- """Safely evaluate mathematical expressions"""
 
310
  try:
311
- # Remove whitespace
312
- expression = expression.replace(' ', '')
313
 
314
- # Basic safety check
315
- allowed_chars = set('0123456789+-*/().e')
316
- if not all(c in allowed_chars for c in expression):
317
- raise ValueError("Invalid characters in expression")
318
 
319
- # Use ast for safe evaluation
320
- node = ast.parse(expression, mode='eval')
321
- result = self._eval_node(node.body)
 
 
 
 
 
 
 
322
 
323
- return result
 
324
 
325
  except Exception as e:
326
- print(f"Calculation error: {e}")
327
- return None
328
 
329
- def _eval_node(self, node):
330
- """Recursively evaluate AST node"""
331
- if isinstance(node, ast.Constant):
332
- return node.value
333
- elif isinstance(node, ast.Num): # Python < 3.8
334
- return node.n
335
- elif isinstance(node, ast.BinOp):
336
- left = self._eval_node(node.left)
337
- right = self._eval_node(node.right)
338
-
339
- if isinstance(node.op, ast.Add):
340
- return left + right
341
- elif isinstance(node.op, ast.Sub):
342
- return left - right
343
- elif isinstance(node.op, ast.Mult):
344
- return left * right
345
- elif isinstance(node.op, ast.Div):
346
- return left / right
347
- elif isinstance(node.op, ast.Pow):
348
- return left ** right
349
- elif isinstance(node, ast.UnaryOp):
350
- operand = self._eval_node(node.operand)
351
- if isinstance(node.op, ast.USub):
352
- return -operand
353
- elif isinstance(node.op, ast.UAdd):
354
- return +operand
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
- raise ValueError(f"Unsupported operation: {type(node)}")
357
 
 
 
 
358
 
359
- class FileProcessorTool:
360
- """Tool for processing files from GAIA tasks"""
361
-
362
- def __init__(self, api_base_url: str):
363
- self.api_base_url = api_base_url
364
 
365
- def process_file(self, task_id: str, file_name: str) -> Dict:
366
- """Process file associated with a task"""
367
- try:
368
- # Download file
369
- file_content = self._download_file(task_id)
370
-
371
- # Process based on file extension
372
- if file_name.endswith('.csv'):
373
- return self._process_csv(file_content)
374
- elif file_name.endswith('.txt'):
375
- return self._process_text(file_content)
376
- elif file_name.endswith('.json'):
377
- return self._process_json(file_content)
378
- else:
379
- return self._process_generic(file_content)
380
-
381
- except Exception as e:
382
- print(f"File processing error: {e}")
383
- return {'error': str(e)}
384
-
385
- def _download_file(self, task_id: str) -> bytes:
386
- """Download file from API"""
387
- response = requests.get(f"{self.api_base_url}/files/{task_id}")
388
- response.raise_for_status()
389
- return response.content
390
-
391
- def _process_csv(self, content: bytes) -> Dict:
392
- """Process CSV file"""
393
  try:
394
- import io
395
- import csv
396
-
397
- # Convert bytes to string
398
- text_content = content.decode('utf-8')
399
-
400
- # Parse CSV
401
- reader = csv.reader(io.StringIO(text_content))
402
- rows = list(reader)
403
-
404
- if not rows:
405
- return {'error': 'Empty CSV file'}
406
-
407
- headers = rows[0] if rows else []
408
- data_rows = rows[1:] if len(rows) > 1 else []
409
-
410
- # Extract numbers for potential calculations
411
- numbers = []
412
- for row in data_rows:
413
- for cell in row:
414
- try:
415
- numbers.append(float(cell))
416
- except ValueError:
417
- continue
418
-
419
- return {
420
- 'type': 'csv',
421
- 'headers': headers,
422
- 'rows': data_rows,
423
- 'row_count': len(data_rows),
424
- 'numbers': numbers,
425
- 'summary': f'CSV with {len(headers)} columns and {len(data_rows)} rows'
426
- }
427
 
428
- except Exception as e:
429
- return {'error': f'CSV processing failed: {e}'}
430
-
431
- def _process_text(self, content: bytes) -> Dict:
432
- """Process text file"""
433
- try:
434
- text = content.decode('utf-8')
435
-
436
- # Extract numbers from text
437
- numbers = [float(match) for match in re.findall(r'\d+\.?\d*', text)]
438
-
439
- # Basic text analysis
440
- lines = text.split('\n')
441
- words = text.split()
442
-
443
- return {
444
- 'type': 'text',
445
- 'content': text,
446
- 'line_count': len(lines),
447
- 'word_count': len(words),
448
- 'numbers': numbers,
449
- 'summary': f'Text file with {len(lines)} lines and {len(words)} words'
450
- }
451
 
452
- except Exception as e:
453
- return {'error': f'Text processing failed: {e}'}
454
-
455
- def _process_json(self, content: bytes) -> Dict:
456
- """Process JSON file"""
457
- try:
458
- data = json.loads(content.decode('utf-8'))
459
 
460
- # Extract numbers from JSON structure
461
- numbers = self._extract_numbers_from_json(data)
462
 
463
- return {
464
- 'type': 'json',
465
- 'data': data,
466
- 'numbers': numbers,
467
- 'summary': f'JSON file with {len(data) if isinstance(data, (list, dict)) else 1} items'
468
- }
469
 
470
  except Exception as e:
471
- return {'error': f'JSON processing failed: {e}'}
472
 
473
- def _process_generic(self, content: bytes) -> Dict:
474
- """Process generic file"""
475
- try:
476
- # Try to decode as text first
477
- try:
478
- text = content.decode('utf-8')
479
- return self._process_text(content)
480
- except UnicodeDecodeError:
481
- # Binary file
482
- return {
483
- 'type': 'binary',
484
- 'size': len(content),
485
- 'summary': f'Binary file of {len(content)} bytes'
486
- }
487
-
488
- except Exception as e:
489
- return {'error': f'Generic processing failed: {e}'}
490
-
491
- def _extract_numbers_from_json(self, data, numbers=None):
492
- """Recursively extract numbers from JSON structure"""
493
- if numbers is None:
494
- numbers = []
495
 
496
- if isinstance(data, (int, float)):
497
- numbers.append(float(data))
498
- elif isinstance(data, dict):
499
- for value in data.values():
500
- self._extract_numbers_from_json(value, numbers)
501
- elif isinstance(data, list):
502
- for item in data:
503
- self._extract_numbers_from_json(item, numbers)
504
 
505
- return numbers
506
-
507
-
508
- class TextAnalyzerTool:
509
- """Tool for analyzing and extracting information from text"""
510
-
511
- def analyze(self, text: str, context: Dict = None) -> str:
512
- """Analyze text and extract relevant information"""
513
  try:
514
- # Basic keyword extraction
515
- keywords = self._extract_keywords(text)
 
 
 
 
516
 
517
- # Look for specific patterns based on question type
518
- if any(word in text.lower() for word in ['who', 'what', 'when', 'where']):
519
- return self._analyze_question_pattern(text, context)
520
 
521
- # Look for calculations
522
- if any(word in text.lower() for word in ['calculate', 'sum', 'total', 'average']):
523
- return self._analyze_calculation_pattern(text, context)
524
 
525
- # Default analysis
526
- return f"Analysis of text with keywords: {', '.join(keywords[:5])}"
 
 
 
 
 
 
527
 
528
  except Exception as e:
529
- return f"Analysis failed: {e}"
530
-
531
- def _extract_keywords(self, text: str) -> List[str]:
532
- """Extract important keywords from text"""
533
- # Simple keyword extraction
534
- words = re.findall(r'\b[A-Za-z]{3,}\b', text.lower())
535
-
536
- # Remove common stop words
537
- stop_words = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'man', 'new', 'now', 'old', 'see', 'two', 'way', 'who', 'boy', 'did', 'its', 'let', 'put', 'say', 'she', 'too', 'use'}
538
-
539
- keywords = [word for word in words if word not in stop_words]
540
-
541
- # Return most frequent keywords
542
- from collections import Counter
543
- return [word for word, count in Counter(keywords).most_common(10)]
544
-
545
- def _analyze_question_pattern(self, text: str, context: Dict) -> str:
546
- """Analyze question patterns to extract answers"""
547
- # This is where you'd implement more sophisticated NLP
548
- # For now, return a simple analysis
549
-
550
- if context and 'search_data' in context:
551
- search_results = context['search_data']
552
- if search_results and isinstance(search_results, list) and len(search_results) > 0:
553
- return search_results[0].get('summary', 'No summary available')
554
-
555
- return "Unable to extract specific answer from question pattern"
556
-
557
- def _analyze_calculation_pattern(self, text: str, context: Dict) -> str:
558
- """Analyze calculation patterns"""
559
- if context and 'calculation' in context:
560
- return str(context['calculation'])
561
-
562
- # Extract numbers for potential calculation
563
- numbers = re.findall(r'\d+\.?\d*', text)
564
- if numbers:
565
- return f"Found numbers: {', '.join(numbers)}"
566
-
567
- return "No calculation pattern found"
568
-
569
-
570
- # Main execution functions
571
- def test_agent_on_random_question(api_base_url: str):
572
- """Test the agent on a random question"""
573
- agent = GAIAAgent(api_base_url)
574
 
575
- try:
576
- # Get random question
577
- response = requests.get(f"{api_base_url}/random-question")
578
- question = response.json()
579
 
580
- print("=" * 50)
581
- print("TESTING RANDOM QUESTION")
582
- print("=" * 50)
583
- print(f"Task ID: {question.get('task_id')}")
584
- print(f"Question: {question.get('question')}")
585
- print(f"File: {question.get('file_name', 'None')}")
586
- print("-" * 50)
587
 
588
- # Solve question
589
- start_time = time.time()
590
- answer = agent.solve_question(question)
591
- end_time = time.time()
592
-
593
- print(f"Agent Answer: {answer}")
594
- print(f"Processing Time: {end_time - start_time:.2f} seconds")
595
- print("=" * 50)
596
-
597
- return {
598
- 'task_id': question.get('task_id'),
599
- 'question': question.get('question'),
600
- 'agent_answer': answer,
601
- 'processing_time': end_time - start_time
602
- }
603
-
604
- except Exception as e:
605
- print(f"Error testing random question: {e}")
606
- return None
607
-
608
-
609
- def run_full_evaluation(api_base_url: str, username: str, agent_code_url: str):
610
- """Run the complete evaluation on all 20 questions"""
611
- agent = GAIAAgent(api_base_url)
612
-
613
- try:
614
- # Get all questions
615
- response = requests.get(f"{api_base_url}/questions")
616
- questions = response.json()
617
-
618
- print(f"Starting evaluation on {len(questions)} questions...")
619
-
620
- answers = []
621
- successful_answers = 0
622
 
623
  for i, question in enumerate(questions):
624
- print(f"\n{'='*60}")
625
- print(f"PROCESSING QUESTION {i+1}/{len(questions)}")
626
- print(f"{'='*60}")
627
- print(f"Task ID: {question.get('task_id')}")
628
- print(f"Question: {question.get('question')[:100]}...")
629
-
630
  try:
 
 
 
631
  start_time = time.time()
632
- answer = agent.solve_question(question)
633
- end_time = time.time()
634
 
635
- answers.append({
636
- 'task_id': question['task_id'],
637
- 'submitted_answer': answer
638
- })
 
 
639
 
640
- print(f"Answer: {answer}")
641
- print(f"Time: {end_time - start_time:.2f}s")
642
 
643
- if answer and answer != "Unable to determine answer":
644
- successful_answers += 1
 
 
 
 
 
 
 
 
 
645
 
646
  except Exception as e:
647
- print(f"Error processing question {i+1}: {e}")
648
- answers.append({
649
- 'task_id': question['task_id'],
650
- 'submitted_answer': "Processing error"
651
- })
652
-
653
- print(f"\n{'='*60}")
654
- print(f"EVALUATION COMPLETE")
655
- print(f"{'='*60}")
656
- print(f"Successfully processed: {successful_answers}/{len(questions)} questions")
657
- print(f"Success rate: {(successful_answers/len(questions)*100):.1f}%")
658
-
659
- # Submit results
660
- print(f"\nSubmitting results...")
661
- submission_result = submit_results(api_base_url, username, agent_code_url, answers)
662
-
663
- return {
664
- 'answers': answers,
665
- 'successful_answers': successful_answers,
666
- 'total_questions': len(questions),
667
- 'submission_result': submission_result
668
- }
669
-
670
- except Exception as e:
671
- print(f"Error in full evaluation: {e}")
672
- return None
673
-
674
-
675
- def submit_results(api_base_url: str, username: str, agent_code_url: str, answers: List[Dict]):
676
- """Submit results to the leaderboard"""
677
- try:
678
- submission_data = {
679
- 'username': username,
680
- 'agent_code': agent_code_url,
681
- 'answers': answers
682
- }
683
-
684
- response = requests.post(f"{api_base_url}/submit", json=submission_data)
685
-
686
- if response.status_code == 200:
687
- result = response.json()
688
- print(f"✅ Submission successful!")
689
- print(f"Score: {result.get('score', 'N/A')}%")
690
- print(f"Rank: {result.get('rank', 'N/A')}")
691
- return result
692
- else:
693
- print(f"❌ Submission failed: {response.status_code}")
694
- print(f"Response: {response.text}")
695
- return None
696
-
697
- except Exception as e:
698
- print(f"Error submitting results: {e}")
699
- return None
700
 
 
 
 
 
 
701
 
702
- # Example usage and testing functions
703
- if __name__ == "__main__":
704
- # Configuration - Replace with actual values
705
- API_BASE_URL = "https://your-api-endpoint.com" # Replace with actual API URL
706
- USERNAME = "your-huggingface-username" # Replace with your username
707
- AGENT_CODE_URL = "https://huggingface.co/spaces/your-username/gaia-agent/tree/main" # Replace with your space URL
708
-
709
- print("GAIA Agent Implementation")
710
- print("=" * 40)
711
-
712
- # Test on a few random questions first
713
- print("1. Testing on random questions...")
714
- for i in range(3):
715
- print(f"\n--- Random Test {i+1} ---")
716
- test_result = test_agent_on_random_question(API_BASE_URL)
717
- if test_result:
718
- print(f"✅ Test {i+1} completed")
719
- else:
720
- print(f"❌ Test {i+1} failed")
721
-
722
- # Ask user if they want to run full evaluation
723
- user_input = input("\nRun full evaluation on all 20 questions? (y/n): ")
724
 
725
- if user_input.lower() == 'y':
726
- print("\n" + "=" * 60)
727
- print("STARTING FULL EVALUATION")
728
- print("=" * 60)
729
 
730
- evaluation_result = run_full_evaluation(API_BASE_URL, USERNAME, AGENT_CODE_URL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
 
732
- if evaluation_result:
733
- print(f"\n🎉 Evaluation completed!")
734
- print(f"Final score: {evaluation_result.get('submission_result', {}).get('score', 'N/A')}%")
735
-
736
- if evaluation_result.get('submission_result', {}).get('score', 0) >= 30:
737
- print(f"🏆 CONGRATULATIONS! You've achieved the 30% threshold!")
738
- print(f"🎓 You've earned your Certificate of Completion!")
739
- else:
740
- print(f"📈 Keep improving! You need 30% to earn the certificate.")
741
- else:
742
- print(f"❌ Evaluation failed. Please check your implementation.")
743
-
744
- else:
745
- print("Evaluation cancelled. Use the test functions to debug your agent first.")
746
 
 
 
 
747
 
748
- # Additional utility functions for development and debugging
 
 
749
 
750
- def debug_question_analysis(api_base_url: str, task_id: str = None):
751
- """Debug question analysis for a specific question"""
752
- agent = GAIAAgent(api_base_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
753
 
754
- if task_id:
755
- # Get specific question (you'd need to implement this endpoint or find the question in the list)
756
- response = requests.get(f"{api_base_url}/questions")
757
- questions = response.json()
758
- question = next((q for q in questions if q.get('task_id') == task_id), None)
759
- else:
760
- # Get random question
761
- response = requests.get(f"{api_base_url}/random-question")
762
- question = response.json()
763
 
764
- if not question:
765
- print("Question not found")
766
- return
 
 
 
767
 
768
- print("QUESTION ANALYSIS DEBUG")
769
- print("=" * 40)
770
- print(f"Task ID: {question.get('task_id')}")
771
- print(f"Question: {question.get('question')}")
772
- print(f"File: {question.get('file_name', 'None')}")
773
- print("-" * 40)
774
 
775
- # Analyze question
776
- analysis = agent._analyze_question(question)
777
- print("Analysis Results:")
778
- for key, value in analysis.items():
779
- print(f" {key}: {value}")
780
 
781
- # Create plan
782
- plan = agent._create_execution_plan(analysis, question)
783
- print(f"\nExecution Plan:")
784
- for i, step in enumerate(plan):
785
- print(f" {i+1}. {step['action']} (priority: {step['priority']})")
786
 
787
- return question, analysis, plan
788
 
 
 
 
 
 
 
 
 
 
 
 
 
789
 
790
- def benchmark_agent_performance(api_base_url: str, num_tests: int = 10):
791
- """Benchmark agent performance on multiple random questions"""
792
- agent = GAIAAgent(api_base_url)
793
-
794
- results = []
795
- total_time = 0
796
- successful_answers = 0
797
-
798
- print(f"BENCHMARKING AGENT ({num_tests} questions)")
799
- print("=" * 50)
800
-
801
- for i in range(num_tests):
802
- try:
803
- response = requests.get(f"{api_base_url}/random-question")
804
- question = response.json()
805
-
806
- start_time = time.time()
807
- answer = agent.solve_question(question)
808
- end_time = time.time()
809
-
810
- processing_time = end_time - start_time
811
- total_time += processing_time
812
-
813
- if answer and answer != "Unable to determine answer":
814
- successful_answers += 1
815
- status = "✅"
816
- else:
817
- status = ""
818
-
819
- print(f"{status} Question {i+1}: {processing_time:.2f}s - {answer[:50]}...")
820
-
821
- results.append({
822
- 'question_id': i+1,
823
- 'task_id': question.get('task_id'),
824
- 'answer': answer,
825
- 'processing_time': processing_time,
826
- 'success': answer != "Unable to determine answer"
827
- })
828
-
829
- except Exception as e:
830
- print(f"❌ Question {i+1}: Error - {e}")
831
- results.append({
832
- 'question_id': i+1,
833
- 'error': str(e),
834
- 'success': False
835
- })
836
-
837
- # Print summary
838
- print("\n" + "=" * 50)
839
- print("BENCHMARK RESULTS")
840
- print("=" * 50)
841
- print(f"Successful answers: {successful_answers}/{num_tests} ({successful_answers/num_tests*100:.1f}%)")
842
- print(f"Average processing time: {total_time/num_tests:.2f}s")
843
- print(f"Total time: {total_time:.2f}s")
844
-
845
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GAIA Benchmark AI Agent - Hugging Face Space
4
+ ============================================
5
+
6
+ A Gradio-based web interface for running GAIA benchmark evaluations
7
+ on Hugging Face Spaces with GPU acceleration.
8
+ """
9
+
10
+ import gradio as gr
11
+ import torch
12
  import json
13
+ import os
14
+ import logging
 
 
15
  import time
16
+ import re
17
+ from datetime import datetime
18
+ from typing import Dict, List, Optional, Tuple, Any
19
+ from dataclasses import dataclass
20
+ import pandas as pd
21
+ from pathlib import Path
22
 
23
+ # Core ML libraries
24
+ from transformers import (
25
+ AutoTokenizer,
26
+ AutoModelForCausalLM,
27
+ BitsAndBytesConfig,
28
+ pipeline
29
+ )
30
+ from datasets import load_dataset
31
+ from huggingface_hub import HfApi, hf_hub_download
32
+
33
+ # Setup logging
34
+ logging.basicConfig(level=logging.INFO)
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # ================================
38
+ # CORE DATA STRUCTURES
39
+ # ================================
40
+
41
+ @dataclass
42
+ class GAIAQuestion:
43
+ """Structure for GAIA benchmark questions"""
44
+ task_id: str
45
+ question: str
46
+ level: int
47
+ final_answer: Optional[str] = None
48
+ file_name: Optional[str] = None
49
+ annotator_metadata: Optional[Dict] = None
50
+
51
+ @classmethod
52
+ def from_dict(cls, data: dict):
53
+ return cls(**{k: v for k, v in data.items() if k in cls.__annotations__})
54
+
55
+ @dataclass
56
+ class GAIAResponse:
57
+ """Structure for GAIA responses"""
58
+ task_id: str
59
+ model_answer: str
60
+ reasoning_trace: str
61
+ final_answer: str
62
+ processing_time: float = 0.0
63
+ confidence_score: float = 0.0
64
+
65
+ # ================================
66
+ # GAIA PROMPT MANAGEMENT
67
+ # ================================
68
+
69
class GAIAPromptManager:
    """Builds GAIA-formatted prompts and parses answers out of model output."""

    GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:

FINAL ANSWER: [YOUR FINAL ANSWER]

YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    # Pattern for the mandated template line; DOTALL + non-greedy keeps the
    # capture to the text before the first newline (or end of string).
    _FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", re.IGNORECASE | re.DOTALL)

    @staticmethod
    def create_gaia_prompt(question: str) -> str:
        """Wrap *question* in the GAIA system prompt plus a reasoning cue."""
        sections = (
            GAIAPromptManager.GAIA_SYSTEM_PROMPT,
            f"Question: {question}",
            "Let me think step by step:",
        )
        return "\n\n".join(sections)

    @staticmethod
    def extract_final_answer(response: str) -> Tuple[str, str]:
        """Split *response* into (final_answer, reasoning).

        Prefers the "FINAL ANSWER:" template; if it is missing, falls back
        to treating the last non-empty line as the answer.
        """
        match = GAIAPromptManager._FINAL_ANSWER_RE.search(response)
        if match is not None:
            answer = match.group(1).strip()
            reasoning = response[:match.start()].strip()
            return answer, reasoning

        # Fallback: no template — assume the answer is the final line.
        lines = response.strip().split('\n')
        answer = lines[-1].strip() if lines else ""
        reasoning = '\n'.join(lines[:-1]) if len(lines) > 1 else response
        return answer, reasoning
99
+
100
+ # ================================
101
+ # HF SPACES OPTIMIZED MODEL MANAGER
102
+ # ================================
103
+
104
class HFSpaceModelManager:
    """Hugging Face Spaces optimized model manager.

    Wraps tokenizer/model/pipeline loading for a small curated set of models,
    choosing quantization and device placement based on hardware availability.
    """

    # Space-friendly models with different capabilities.
    # Keys are user-facing labels; values describe the hub model and its cost.
    SPACE_MODELS = {
        "Fast & Light": {
            "name": "microsoft/DialoGPT-medium",
            "size": "~345MB",
            "speed": "Fast",
            "quality": "Good",
            "gpu_required": False
        },
        "Balanced": {
            "name": "stabilityai/stablelm-zephyr-3b",
            "size": "~3GB",
            "speed": "Medium",
            "quality": "Better",
            "gpu_required": True
        },
        "High Quality": {
            "name": "HuggingFaceH4/zephyr-7b-beta",
            "size": "~7GB",
            "speed": "Slower",
            "quality": "Best",
            "gpu_required": True
        },
        "Instruction Following": {
            "name": "mistralai/Mistral-7B-Instruct-v0.1",
            "size": "~7GB",
            "speed": "Medium",
            "quality": "Excellent",
            "gpu_required": True
        }
    }

    def __init__(self, model_choice: str = "Fast & Light"):
        # Raises KeyError for an unknown choice — callers pass dropdown values
        # drawn from SPACE_MODELS, so the labels always match.
        self.model_config = self.SPACE_MODELS[model_choice]
        self.model_name = self.model_config["name"]
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        # Prefer GPU when CUDA is visible; everything below branches on this.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, progress_callback=None) -> str:
        """Load tokenizer, model and generation pipeline.

        progress_callback, when given, is called as callback(fraction, message).
        Returns a human-readable status string; errors are caught and reported
        rather than raised so the UI can display them.
        """
        try:
            if progress_callback:
                progress_callback(0.1, "Loading tokenizer...")

            # Load tokenizer; some models ship without a pad token, so reuse EOS.
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            if progress_callback:
                progress_callback(0.3, "Configuring model...")

            # 4-bit quantization only for the large (7B) models on GPU spaces,
            # keyed off the "7b" substring in the hub model name.
            quantization_config = None
            if self.device == "cuda" and "7b" in self.model_name.lower():
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4"
                )

            if progress_callback:
                progress_callback(0.6, "Loading model weights...")

            # Load model; fp16 + accelerate device_map on GPU, fp32 on CPU.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto" if self.device == "cuda" else None,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                trust_remote_code=True
            )

            if progress_callback:
                progress_callback(0.9, "Creating pipeline...")

            # Create generation pipeline.
            # NOTE(review): on CUDA the model was loaded with device_map="auto";
            # transformers may reject an explicit `device` argument in that
            # combination — confirm on a GPU Space.
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=384,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                device=0 if self.device == "cuda" else -1
            )

            if progress_callback:
                progress_callback(1.0, "Model loaded successfully!")

            return f"✅ Model '{self.model_name}' loaded successfully on {self.device.upper()}"

        except Exception as e:
            error_msg = f"❌ Error loading model: {str(e)}"
            logger.error(error_msg)
            return error_msg

    def generate_response(self, prompt: str, max_tokens: int = 384) -> str:
        """Generate text for *prompt*; returns an error string on failure.

        Never raises: all pipeline errors are folded into the return value so
        callers (the Gradio handlers) can show them directly.
        """
        if self.pipeline is None:
            # NOTE(review): leading space suggests a lost "❌" glyph in this
            # message — confirm the intended text.
            return " Model not loaded. Please load a model first."

        try:
            # Truncate prompt if too long (character-based, not token-based).
            max_input_length = 1000
            if len(prompt) > max_input_length:
                prompt = prompt[:max_input_length] + "..."

            outputs = self.pipeline(
                prompt,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                return_full_text=False,  # only the newly generated tokens
                pad_token_id=self.tokenizer.eos_token_id
            )

            response = outputs[0]['generated_text'].strip()
            return response

        except Exception as e:
            return f"❌ Error generating response: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
+ # ================================
235
+ # DATASET MANAGEMENT
236
+ # ================================
237
 
238
class GAIADatasetManager:
    """Manages GAIA dataset loading and sample generation."""

    @staticmethod
    def load_gaia_dataset(split: str = "test", max_questions: int = None) -> Tuple[List[GAIAQuestion], str]:
        """Load GAIA questions from the Hugging Face Hub.

        Args:
            split: dataset split to load (e.g. "test").
            max_questions: optional cap on the number of questions returned.

        Returns:
            (questions, status_message). On any failure the built-in sample
            questions are returned together with an error status so the UI
            keeps working offline.
        """
        try:
            dataset = load_dataset("gaia-benchmark/GAIA", split=split, trust_remote_code=True)

            # BUG FIX: slicing a `datasets.Dataset` with `[:n]` yields a dict
            # of columns, so iterating it produced column *names*, and
            # item['Question'] failed. `select` keeps row-wise iteration.
            if max_questions:
                dataset = dataset.select(range(min(max_questions, len(dataset))))

            questions = []
            for i, item in enumerate(dataset):
                question = GAIAQuestion(
                    task_id=item.get('task_id', f'gaia_{split}_{i:03d}'),
                    question=item['Question'],
                    level=item['Level'],
                    final_answer=item.get('Final answer', None),
                    file_name=item.get('file_name', None),
                    annotator_metadata=item.get('Annotator Metadata', None)
                )
                questions.append(question)

            status = f"✅ Loaded {len(questions)} questions from GAIA {split} split"
            return questions, status

        except Exception as e:
            # Consistency fix: restore the "❌" marker used by every other
            # error message in this module (it was garbled to a bare space).
            error_msg = f"❌ Error loading GAIA dataset: {str(e)}"
            return GAIADatasetManager.get_sample_questions(), error_msg

    @staticmethod
    def get_sample_questions() -> List[GAIAQuestion]:
        """Return a small fixed set of offline questions for smoke testing."""
        sample_data = [
            {
                "task_id": "sample_001",
                "question": "What is the capital of France?",
                "level": 1,
                "final_answer": "Paris"
            },
            {
                "task_id": "sample_002",
                "question": "Calculate 144 divided by 12.",
                "level": 1,
                "final_answer": "12"
            },
            {
                "task_id": "sample_003",
                "question": "What is the largest planet in our solar system?",
                "level": 1,
                "final_answer": "Jupiter"
            },
            {
                "task_id": "sample_004",
                "question": "Convert 100 degrees Celsius to Fahrenheit.",
                "level": 2,
                "final_answer": "212"
            },
            {
                "task_id": "sample_005",
                "question": "List the first three even numbers greater than zero.",
                "level": 1,
                "final_answer": "2, 4, 6"
            },
            {
                "task_id": "sample_006",
                "question": "What year did the Berlin Wall fall?",
                "level": 1,
                "final_answer": "1989"
            },
            {
                "task_id": "sample_007",
                "question": "What is the chemical symbol for water?",
                "level": 1,
                "final_answer": "H2O"
            },
            {
                "task_id": "sample_008",
                "question": "How many continents are there?",
                "level": 1,
                "final_answer": "7"
            }
        ]

        return [GAIAQuestion.from_dict(data) for data in sample_data]
323
 
324
+ # ================================
325
+ # MAIN GAIA AGENT FOR HF SPACES
326
+ # ================================
327
 
328
class GAIASpaceAgent:
    """Main GAIA agent optimized for Hugging Face Spaces.

    Owns the model manager, runs single-question and batch evaluations, and
    renders summaries/JSONL output for the Gradio UI.
    """

    def __init__(self):
        self.model_manager = None                       # set by initialize_model
        self.prompt_manager = GAIAPromptManager()
        self.current_model = None                       # label of the loaded model
        self.evaluation_results: List[GAIAResponse] = []  # accumulated batch results

    def initialize_model(self, model_choice: str, progress=gr.Progress()) -> str:
        """Create a model manager for *model_choice* and load the model.

        Returns a status string for the UI; never raises.
        """
        try:
            progress(0, desc="Initializing model manager...")
            self.model_manager = HFSpaceModelManager(model_choice)
            self.current_model = model_choice

            # Forward HFSpaceModelManager's (fraction, message) callbacks
            # into Gradio's progress API.
            def progress_callback(value, desc):
                progress(value, desc=desc)

            result = self.model_manager.load_model(progress_callback)

            # Clear any previous results when changing models — they would
            # not be comparable across models.
            self.evaluation_results = []

            return result

        except Exception as e:
            return f"❌ Failed to initialize model: {str(e)}"

    def process_single_question(self, question_text: str, progress=gr.Progress()) -> Tuple[str, str, str, float]:
        """Answer one question.

        Returns (final_answer, raw_response, reasoning, processing_seconds);
        on failure the first element carries the error message.
        """
        if self.model_manager is None or self.model_manager.pipeline is None:
            return "❌ No model loaded", "", "", 0.0

        start_time = time.time()

        try:
            progress(0.2, desc="Creating GAIA prompt...")
            prompt = self.prompt_manager.create_gaia_prompt(question_text)

            progress(0.4, desc="Generating response...")
            raw_response = self.model_manager.generate_response(prompt)

            progress(0.8, desc="Extracting final answer...")
            final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)

            processing_time = time.time() - start_time

            progress(1.0, desc="Complete!")

            return final_answer, raw_response, reasoning, processing_time

        except Exception as e:
            processing_time = time.time() - start_time
            error_msg = f"❌ Error processing question: {str(e)}"
            return error_msg, "", "", processing_time

    def batch_evaluate(self, questions: List[GAIAQuestion], progress=gr.Progress()) -> Tuple[str, str, str]:
        """Evaluate *questions* sequentially with progress tracking.

        Returns (summary_markdown, detailed_markdown, jsonl_content).
        Per-question failures are recorded as ERROR rows rather than
        aborting the whole batch.
        """
        if self.model_manager is None:
            return "❌ No model loaded", "", ""

        results = []
        total_questions = len(questions)

        progress(0, desc=f"Starting evaluation of {total_questions} questions...")

        for i, question in enumerate(questions):
            try:
                progress((i + 1) / total_questions,
                         desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")

                start_time = time.time()

                # Create prompt and generate response.
                prompt = self.prompt_manager.create_gaia_prompt(question.question)
                raw_response = self.model_manager.generate_response(prompt)

                # Extract final answer.
                final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)

                processing_time = time.time() - start_time

                response = GAIAResponse(
                    task_id=question.task_id,
                    model_answer=raw_response,
                    reasoning_trace=reasoning,
                    final_answer=final_answer,
                    processing_time=processing_time
                )

                results.append(response)
                self.evaluation_results.append(response)

            except Exception as e:
                logger.error(f"Error processing {question.task_id}: {e}")
                error_response = GAIAResponse(
                    task_id=question.task_id,
                    model_answer=f"Error: {str(e)}",
                    reasoning_trace="Processing failed",
                    final_answer="ERROR",
                    processing_time=0.0
                )
                results.append(error_response)
                self.evaluation_results.append(error_response)

        summary = self._generate_summary(results)
        detailed_results = self._generate_detailed_results(results, questions)
        jsonl_content = self._generate_jsonl(results)

        return summary, detailed_results, jsonl_content

    def _generate_summary(self, results: List[GAIAResponse]) -> str:
        """Render a markdown summary of a batch run."""
        total = len(results)
        errors = sum(1 for r in results if r.final_answer == "ERROR")
        successful = total - errors
        avg_time = sum(r.processing_time for r in results) / total if total > 0 else 0
        total_time = sum(r.processing_time for r in results)

        # BUG FIX: guard both divisions. `total` is zero for an empty batch,
        # and `total_time` is zero when every question errored out
        # (error rows carry processing_time == 0.0) — either previously
        # raised ZeroDivisionError.
        success_rate = (successful / total * 100) if total > 0 else 0.0
        questions_per_minute = (total / (total_time / 60)) if total_time > 0 else 0.0

        summary = f"""
# 📊 GAIA Evaluation Summary

## Overall Statistics
- **Total Questions**: {total}
- **Successful**: {successful}
- **Errors**: {errors}
- **Success Rate**: {success_rate:.1f}%

## Performance Metrics
- **Average Processing Time**: {avg_time:.2f}s
- **Total Processing Time**: {total_time:.2f}s
- **Questions per Minute**: {questions_per_minute:.1f}

## Model Information
- **Model**: {self.current_model}
- **Device**: {self.model_manager.device.upper() if self.model_manager else 'Unknown'}
"""
        return summary

    def _generate_detailed_results(self, results: List[GAIAResponse], questions: List[GAIAQuestion]) -> str:
        """Render a per-question markdown breakdown of a batch run."""
        detailed = "# 📋 Detailed Results\n\n"

        for i, (result, question) in enumerate(zip(results, questions), 1):
            status = "✅" if result.final_answer != "ERROR" else "❌"

            detailed += f"""
## Question {i}: {question.task_id} {status}

**Question**: {question.question}

**Model Answer**: {result.final_answer}

**Expected Answer**: {question.final_answer if question.final_answer else 'N/A'}

**Processing Time**: {result.processing_time:.2f}s

**Level**: {question.level}

---
"""

        return detailed

    def _generate_jsonl(self, results: List[GAIAResponse]) -> str:
        """Serialize results in the GAIA leaderboard JSONL format."""
        jsonl_lines = []
        for result in results:
            line = {
                "task_id": result.task_id,
                "model_answer": result.model_answer,
                "reasoning_trace": result.reasoning_trace
            }
            jsonl_lines.append(json.dumps(line))

        return '\n'.join(jsonl_lines)
518
+
519
+ # ================================
520
+ # GLOBAL AGENT INSTANCE
521
+ # ================================
522
+
523
+ # Initialize global agent
524
# Single shared agent instance used by all Gradio callback functions below.
gaia_agent = GAIASpaceAgent()
 
 
 
 
 
 
525
 
526
+ # ================================
527
+ # GRADIO INTERFACE FUNCTIONS
528
+ # ================================
529
 
530
def load_model_interface(model_choice: str, progress=gr.Progress()):
    """Gradio callback: load the selected model into the global agent.

    Returns the status string produced by GAIASpaceAgent.initialize_model.
    """
    return gaia_agent.initialize_model(model_choice, progress)
533
 
534
def single_question_interface(question: str, progress=gr.Progress()):
    """Gradio callback: answer one question and format outputs for the UI.

    Returns (final_answer, full_response, reasoning, elapsed_time_label).
    """
    # Guard clause: nothing to do for blank input.
    if not question.strip():
        return "Please enter a question", "", "", "0.00s"

    answer, raw, trace, elapsed = gaia_agent.process_single_question(question, progress)
    return answer, raw, trace, f"{elapsed:.2f}s"
547
+
548
def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=gr.Progress()):
    """Gradio callback: run a batch evaluation over the chosen dataset.

    Returns (summary_markdown, detailed_markdown, jsonl_content).
    """
    # Guard clause: cannot evaluate without a loaded model.
    if gaia_agent.model_manager is None:
        return "❌ Please load a model first", "", ""

    progress(0.1, desc="Loading dataset...")

    # Pick the question source: built-in samples or the real GAIA test set.
    if dataset_choice == "Sample Questions":
        selected = GAIADatasetManager.get_sample_questions()
        status_msg = f"✅ Loaded {len(selected)} sample questions"
    else:
        selected, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)

    # Enforce the cap (samples and fallbacks are not pre-limited).
    if max_questions and len(selected) > max_questions:
        selected = selected[:max_questions]

    progress(0.2, desc=f"{status_msg}. Starting evaluation...")

    return gaia_agent.batch_evaluate(selected, progress)
572
 
573
def get_model_info(model_choice: str):
    """Return a markdown summary of the selected model for the setup panel."""
    config = HFSpaceModelManager.SPACE_MODELS.get(model_choice)
    if config is None:
        return "Model information not available"

    gpu_required = 'Yes' if config['gpu_required'] else 'No'
    return f"""
**Model**: {config['name']}
**Size**: {config['size']}
**Speed**: {config['speed']}
**Quality**: {config['quality']}
**GPU Required**: {gpu_required}
"""
585
 
586
+ # ================================
587
+ # GRADIO APP CREATION
588
+ # ================================
589
+
590
def create_gaia_app():
    """Create the main Gradio application.

    Builds a four-tab Blocks UI (model setup, single question, batch
    evaluation, static info) wired to the module-level `gaia_agent`.
    Returns the unlaunched `gr.Blocks` app.
    """

    with gr.Blocks(
        title="GAIA Benchmark AI Agent",
        theme=gr.themes.Soft(),
        # Custom CSS: base font plus a gradient "hero" header class used below.
        css="""
        .gradio-container {
            font-family: 'Arial', sans-serif;
        }
        .main-header {
            text-align: center;
            background: linear-gradient(45deg, #2196F3, #21CBF3);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 20px;
        }
        """
    ) as app:

        # Header
        gr.HTML("""
        <div class="main-header">
            🧠 GAIA Benchmark AI Agent
        </div>
        <p style="text-align: center; font-size: 1.2em; color: #666;">
            Evaluate AI models on the GAIA benchmark with step-by-step reasoning
        </p>
        """)

        with gr.Tabs():

            # ===============================
            # TAB 1: MODEL SETUP
            # ===============================
            with gr.Tab("🔧 Model Setup"):
                gr.Markdown("## Choose and Load Your Model")

                with gr.Row():
                    with gr.Column(scale=2):
                        model_dropdown = gr.Dropdown(
                            choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                            value="Fast & Light",
                            label="Select Model",
                            info="Choose based on your quality vs speed preference"
                        )

                        model_info = gr.Markdown(
                            value=get_model_info("Fast & Light"),
                            label="Model Information"
                        )

                        load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")

                    with gr.Column(scale=1):
                        # Static system info rendered once at app build time.
                        gpu_info = gr.Markdown(f"""
                        ### 🖥️ System Info
                        **CUDA Available**: {torch.cuda.is_available()}
                        {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
                        """)

                        model_status = gr.Textbox(
                            label="Model Status",
                            value="No model loaded",
                            interactive=False
                        )

                # Update model info when selection changes
                model_dropdown.change(
                    fn=get_model_info,
                    inputs=[model_dropdown],
                    outputs=[model_info]
                )

                # Load model when button clicked
                load_btn.click(
                    fn=load_model_interface,
                    inputs=[model_dropdown],
                    outputs=[model_status]
                )

            # ===============================
            # TAB 2: SINGLE QUESTION
            # ===============================
            with gr.Tab("❓ Single Question"):
                gr.Markdown("## Test Individual Questions")

                with gr.Row():
                    with gr.Column():
                        question_input = gr.Textbox(
                            label="Enter your question",
                            placeholder="e.g., What is the capital of France?",
                            lines=3
                        )

                        process_btn = gr.Button("🤔 Process Question", variant="primary")

                        # Example questions: each button fills the input box.
                        gr.Markdown("### 💡 Example Questions:")
                        example_questions = [
                            "What is the capital of France?",
                            "Calculate 144 divided by 12",
                            "What is the largest planet in our solar system?",
                            "Convert 100 degrees Celsius to Fahrenheit"
                        ]

                        for i, example in enumerate(example_questions):
                            # `x=example` binds the current value at definition
                            # time, avoiding the late-binding closure pitfall.
                            gr.Button(
                                f"📝 {example}",
                                size="sm"
                            ).click(
                                lambda x=example: x,
                                outputs=[question_input]
                            )

                    with gr.Column():
                        final_answer_output = gr.Textbox(
                            label="🎯 Final Answer",
                            interactive=False
                        )

                        processing_time = gr.Textbox(
                            label="⏱️ Processing Time",
                            interactive=False
                        )

                        with gr.Accordion("🧠 Full Response", open=False):
                            full_response = gr.Textbox(
                                label="Complete Model Response",
                                lines=8,
                                interactive=False
                            )

                        with gr.Accordion("🔍 Reasoning Trace", open=False):
                            reasoning_trace = gr.Textbox(
                                label="Step-by-step Reasoning",
                                lines=6,
                                interactive=False
                            )

                # Process single question
                process_btn.click(
                    fn=single_question_interface,
                    inputs=[question_input],
                    outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                )

            # ===============================
            # TAB 3: BATCH EVALUATION
            # ===============================
            with gr.Tab("📊 Batch Evaluation"):
                gr.Markdown("## Evaluate Multiple Questions")

                with gr.Row():
                    dataset_choice = gr.Radio(
                        choices=["Sample Questions", "GAIA Test Set"],
                        value="Sample Questions",
                        label="Dataset Choice",
                        info="Start with sample questions to test your setup"
                    )

                    max_questions = gr.Slider(
                        minimum=1,
                        maximum=50,
                        value=5,
                        step=1,
                        label="Max Questions",
                        info="Number of questions to evaluate"
                    )

                evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")

                with gr.Row():
                    with gr.Column():
                        summary_output = gr.Markdown(
                            label="📊 Evaluation Summary",
                            value="No evaluation completed yet"
                        )

                    with gr.Column():
                        download_output = gr.File(
                            label="💾 Download Results (JSONL)",
                            visible=False
                        )

                with gr.Accordion("📋 Detailed Results", open=False):
                    detailed_output = gr.Markdown(
                        value="Run an evaluation to see detailed results"
                    )

                # Batch evaluation: wraps the interface fn so the JSONL text is
                # also written to a timestamped file for the download widget.
                def batch_eval_with_download(*args):
                    summary, detailed, jsonl_content = batch_evaluate_interface(*args)

                    # Save JSONL for download (written to the Space's working dir).
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    filename = f"gaia_results_{timestamp}.jsonl"

                    with open(filename, 'w') as f:
                        f.write(jsonl_content)

                    return summary, detailed, filename

                # Chain: run the evaluation, then reveal the download widget.
                evaluate_btn.click(
                    fn=batch_eval_with_download,
                    inputs=[dataset_choice, max_questions],
                    outputs=[summary_output, detailed_output, download_output]
                ).then(
                    lambda: gr.update(visible=True),
                    outputs=[download_output]
                )

            # ===============================
            # TAB 4: INFORMATION
            # ===============================
            with gr.Tab("ℹ️ Information"):
                # Static documentation tab; no event wiring.
                gr.Markdown("""
                # 🧠 GAIA Benchmark AI Agent

                ## What is GAIA?
                GAIA (General AI Assistant) is a benchmark designed to test AI assistants on real-world questions that require:
                - **Reasoning**: Multi-step logical thinking
                - **Multi-modality**: Handling text, images, and other file types
                - **Web browsing**: Finding and using external information
                - **Tool use**: Calculator, code execution, etc.

                ## 🎯 How to Use This Space

                ### 1. Model Setup
                - Choose a model based on your needs (speed vs quality)
                - Load the model (this may take a few minutes)
                - Wait for "Model loaded successfully" message

                ### 2. Test Single Questions
                - Start with the "Single Question" tab
                - Try example questions to verify everything works
                - Enter your own questions to test model capabilities

                ### 3. Batch Evaluation
                - Use "Sample Questions" first to test your setup
                - Then try "GAIA Test Set" for real benchmark evaluation
                - Download results in JSONL format for submission

                ## 📊 Model Recommendations

                | Model | Best For | Memory | Speed | Quality |
                |-------|----------|---------|-------|---------|
                | Fast & Light | Quick testing | Low | Fast | Good |
                | Balanced | General use | Medium | Medium | Better |
                | High Quality | Best results | High | Slow | Best |
                | Instruction Following | Complex reasoning | High | Medium | Excellent |

                ## 🔗 Resources
                - [GAIA Paper](https://arxiv.org/abs/2311.12983)
                - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
                - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)

                ## 🚀 Output Format
                Results are saved in GAIA leaderboard format:
                ```json
                {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[REASONING]"}
                ```

                ## ⚡ Tips for Best Results
                1. **Start Small**: Test with sample questions first
                2. **Choose Right Model**: Balance speed vs quality for your needs
                3. **Monitor GPU**: Larger models need GPU acceleration
                4. **Download Results**: Save JSONL files for leaderboard submission
                """)

    return app
863
+
864
+ # ================================
865
+ # MAIN APPLICATION
866
+ # ================================
867
+
868
if __name__ == "__main__":
    # Create and launch the Gradio app.
    app = create_gaia_app()
    app.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required inside Spaces containers)
        server_port=7860,       # standard HF Spaces port
        share=False             # no public tunnel; the Space itself is the public URL
    )