Ashokdll committed
Commit
770a217
verified · 1 Parent(s): 2bc7f54

Update app.py

Files changed (1): app.py (+694, −87)
app.py CHANGED
@@ -1,9 +1,9 @@
#!/usr/bin/env python3
"""
-GAIA Benchmark AI Agent - Hugging Face Space
-============================================
A Gradio-based web interface for running GAIA benchmark evaluations
-on Hugging Face Spaces with GPU acceleration.
"""

import gradio as gr
@@ -27,22 +27,54 @@ from transformers import (
    pipeline
)
from datasets import load_dataset
-from huggingface_hub import HfApi, hf_hub_download
-
-# Import leaderboard integration
-from gaia_leaderboard_integration import (
-    enhanced_gaia_agent,
-    run_custom_benchmark_interface,
-    load_test_questions_interface,
-    preview_dataset_structure_interface,
-    get_leaderboard_info,
-    get_question_selection_info
-)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ================================
# CORE DATA STRUCTURES
# ================================
@@ -79,7 +111,9 @@ class GAIAPromptManager:
    """Manages GAIA-specific prompting and formatting"""

    GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:
FINAL ANSWER: [YOUR FINAL ANSWER]
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    @staticmethod
@@ -233,42 +267,137 @@ class HFSpaceModelManager:
            return f"❌ Error generating response: {str(e)}"

# ================================
-# DATASET MANAGEMENT
# ================================

class GAIADatasetManager:
-    """Manages GAIA dataset loading and sample generation"""

    @staticmethod
-    def load_gaia_dataset(split: str = "test", max_questions: int = None) -> Tuple[List[GAIAQuestion], str]:
-        """Load GAIA dataset from Hugging Face Hub"""
        try:
-            dataset = load_dataset("gaia-benchmark/GAIA", split=split, trust_remote_code=True)

            questions = []
            items = dataset[:max_questions] if max_questions else dataset

            for i, item in enumerate(items):
                question = GAIAQuestion(
-                    task_id=item.get('task_id', f'gaia_{split}_{i:03d}'),
-                    question=item['Question'],
-                    level=item['Level'],
-                    final_answer=item.get('Final answer', None),
-                    file_name=item.get('file_name', None),
-                    annotator_metadata=item.get('Annotator Metadata', None)
                )
                questions.append(question)

            status = f"✅ Loaded {len(questions)} questions from GAIA {split} split"
            return questions, status

        except Exception as e:
            error_msg = f"❌ Error loading GAIA dataset: {str(e)}"
-            return GAIADatasetManager.get_sample_questions(), error_msg

    @staticmethod
    def get_sample_questions() -> List[GAIAQuestion]:
-        """Get sample questions for testing"""
        sample_data = [
            {
                "task_id": "sample_001",
317
  "question": "How many continents are there?",
318
  "level": 1,
319
  "final_answer": "7"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  }
321
  ]
322
 
323
  return [GAIAQuestion.from_dict(data) for data in sample_data]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  # ================================
326
  # MAIN GAIA AGENT FOR HF SPACES
@@ -440,18 +692,22 @@ class GAIASpaceAgent:

        summary = f"""
# 📊 GAIA Evaluation Summary
## Overall Statistics
- **Total Questions**: {total}
- **Successful**: {successful}
- **Errors**: {errors}
- **Success Rate**: {(successful/total*100):.1f}%
## Performance Metrics
- **Average Processing Time**: {avg_time:.2f}s
- **Total Processing Time**: {total_time:.2f}s
- **Questions per Minute**: {(total/(total_time/60)):.1f}
## Model Information
- **Model**: {self.current_model}
- **Device**: {self.model_manager.device.upper() if self.model_manager else 'Unknown'}
"""
        return summary
@@ -464,11 +720,17 @@ class GAIASpaceAgent:

            detailed += f"""
## Question {i}: {question.task_id} {status}
**Question**: {question.question}
**Model Answer**: {result.final_answer}
**Expected Answer**: {question.final_answer if question.final_answer else 'N/A'}
**Processing Time**: {result.processing_time:.2f}s
**Level**: {question.level}
---
"""
@@ -526,7 +788,7 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=g
        questions = GAIADatasetManager.get_sample_questions()
        status_msg = f"✅ Loaded {len(questions)} sample questions"
    else:
-        questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)

    if max_questions and len(questions) > max_questions:
        questions = questions[:max_questions]
@@ -537,6 +799,138 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=g

    return summary, detailed, jsonl

def get_model_info(model_choice: str):
    """Get information about selected model"""
    if model_choice in HFSpaceModelManager.SPACE_MODELS:
@@ -550,6 +944,17 @@ def get_model_info(model_choice: str):
    """
    return "Model information not available"

# ================================
# GRADIO APP CREATION
# ================================
@@ -559,11 +964,26 @@ def create_gaia_app():

    with gr.Blocks(
        title="GAIA Benchmark AI Agent",
-        theme=gr.themes.Soft()
    ) as app:

        gr.HTML("""
-        <div style="text-align: center; font-size: 2.5em; font-weight: bold; margin-bottom: 20px;">
            🧠 GAIA Benchmark AI Agent
        </div>
        <p style="text-align: center; font-size: 1.2em; color: #666;">
@@ -571,9 +991,14 @@ def create_gaia_app():
        </p>
        """)

        with gr.Tabs():

            # TAB 1: MODEL SETUP
            with gr.Tab("🔧 Model Setup"):
                gr.Markdown("## Choose and Load Your Model")
@@ -582,7 +1007,8 @@
                        model_dropdown = gr.Dropdown(
                            choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                            value="Fast & Light",
-                            label="Select Model"
                        )

                        model_info = gr.Markdown(
@@ -594,9 +1020,12 @@

                    with gr.Column(scale=1):
                        gpu_info = gr.Markdown(f"""
-                        ### System Info
                        **CUDA Available**: {torch.cuda.is_available()}
                        {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
                        """)

                model_status = gr.Textbox(
@@ -605,19 +1034,23 @@
                    interactive=False
                )

                model_dropdown.change(
                    fn=get_model_info,
                    inputs=[model_dropdown],
                    outputs=[model_info]
                )

                load_btn.click(
                    fn=load_model_interface,
                    inputs=[model_dropdown],
                    outputs=[model_status]
                )

            # TAB 2: SINGLE QUESTION
            with gr.Tab("❓ Single Question"):
                gr.Markdown("## Test Individual Questions")
@@ -631,7 +1064,8 @@

                        process_btn = gr.Button("🤔 Process Question", variant="primary")

-                        gr.Markdown("### Example Questions:")
                        example_questions = [
                            "What is the capital of France?",
                            "Calculate 144 divided by 12",
@@ -670,13 +1104,16 @@
                    interactive=False
                )

                process_btn.click(
                    fn=single_question_interface,
                    inputs=[question_input],
                    outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                )

            # TAB 3: BATCH EVALUATION
            with gr.Tab("📊 Batch Evaluation"):
                gr.Markdown("## Evaluate Multiple Questions")
@@ -684,15 +1121,17 @@
                        dataset_choice = gr.Radio(
                            choices=["Sample Questions", "GAIA Test Set"],
                            value="Sample Questions",
-                            label="Dataset Choice"
                        )

                        max_questions = gr.Slider(
                            minimum=1,
-                            maximum=50,
-                            value=5,
                            step=1,
-                            label="Max Questions"
                        )

                evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
@@ -715,9 +1154,11 @@
                    value="Run an evaluation to see detailed results"
                )

                def batch_eval_with_download(*args):
                    summary, detailed, jsonl_content = batch_evaluate_interface(*args)

                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    filename = f"gaia_results_{timestamp}.jsonl"
@@ -735,96 +1176,250 @@
                    outputs=[download_output]
                )

            # TAB 4: FULL BENCHMARK
            with gr.Tab("🏆 Full Benchmark"):
-                gr.Markdown("## Official GAIA Leaderboard Benchmark")

                with gr.Row():
                    with gr.Column():
-                        test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
-                        test_preview_output = gr.Markdown(
-                            value="Click above to preview official test questions"
                        )

-                        dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
-                        dataset_structure_output = gr.Markdown(
-                            value="Click above to see actual GAIA dataset structure"
-                        )
-
                    with gr.Column():
-                        question_count = gr.Slider(
-                            minimum=10,
-                            maximum=300,
-                            value=20,
-                            step=10,
-                            label="Number of Questions"
-                        )

-                        selection_strategy = gr.Dropdown(
-                            choices=["balanced", "random", "sequential"],
-                            value="balanced",
-                            label="Selection Strategy"
-                        )

-                benchmark_btn = gr.Button("🎯 Run Benchmark", variant="primary", size="lg")

                benchmark_status = gr.Textbox(
                    label="📊 Benchmark Status",
-                    value="Ready to run benchmark",
                    interactive=False
                )

                with gr.Row():
                    with gr.Column():
                        benchmark_report = gr.Markdown(
-                            label="📈 Benchmark Report",
-                            value="Run benchmark to see detailed results"
                        )

                    with gr.Column():
                        submission_file = gr.File(
                            label="💾 Download Submission File (JSONL)",
                            visible=False
                        )

                        metadata_file = gr.File(
-                            label="📋 Download Metadata File",
                            visible=False
                        )

                # Event handlers
-                test_preview_btn.click(
-                    fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
-                    outputs=[test_preview_output]
-                )
-
-                dataset_structure_btn.click(
-                    fn=preview_dataset_structure_interface,
-                    outputs=[dataset_structure_output]
                )

-                def run_benchmark_wrapper(count, strategy, progress=gr.Progress()):
-                    return run_custom_benchmark_interface(count, strategy, progress)
-
-                def show_download_files(status, report, sub_file, meta_file):
                    return (
-                        status,
                        report,
-                        sub_file,
                        meta_file,
-                        gr.update(visible=True),
-                        gr.update(visible=True)
                    )

-                benchmark_btn.click(
-                    fn=run_benchmark_wrapper,
-                    inputs=[question_count, selection_strategy],
-                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
-                ).then(
-                    fn=show_download_files,
-                    inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
-                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
                )

    return app

@@ -833,6 +1428,18 @@ def create_gaia_app():
# ================================

if __name__ == "__main__":
    app = create_gaia_app()
    app.launch(
        server_name="0.0.0.0",
 
#!/usr/bin/env python3
"""
+GAIA Benchmark AI Agent - Complete Standalone Version
+===================================================
A Gradio-based web interface for running GAIA benchmark evaluations
+with built-in dataset access and authentication.
"""

import gradio as gr
 
    pipeline
)
from datasets import load_dataset
+from huggingface_hub import HfApi, hf_hub_download, list_repo_files

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+# ================================
+# AUTHENTICATION SETUP
+# ================================
+
+def setup_hf_authentication():
+    """Setup HuggingFace authentication for GAIA dataset access"""
+    token = None
+
+    # Method 1: Environment variable
+    token = os.environ.get('HF_TOKEN')
+    if token:
+        logger.info("✅ Found HF_TOKEN in environment")
+        return token
+
+    # Method 2: HuggingFace CLI token
+    try:
+        from huggingface_hub import HfFolder
+        token = HfFolder.get_token()
+        if token:
+            logger.info("✅ Found token from HuggingFace CLI")
+            return token
+    except:
+        pass
+
+    # Method 3: Manual token file
+    token_path = os.path.expanduser("~/.cache/huggingface/token")
+    if os.path.exists(token_path):
+        try:
+            with open(token_path, 'r') as f:
+                token = f.read().strip()
+            if token:
+                logger.info("✅ Found token in cache file")
+                return token
+        except:
+            pass
+
+    logger.warning("⚠️ No HuggingFace token found - GAIA dataset access limited")
+    return None
+
+# Initialize authentication
+HF_TOKEN = setup_hf_authentication()
+
# ================================
# CORE DATA STRUCTURES
# ================================
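The committed resolver tries three sources in order: the `HF_TOKEN` environment variable, the HuggingFace CLI token cache, and finally the raw token file under `~/.cache/huggingface/`. A quick local sanity check can mirror the first two steps; a minimal sketch, assuming a `huggingface_hub` version that still exposes `HfFolder` (illustrative, not part of the commit):

```python
import os
from huggingface_hub import HfFolder

# Same resolution order as setup_hf_authentication: env var first, then CLI cache.
token = os.environ.get("HF_TOKEN") or HfFolder.get_token()
print("token found" if token else "no token - GAIA loading will fall back to sample questions")
```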
 
    """Manages GAIA-specific prompting and formatting"""

    GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:
+
FINAL ANSWER: [YOUR FINAL ANSWER]
+
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    @staticmethod
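The system prompt pins every response to a trailing `FINAL ANSWER:` template, which is what makes automated scoring possible. A minimal regex-based sketch of the matching parser (hypothetical; the commit's own extraction code sits outside this hunk):

```python
import re

def extract_final_answer(response: str) -> str:
    """Return the text after the last 'FINAL ANSWER:' marker, per the GAIA template."""
    matches = re.findall(r"FINAL ANSWER:\s*(.+)", response)
    return matches[-1].strip() if matches else response.strip()

print(extract_final_answer("The capital is Paris.\nFINAL ANSWER: Paris"))  # -> Paris
```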
 
            return f"❌ Error generating response: {str(e)}"

# ================================
+# ENHANCED DATASET MANAGEMENT WITH GAIA ACCESS
# ================================

class GAIADatasetManager:
+    """Manages GAIA dataset loading with authentication and fallbacks"""

    @staticmethod
+    def test_gaia_access() -> Tuple[bool, str]:
+        """Test if we can access the GAIA dataset"""
+        if not HF_TOKEN:
+            return False, "No authentication token found"
+
        try:
+            # Try to load just one item to test access
+            dataset = load_dataset(
+                "gaia-benchmark/GAIA",
+                split="validation",
+                token=HF_TOKEN,
+                trust_remote_code=True
+            )
+            if len(dataset) > 0:
+                return True, f"✅ GAIA dataset accessible ({len(dataset)} validation questions)"
+            else:
+                return False, "Dataset empty"
+        except Exception as e:
+            return False, f"Access failed: {str(e)}"
+
+    @staticmethod
+    def get_gaia_splits() -> List[str]:
+        """Get available GAIA dataset splits"""
+        if not HF_TOKEN:
+            return []
+
+        try:
+            from datasets import get_dataset_config_names, get_dataset_split_names
+            splits = get_dataset_split_names("gaia-benchmark/GAIA", token=HF_TOKEN)
+            return splits
+        except:
+            # Common GAIA splits based on documentation
+            return ["validation", "test"]
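These two probes gate everything downstream: `test_gaia_access` confirms the gated dataset is actually readable with the current token, and `get_gaia_splits` falls back to the documented split names when introspection fails. A typical startup check (usage sketch against the methods above):

```python
# Decide at startup whether to offer real GAIA questions or only bundled samples.
ok, msg = GAIADatasetManager.test_gaia_access()
print(msg)
if ok:
    print("Available splits:", GAIADatasetManager.get_gaia_splits())
```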
+
+    @staticmethod
+    def load_gaia_dataset(split: str = "validation", max_questions: int = None) -> Tuple[List[GAIAQuestion], str]:
+        """Load GAIA dataset from Hugging Face Hub with robust error handling"""
+        try:
+            logger.info(f"Attempting to load GAIA dataset split: {split}")
+
+            if not HF_TOKEN:
+                logger.warning("No HF_TOKEN found, falling back to sample questions")
+                questions = GAIADatasetManager.get_sample_questions()
+                return questions[:max_questions] if max_questions else questions, "⚠️ No authentication - using sample questions"
+
+            # Test access first
+            has_access, access_msg = GAIADatasetManager.test_gaia_access()
+            if not has_access:
+                logger.warning(f"GAIA access test failed: {access_msg}")
+                questions = GAIADatasetManager.get_sample_questions()
+                return questions[:max_questions] if max_questions else questions, f"⚠️ {access_msg} - using sample questions"
+
+            # Load the actual dataset
+            dataset = load_dataset(
+                "gaia-benchmark/GAIA",
+                split=split,
+                token=HF_TOKEN,
+                trust_remote_code=True
+            )
+
+            logger.info(f"Successfully loaded GAIA dataset: {len(dataset)} items")

            questions = []
            items = dataset[:max_questions] if max_questions else dataset

            for i, item in enumerate(items):
+                # Handle different possible field names in GAIA dataset
+                task_id = (item.get('task_id') or
+                           item.get('Task ID') or
+                           item.get('id') or
+                           f'gaia_{split}_{i:03d}')
+
+                question_text = (item.get('Question') or
+                                 item.get('question') or
+                                 item.get('input') or
+                                 'No question text available')
+
+                level = (item.get('Level') or
+                         item.get('level') or
+                         item.get('difficulty') or
+                         1)
+
+                final_answer = (item.get('Final answer') or
+                                item.get('final_answer') or
+                                item.get('answer') or
+                                item.get('target') or
+                                None)
+
+                file_name = (item.get('file_name') or
+                             item.get('File name') or
+                             item.get('files') or
+                             None)
+
+                annotator_metadata = (item.get('Annotator Metadata') or
+                                      item.get('annotator_metadata') or
+                                      item.get('metadata') or
+                                      None)
+
                question = GAIAQuestion(
+                    task_id=str(task_id),
+                    question=str(question_text),
+                    level=int(level),
+                    final_answer=str(final_answer) if final_answer else None,
+                    file_name=str(file_name) if file_name else None,
+                    annotator_metadata=annotator_metadata
                )
                questions.append(question)

            status = f"✅ Loaded {len(questions)} questions from GAIA {split} split"
+            logger.info(status)
            return questions, status

        except Exception as e:
            error_msg = f"❌ Error loading GAIA dataset: {str(e)}"
+            logger.error(error_msg)
+
+            # Fallback to sample questions
+            logger.info("Falling back to sample questions")
+            questions = GAIADatasetManager.get_sample_questions()
+            return questions[:max_questions] if max_questions else questions, f"{error_msg} (Using sample questions instead)"
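Every failure path returns the bundled samples plus an explanatory status, so callers can treat authenticated and fallback runs identically; for example (usage sketch, assuming `GAIAQuestion` exposes `task_id` and `question` attributes as its constructor suggests):

```python
# Real GAIA questions with a valid token, bundled samples otherwise - same shape either way.
questions, status = GAIADatasetManager.load_gaia_dataset(split="validation", max_questions=5)
print(status)
for q in questions:
    print(q.task_id, "-", q.question[:60])
```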

    @staticmethod
    def get_sample_questions() -> List[GAIAQuestion]:
+        """Get sample questions for testing when GAIA dataset is not accessible"""
        sample_data = [
            {
                "task_id": "sample_001",

                "question": "How many continents are there?",
                "level": 1,
                "final_answer": "7"
+            },
+            {
+                "task_id": "sample_009",
+                "question": "What is 25% of 200?",
+                "level": 1,
+                "final_answer": "50"
+            },
+            {
+                "task_id": "sample_010",
+                "question": "In which year did World War II end?",
+                "level": 1,
+                "final_answer": "1945"
+            },
+            {
+                "task_id": "sample_011",
+                "question": "What is the square root of 144?",
+                "level": 2,
+                "final_answer": "12"
+            },
+            {
+                "task_id": "sample_012",
+                "question": "Name the three primary colors.",
+                "level": 1,
+                "final_answer": "red, blue, yellow"
            }
        ]

        return [GAIAQuestion.from_dict(data) for data in sample_data]
+
+    @staticmethod
+    def preview_gaia_dataset() -> str:
+        """Preview GAIA dataset structure and content"""
+        if not HF_TOKEN:
+            return """
+## ⚠️ GAIA Dataset Preview - Authentication Required
+
+To access the GAIA dataset, you need:
+
+1. **Request Access**: https://huggingface.co/datasets/gaia-benchmark/GAIA
+2. **Get Token**: https://huggingface.co/settings/tokens
+3. **Set Token**: `export HF_TOKEN=your_token_here`
+
+### 📋 Sample Questions Available:
+We provide 12 sample questions for testing your setup without authentication.
+Use "Sample Questions" in the evaluation tabs to get started!
+"""
+
+        try:
+            # Test access and get basic info
+            has_access, access_msg = GAIADatasetManager.test_gaia_access()
+
+            if not has_access:
+                return f"""
+## ❌ GAIA Dataset Access Failed
+
+**Error**: {access_msg}
+
+### 🔧 Troubleshooting:
+1. Check your HF_TOKEN is valid
+2. Ensure you have access to GAIA dataset
+3. Try refreshing your token
+
+### 🔄 Alternative:
+Use "Sample Questions" for testing without authentication.
+"""
+
+            # Try to get some preview data
+            dataset = load_dataset(
+                "gaia-benchmark/GAIA",
+                split="validation",
+                token=HF_TOKEN,
+                trust_remote_code=True
+            )
+
+            # Analyze the dataset
+            total_questions = len(dataset)
+
+            # Get level distribution
+            levels = {}
+            sample_questions = []
+
+            for i, item in enumerate(dataset):
+                level = item.get('Level', 1)
+                levels[level] = levels.get(level, 0) + 1
+
+                # Collect a few sample questions
+                if len(sample_questions) < 3:
+                    question_text = item.get('Question', 'No question')
+                    if len(question_text) > 100:
+                        question_text = question_text[:100] + "..."
+                    sample_questions.append(f"- **Level {level}**: {question_text}")
+
+            level_dist = "\n".join([f"- **Level {k}**: {v} questions" for k, v in sorted(levels.items())])
+            sample_text = "\n".join(sample_questions)
+
+            return f"""
+## ✅ GAIA Dataset Preview - Access Confirmed
+
+### 📊 Dataset Statistics:
+- **Total Questions**: {total_questions}
+- **Available Split**: validation (development set)
+
+### 📈 Level Distribution:
+{level_dist}
+
+### 📋 Sample Questions:
+{sample_text}
+
+### 🎯 Ready for Evaluation!
+You can now use "GAIA Test Set" in the evaluation tabs to test your model on real GAIA questions.
+"""
+
+        except Exception as e:
+            return f"""
+## ❌ Error Previewing GAIA Dataset
+
+**Error**: {str(e)}
+
+### 🔄 Recommendations:
+1. Use "Sample Questions" for immediate testing
+2. Check your authentication setup
+3. Try again in a few minutes
+
+### 📞 Need Help?
+- GAIA Dataset: https://huggingface.co/datasets/gaia-benchmark/GAIA
+- HF Authentication: https://huggingface.co/docs/hub/security-tokens
+"""

# ================================
# MAIN GAIA AGENT FOR HF SPACES
 

        summary = f"""
# 📊 GAIA Evaluation Summary
+
## Overall Statistics
- **Total Questions**: {total}
- **Successful**: {successful}
- **Errors**: {errors}
- **Success Rate**: {(successful/total*100):.1f}%
+
## Performance Metrics
- **Average Processing Time**: {avg_time:.2f}s
- **Total Processing Time**: {total_time:.2f}s
- **Questions per Minute**: {(total/(total_time/60)):.1f}
+
## Model Information
- **Model**: {self.current_model}
- **Device**: {self.model_manager.device.upper() if self.model_manager else 'Unknown'}
+- **Authentication**: {'✅ GAIA Access' if HF_TOKEN else '⚠️ Sample Data Only'}
"""
        return summary
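The throughput figure is plain arithmetic, total questions divided by elapsed minutes; e.g.:

```python
# 10 questions answered in 300 seconds -> 2.0 questions per minute.
total, total_time = 10, 300.0
print(f"{total / (total_time / 60):.1f} questions/minute")
```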


            detailed += f"""
## Question {i}: {question.task_id} {status}
+
**Question**: {question.question}
+
**Model Answer**: {result.final_answer}
+
**Expected Answer**: {question.final_answer if question.final_answer else 'N/A'}
+
**Processing Time**: {result.processing_time:.2f}s
+
**Level**: {question.level}
+
---
"""
 
        questions = GAIADatasetManager.get_sample_questions()
        status_msg = f"✅ Loaded {len(questions)} sample questions"
    else:
+        questions, status_msg = GAIADatasetManager.load_gaia_dataset("validation", max_questions)

    if max_questions and len(questions) > max_questions:
        questions = questions[:max_questions]
 

    return summary, detailed, jsonl

+def run_full_benchmark_interface(progress=gr.Progress()):
+    """Run full benchmark on GAIA test set"""
+    try:
+        if gaia_agent.model_manager is None:
+            return (
+                "❌ No model loaded. Please load a model first.",
+                "Load a model in the 'Model Setup' tab before running benchmarks.",
+                None,
+                None
+            )
+
+        progress(0.1, desc="Loading GAIA test dataset...")
+
+        # Try to load the test set (or validation if test is not available)
+        test_questions, test_status = GAIADatasetManager.load_gaia_dataset("test", None)
+
+        if "Error" in test_status or not test_questions:
+            # Fallback to validation set
+            progress(0.15, desc="Test set not available, using validation set...")
+            test_questions, test_status = GAIADatasetManager.load_gaia_dataset("validation", None)
+
+        if not test_questions:
+            return (
+                "❌ No questions available for benchmarking",
+                "Unable to load GAIA dataset. Check your authentication and try 'Sample Questions' first.",
+                None,
+                None
+            )
+
+        progress(0.2, desc=f"Starting full benchmark on {len(test_questions)} questions...")
+
+        # Run the full evaluation
+        summary, detailed, jsonl_content = gaia_agent.batch_evaluate(test_questions, progress)
+
+        # Generate submission files
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # Create submission file for leaderboard
+        submission_filename = f"gaia_submission_{timestamp}.jsonl"
+        with open(submission_filename, 'w') as f:
+            f.write(jsonl_content)
+
+        # Create metadata file
+        metadata = {
+            "submission_type": "full_benchmark",
+            "model_name": gaia_agent.current_model,
+            "timestamp": timestamp,
+            "num_questions": len(test_questions),
+            "dataset_split": "test" if "test" in test_status else "validation",
+            "dataset_status": test_status,
+            "device": gaia_agent.model_manager.device if gaia_agent.model_manager else "unknown",
+            "authentication": "authenticated" if HF_TOKEN else "sample_data"
+        }
+
+        metadata_filename = f"gaia_metadata_{timestamp}.json"
+        with open(metadata_filename, 'w') as f:
+            json.dump(metadata, f, indent=2)
+
+        # Enhanced benchmark report
+        enhanced_summary = f"""
+# 🏆 GAIA Full Benchmark Results
+
+## 📊 Dataset Information
+{test_status}
+
+## 🎯 Benchmark Configuration
+- **Type**: Full GAIA Benchmark
+- **Questions**: {len(test_questions)}
+- **Model**: {gaia_agent.current_model}
+- **Dataset Split**: {"Test" if "test" in test_status else "Validation"}
+- **Timestamp**: {timestamp}
+
+{summary}
+
+## 📤 Leaderboard Submission Ready!
+Your benchmark is complete and submission files are ready:
+
+### 📁 Generated Files:
+- **Submission JSONL**: `{submission_filename}`
+- **Metadata JSON**: `{metadata_filename}`
+
+### 🚀 Next Steps:
+1. **Download** the JSONL file above
+2. **Visit** the GAIA Leaderboard: https://huggingface.co/spaces/gaia-benchmark/leaderboard
+3. **Upload** your submission file
+4. **View** your model's official ranking!
+
+## 🎯 Performance Context
+Your model will be ranked against:
+- **Top Models**: GPT-4 + plugins (~15-20%)
+- **Strong Models**: Claude-3, Gemini Pro (~10-18%)
+- **Human Performance**: ~92% accuracy
+- **Community Average**: ~5-15%
+
+Congratulations on completing the full GAIA benchmark! 🎉
+"""
+
+        return (
+            f"✅ Full benchmark completed! Evaluated {len(test_questions)} questions.",
+            enhanced_summary,
+            submission_filename,
+            metadata_filename
+        )
+
+    except Exception as e:
+        error_msg = f"❌ Full benchmark failed: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+
+        return (
+            error_msg,
+            f"""
+# ❌ Benchmark Error
+
+**Error**: {str(e)}
+
+## 🔧 Troubleshooting Steps:
+1. **Load a model** in "Model Setup" tab first
+2. **Test with small batch** in "Batch Evaluation"
+3. **Use "Sample Questions"** to verify setup
+4. **Check authentication** if using GAIA dataset
+
+## 🔄 Alternative Approach:
+Try "Batch Evaluation" → "GAIA Test Set" → 10-20 questions first.
+""",
+            None,
+            None
+        )
+
+def preview_gaia_interface():
+    """Interface for previewing GAIA dataset"""
+    return GAIADatasetManager.preview_gaia_dataset()
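Before uploading, it is worth confirming that the generated submission file parses line by line and carries the fields the leaderboard expects; a minimal sketch (the `validate_submission` helper is hypothetical, not part of the commit):

```python
import json

def validate_submission(path: str) -> int:
    """Count records, raising if a line is invalid JSON or lacks the required GAIA fields."""
    count = 0
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            assert "task_id" in record and "model_answer" in record, f"bad record: {record}"
            count += 1
    return count

# e.g. validate_submission(submission_filename) right after the benchmark writes it
```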
+
def get_model_info(model_choice: str):
    """Get information about selected model"""
    if model_choice in HFSpaceModelManager.SPACE_MODELS:

    """
    return "Model information not available"

+def get_auth_status():
+    """Get current authentication status"""
+    if HF_TOKEN:
+        has_access, msg = GAIADatasetManager.test_gaia_access()
+        if has_access:
+            return f"✅ **Authenticated & GAIA Access Confirmed**\n{msg}"
+        else:
+            return f"⚠️ **Authenticated but GAIA Access Failed**\n{msg}"
+    else:
+        return "❌ **Not Authenticated** - Using sample questions only\n\nTo access GAIA dataset:\n1. Get access: https://huggingface.co/datasets/gaia-benchmark/GAIA\n2. Set HF_TOKEN environment variable"
+
# ================================
# GRADIO APP CREATION
# ================================
 

    with gr.Blocks(
        title="GAIA Benchmark AI Agent",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {
+            font-family: 'Arial', sans-serif;
+        }
+        .main-header {
+            text-align: center;
+            background: linear-gradient(45deg, #2196F3, #21CBF3);
+            -webkit-background-clip: text;
+            -webkit-text-fill-color: transparent;
+            font-size: 2.5em;
+            font-weight: bold;
+            margin-bottom: 20px;
+        }
+        """
    ) as app:

+        # Header
        gr.HTML("""
+        <div class="main-header">
            🧠 GAIA Benchmark AI Agent
        </div>
        <p style="text-align: center; font-size: 1.2em; color: #666;">

        </p>
        """)

+        # Authentication status at the top
+        auth_status_display = gr.Markdown(value=get_auth_status())
+
        with gr.Tabs():

+            # ===============================
            # TAB 1: MODEL SETUP
+            # ===============================
            with gr.Tab("🔧 Model Setup"):
                gr.Markdown("## Choose and Load Your Model")

                        model_dropdown = gr.Dropdown(
                            choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                            value="Fast & Light",
+                            label="Select Model",
+                            info="Choose based on your quality vs speed preference"
                        )

                        model_info = gr.Markdown(

                    with gr.Column(scale=1):
                        gpu_info = gr.Markdown(f"""
+                        ### 🖥️ System Info
                        **CUDA Available**: {torch.cuda.is_available()}
                        {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
+
+                        ### 🔐 Dataset Access
+                        {get_auth_status()}
                        """)

                model_status = gr.Textbox(
                    interactive=False
                )

+                # Update model info when selection changes
                model_dropdown.change(
                    fn=get_model_info,
                    inputs=[model_dropdown],
                    outputs=[model_info]
                )

+                # Load model when button clicked
                load_btn.click(
                    fn=load_model_interface,
                    inputs=[model_dropdown],
                    outputs=[model_status]
                )
+            # ===============================
            # TAB 2: SINGLE QUESTION
+            # ===============================
            with gr.Tab("❓ Single Question"):
                gr.Markdown("## Test Individual Questions")

                        process_btn = gr.Button("🤔 Process Question", variant="primary")

+                        # Example questions
+                        gr.Markdown("### 💡 Example Questions:")
                        example_questions = [
                            "What is the capital of France?",
                            "Calculate 144 divided by 12",

                    interactive=False
                )

+                # Process single question
                process_btn.click(
                    fn=single_question_interface,
                    inputs=[question_input],
                    outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                )

+            # ===============================
            # TAB 3: BATCH EVALUATION
+            # ===============================
            with gr.Tab("📊 Batch Evaluation"):
                gr.Markdown("## Evaluate Multiple Questions")
 
                        dataset_choice = gr.Radio(
                            choices=["Sample Questions", "GAIA Test Set"],
                            value="Sample Questions",
+                            label="Dataset Choice",
+                            info="Sample Questions work without authentication"
                        )

                        max_questions = gr.Slider(
                            minimum=1,
+                            maximum=100,
+                            value=10,
                            step=1,
+                            label="Max Questions",
+                            info="Number of questions to evaluate"
                        )

                evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")

                    value="Run an evaluation to see detailed results"
                )

+                # Batch evaluation
                def batch_eval_with_download(*args):
                    summary, detailed, jsonl_content = batch_evaluate_interface(*args)

+                    # Save JSONL for download
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    filename = f"gaia_results_{timestamp}.jsonl"

                    outputs=[download_output]
                )
 
+            # ===============================
            # TAB 4: FULL BENCHMARK
+            # ===============================
            with gr.Tab("🏆 Full Benchmark"):
+                gr.Markdown("## Official GAIA Benchmark & Leaderboard Submission")

                with gr.Row():
                    with gr.Column():
+                        # GAIA dataset preview
+                        preview_btn = gr.Button("🔍 Preview GAIA Dataset", variant="secondary")
+                        preview_output = gr.Markdown(
+                            value="Click above to preview the GAIA dataset structure and access status"
                        )

                    with gr.Column():
+                        gr.Markdown("""
+                        ### 🏆 GAIA Leaderboard Info

+                        **What is GAIA?**
+                        - 450+ real-world assistant questions
+                        - 3 difficulty levels (basic → advanced)
+                        - Requires reasoning, tool use, multi-modality
+
+                        **Current Leaderboard:**
+                        - **Best Models**: ~15-20% accuracy
+                        - **Human Performance**: ~92% accuracy
+                        - **Your Goal**: Beat the current best!

+                        **Official Leaderboard:**
+                        https://huggingface.co/spaces/gaia-benchmark/leaderboard
+                        """)
+
+                gr.Markdown("### 🚀 Run Full Benchmark")
+
+                gr.Markdown("""
+                **⚠️ Important Notes:**
+                - This will evaluate your model on the complete GAIA dataset
+                - May take 1-3 hours depending on model and hardware
+                - Generates official leaderboard submission files
+                - Test with smaller batches first to verify your setup
+                """)

+                full_benchmark_btn = gr.Button(
+                    "🏆 Start Full GAIA Benchmark",
+                    variant="primary",
+                    size="lg"
+                )
+
+                # Results section
                benchmark_status = gr.Textbox(
                    label="📊 Benchmark Status",
+                    value="Ready to run full benchmark",
                    interactive=False
                )

                with gr.Row():
                    with gr.Column():
                        benchmark_report = gr.Markdown(
+                            label="📈 Benchmark Report",
+                            value="Run benchmark to see detailed results and leaderboard submission files"
                        )

                    with gr.Column():
+                        # Download files
                        submission_file = gr.File(
                            label="💾 Download Submission File (JSONL)",
                            visible=False
                        )

                        metadata_file = gr.File(
+                            label="📋 Download Metadata File",
                            visible=False
                        )
+
+                gr.Markdown("""
+                ### 📤 Leaderboard Submission Steps
+                1. **Run** the full benchmark above
+                2. **Download** the JSONL submission file
+                3. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
+                4. **Upload** your submission file
+                5. **View** your official ranking!
+                """)

                # Event handlers
+                preview_btn.click(
+                    fn=preview_gaia_interface,
+                    outputs=[preview_output]
                )

+                def full_benchmark_with_files(*args):
+                    status, report, sub_file, meta_file = run_full_benchmark_interface(*args)
                    return (
+                        status,
                        report,
+                        sub_file,
                        meta_file,
+                        gr.update(visible=True if sub_file else False),
+                        gr.update(visible=True if meta_file else False)
                    )

+                full_benchmark_btn.click(
+                    fn=full_benchmark_with_files,
+                    outputs=[
+                        benchmark_status,
+                        benchmark_report,
+                        submission_file,
+                        metadata_file,
+                        submission_file,  # Update visibility
+                        metadata_file  # Update visibility
+                    ]
                )
+
+            # ===============================
+            # TAB 5: HELP & INFO
+            # ===============================
+            with gr.Tab("ℹ️ Help & Info"):
+                gr.Markdown("""
+                # 🧠 GAIA Benchmark AI Agent - Complete Guide
+
+                ## 🎯 Quick Start Guide
+
+                ### 1. **Model Setup** (Required First!)
+                - Choose a model based on your needs
+                - **Fast & Light**: Good for testing, works on CPU
+                - **High Quality**: Best results, requires GPU
+                - Click "Load Model" and wait for success message
+
+                ### 2. **Test Your Setup**
+                - Go to "Single Question" tab
+                - Try example questions like "What is the capital of France?"
+                - Verify your model responds correctly
+
+                ### 3. **Small Batch Test**
+                - Go to "Batch Evaluation" tab
+                - Select "Sample Questions" (works without authentication)
+                - Start with 5-10 questions
+                - Check that evaluation completes and files download
+
+                ### 4. **GAIA Dataset Access** (Optional but Recommended)
+                ```bash
+                # Get your token from https://huggingface.co/settings/tokens
+                export HF_TOKEN=hf_your_token_here
+
+                # Or login via CLI
+                huggingface-cli login
+                ```
+
+                ### 5. **Full Benchmark** (Advanced)
+                - Go to "Full Benchmark" tab
+                - Preview GAIA dataset to confirm access
+                - Run complete evaluation for leaderboard submission
+
+                ## 📊 What is GAIA?
+
+                **GAIA (General AI Assistant)** tests AI on real-world tasks requiring:
+                - **Multi-step reasoning**: Complex logical thinking
+                - **Tool use**: Web browsing, calculations, file processing
+                - **Multi-modality**: Text, images, PDFs, spreadsheets
+                - **Real-world knowledge**: Current events, specialized domains
+
+                ## 🏆 Performance Expectations
+
+                | Model Type | Expected Accuracy | Notes |
+                |------------|------------------|-------|
+                | **Top Commercial** | 15-20% | GPT-4 + plugins, Claude-3 |
+                | **Good Open Source** | 8-15% | Llama-2-70B, Mixtral-8x7B |
+                | **Smaller Models** | 3-8% | 7B parameter models |
+                | **Basic Models** | 1-5% | 3B parameter models |
+                | **Humans** | ~92% | Average human performance |
+
+                ## 🔧 Troubleshooting
+
+                ### Model Loading Issues
+                - **Out of Memory**: Try "Fast & Light" model
+                - **CUDA Errors**: Restart and use CPU mode
+                - **Download Fails**: Check internet connection
+
+                ### Dataset Access Issues
+                - **401 Unauthorized**: Set HF_TOKEN environment variable
+                - **403 Forbidden**: Request GAIA dataset access first
+                - **No Results**: Use "Sample Questions" to test setup
+
+                ### Evaluation Issues
+                - **No Progress**: Ensure model is loaded first
+                - **Errors**: Check model compatibility and memory
+                - **Slow Performance**: Normal for larger models/datasets
+
+                ## 📁 File Formats
+
+                **Submission JSONL Format:**
+                ```json
+                {"task_id": "gaia_001", "model_answer": "Full response...", "reasoning_trace": "Step by step..."}
+                {"task_id": "gaia_002", "model_answer": "Full response...", "reasoning_trace": "Step by step..."}
+                ```
+
+                **Metadata JSON Format:**
+                ```json
+                {
+                    "model_name": "High Quality",
+                    "timestamp": "20240604_143022",
+                    "num_questions": 450,
+                    "dataset_split": "test"
+                }
+                ```
+
+                ## 🚀 Pro Tips
+
+                ### For Best Results:
+                1. **Start Small**: Always test with sample questions first
+                2. **Monitor Resources**: Check GPU memory during evaluation
+                3. **Save Progress**: Download intermediate results frequently
+                4. **Quality Over Speed**: Use better models for leaderboard submissions
+                5. **Analyze Failures**: Review reasoning traces to understand errors
+
+                ### For Leaderboard Success:
+                1. **Test Thoroughly**: Verify setup with small batches
+                2. **Use Best Model**: Don't compromise on model quality
+                3. **Check Format**: Ensure JSONL files are valid
+                4. **Include Metadata**: Helps with debugging and analysis
+                5. **Document Approach**: Note any special techniques used
+
+                ## 🔗 Important Links
+
+                - **GAIA Dataset**: https://huggingface.co/datasets/gaia-benchmark/GAIA
+                - **GAIA Leaderboard**: https://huggingface.co/spaces/gaia-benchmark/leaderboard
+                - **GAIA Paper**: https://arxiv.org/abs/2311.12983
+                - **HuggingFace Tokens**: https://huggingface.co/settings/tokens
+                - **Authentication Guide**: https://huggingface.co/docs/hub/security-tokens
+
+                ## 🎉 Success Checklist
+
+                - [ ] Model loads successfully
+                - [ ] Single question works
+                - [ ] Batch evaluation completes
+                - [ ] Files download properly
+                - [ ] GAIA dataset access (optional)
+                - [ ] Full benchmark completes
+                - [ ] Submission files ready
+                - [ ] Uploaded to leaderboard
+
+                ---
+
+                **Ready to benchmark?** Start with Model Setup and work through each tab systematically. Good luck! 🚀
+                """)

    return app

# ================================

if __name__ == "__main__":
+    # Print startup information
+    print("🧠 GAIA Benchmark AI Agent Starting...")
+    print(f"🔐 Authentication: {'✅ Found HF_TOKEN' if HF_TOKEN else '⚠️ No HF_TOKEN (sample questions only)'}")
+    print(f"🖥️ CUDA Available: {'✅ Yes' if torch.cuda.is_available() else '❌ No (CPU only)'}")
+    if torch.cuda.is_available():
+        print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
+
+    # Test GAIA access if token available
+    if HF_TOKEN:
+        has_access, access_msg = GAIADatasetManager.test_gaia_access()
+        print(f"📊 GAIA Dataset: {'✅ Accessible' if has_access else '⚠️ ' + access_msg}")
+
    app = create_gaia_app()
    app.launch(
        server_name="0.0.0.0",