Ashokdll committed on
Commit 310a013 · verified · 1 Parent(s): ba63dbb

Update app.py

Files changed (1)
  1. app.py +40 -349
app.py CHANGED
@@ -30,14 +30,14 @@ from transformers import (
30
  from datasets import load_dataset
31
  from huggingface_hub import HfApi, hf_hub_download
32
 
33
- # Import leaderboard integration (CORRECTED IMPORTS)
34
  from gaia_leaderboard_integration import (
35
  enhanced_gaia_agent,
36
- run_custom_benchmark_interface, # ← FIXED: was run_leaderboard_benchmark_interface
37
  load_test_questions_interface,
38
- preview_dataset_structure_interface, # ← NEW FUNCTION
39
  get_leaderboard_info,
40
- get_question_selection_info # ← NEW FUNCTION
41
  )
42
 
43
  # Setup logging
@@ -45,6 +45,16 @@ logging.basicConfig(level=logging.INFO)
45
  logger = logging.getLogger(__name__)
46
 
47
  # ================================
 
 
48
  # CORE DATA STRUCTURES
49
  # ================================
50
 
@@ -114,7 +124,6 @@ YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma sepa
114
  class HFSpaceModelManager:
115
  """Hugging Face Spaces optimized model manager"""
116
 
117
- # Space-friendly models with different capabilities
118
  SPACE_MODELS = {
119
  "Fast & Light": {
120
  "name": "microsoft/DialoGPT-medium",
@@ -160,7 +169,6 @@ class HFSpaceModelManager:
160
  if progress_callback:
161
  progress_callback(0.1, "Loading tokenizer...")
162
 
163
- # Load tokenizer
164
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
165
  if self.tokenizer.pad_token is None:
166
  self.tokenizer.pad_token = self.tokenizer.eos_token
@@ -168,7 +176,6 @@ class HFSpaceModelManager:
168
  if progress_callback:
169
  progress_callback(0.3, "Configuring model...")
170
 
171
- # Configure quantization for GPU spaces
172
  quantization_config = None
173
  if self.device == "cuda" and "7b" in self.model_name.lower():
174
  quantization_config = BitsAndBytesConfig(
@@ -181,7 +188,6 @@ class HFSpaceModelManager:
181
  if progress_callback:
182
  progress_callback(0.6, "Loading model weights...")
183
 
184
- # Load model
185
  self.model = AutoModelForCausalLM.from_pretrained(
186
  self.model_name,
187
  quantization_config=quantization_config,
@@ -193,7 +199,6 @@ class HFSpaceModelManager:
193
  if progress_callback:
194
  progress_callback(0.9, "Creating pipeline...")
195
 
196
- # Create pipeline
197
  self.pipeline = pipeline(
198
  "text-generation",
199
  model=self.model,
@@ -221,7 +226,6 @@ class HFSpaceModelManager:
221
  return "❌ Model not loaded. Please load a model first."
222
 
223
  try:
224
- # Truncate prompt if too long
225
  max_input_length = 1000
226
  if len(prompt) > max_input_length:
227
  prompt = prompt[:max_input_length] + "..."
@@ -351,13 +355,10 @@ class GAIASpaceAgent:
351
  self.model_manager = HFSpaceModelManager(model_choice)
352
  self.current_model = model_choice
353
 
354
- # Load model with progress updates
355
  def progress_callback(value, desc):
356
  progress(value, desc=desc)
357
 
358
  result = self.model_manager.load_model(progress_callback)
359
-
360
- # Clear any previous results when changing models
361
  self.evaluation_results = []
362
 
363
  return result
@@ -374,22 +375,15 @@ class GAIASpaceAgent:
374
 
375
  try:
376
  progress(0.2, desc="Creating GAIA prompt...")
377
-
378
- # Create GAIA prompt
379
  prompt = self.prompt_manager.create_gaia_prompt(question_text)
380
 
381
  progress(0.4, desc="Generating response...")
382
-
383
- # Generate response
384
  raw_response = self.model_manager.generate_response(prompt)
385
 
386
  progress(0.8, desc="Extracting final answer...")
387
-
388
- # Extract final answer and reasoning
389
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
390
 
391
  processing_time = time.time() - start_time
392
-
393
  progress(1.0, desc="Complete!")
394
 
395
  return final_answer, raw_response, reasoning, processing_time
@@ -415,17 +409,11 @@ class GAIASpaceAgent:
415
  desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
416
 
417
  start_time = time.time()
418
-
419
- # Create prompt and generate response
420
  prompt = self.prompt_manager.create_gaia_prompt(question.question)
421
  raw_response = self.model_manager.generate_response(prompt)
422
-
423
- # Extract final answer
424
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
425
-
426
  processing_time = time.time() - start_time
427
 
428
- # Create response object
429
  response = GAIAResponse(
430
  task_id=question.task_id,
431
  model_answer=raw_response,
@@ -449,13 +437,8 @@ class GAIASpaceAgent:
449
  results.append(error_response)
450
  self.evaluation_results.append(error_response)
451
 
452
- # Generate summary
453
  summary = self._generate_summary(results)
454
-
455
- # Generate detailed results
456
  detailed_results = self._generate_detailed_results(results, questions)
457
-
458
- # Generate downloadable JSONL
459
  jsonl_content = self._generate_jsonl(results)
460
 
461
  return summary, detailed_results, jsonl_content
@@ -530,7 +513,6 @@ class GAIASpaceAgent:
530
  # GLOBAL AGENT INSTANCE
531
  # ================================
532
 
533
- # Initialize global agent
534
  gaia_agent = GAIASpaceAgent()
535
 
536
  # ================================
@@ -562,20 +544,17 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=g
562
 
563
  progress(0.1, desc="Loading dataset...")
564
 
565
- # Load questions based on choice
566
  if dataset_choice == "Sample Questions":
567
  questions = GAIADatasetManager.get_sample_questions()
568
  status_msg = f"✅ Loaded {len(questions)} sample questions"
569
  else:
570
  questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)
571
 
572
- # Limit questions
573
  if max_questions and len(questions) > max_questions:
574
  questions = questions[:max_questions]
575
 
576
  progress(0.2, desc=f"{status_msg}. Starting evaluation...")
577
 
578
- # Run evaluation
579
  summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
580
 
581
  return summary, detailed, jsonl
@@ -602,26 +581,11 @@ def create_gaia_app():
602
 
603
  with gr.Blocks(
604
  title="GAIA Benchmark AI Agent",
605
- theme=gr.themes.Soft(),
606
- css="""
607
- .gradio-container {
608
- font-family: 'Arial', sans-serif;
609
- }
610
- .main-header {
611
- text-align: center;
612
- background: linear-gradient(45deg, #2196F3, #21CBF3);
613
- -webkit-background-clip: text;
614
- -webkit-text-fill-color: transparent;
615
- font-size: 2.5em;
616
- font-weight: bold;
617
- margin-bottom: 20px;
618
- }
619
- """
620
  ) as app:
621
 
622
- # Header
623
  gr.HTML("""
624
- <div class="main-header">
625
  🧠 GAIA Benchmark AI Agent
626
  </div>
627
  <p style="text-align: center; font-size: 1.2em; color: #666;">
@@ -631,9 +595,7 @@ def create_gaia_app():
631
 
632
  with gr.Tabs():
633
 
634
- # ===============================
635
  # TAB 1: MODEL SETUP
636
- # ===============================
637
  with gr.Tab("🔧 Model Setup"):
638
  gr.Markdown("## Choose and Load Your Model")
639
 
@@ -642,8 +604,7 @@ def create_gaia_app():
642
  model_dropdown = gr.Dropdown(
643
  choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
644
  value="Fast & Light",
645
- label="Select Model",
646
- info="Choose based on your quality vs speed preference"
647
  )
648
 
649
  model_info = gr.Markdown(
@@ -655,7 +616,7 @@ def create_gaia_app():
655
 
656
  with gr.Column(scale=1):
657
  gpu_info = gr.Markdown(f"""
658
- ### 🖥️ System Info
659
  **CUDA Available**: {torch.cuda.is_available()}
660
  {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
661
  """)
@@ -666,23 +627,19 @@ def create_gaia_app():
666
  interactive=False
667
  )
668
 
669
- # Update model info when selection changes
670
  model_dropdown.change(
671
  fn=get_model_info,
672
  inputs=[model_dropdown],
673
  outputs=[model_info]
674
  )
675
 
676
- # Load model when button clicked
677
  load_btn.click(
678
  fn=load_model_interface,
679
  inputs=[model_dropdown],
680
  outputs=[model_status]
681
  )
682
 
683
- # ===============================
684
  # TAB 2: SINGLE QUESTION
685
- # ===============================
686
  with gr.Tab("❓ Single Question"):
687
  gr.Markdown("## Test Individual Questions")
688
 
@@ -696,8 +653,7 @@ def create_gaia_app():
696
 
697
  process_btn = gr.Button("🤔 Process Question", variant="primary")
698
 
699
- # Example questions
700
- gr.Markdown("### 💡 Example Questions:")
701
  example_questions = [
702
  "What is the capital of France?",
703
  "Calculate 144 divided by 12",
@@ -705,11 +661,8 @@ def create_gaia_app():
705
  "Convert 100 degrees Celsius to Fahrenheit"
706
  ]
707
 
708
- for i, example in enumerate(example_questions):
709
- gr.Button(
710
- f"📝 {example}",
711
- size="sm"
712
- ).click(
713
  lambda x=example: x,
714
  outputs=[question_input]
715
  )
@@ -739,16 +692,13 @@ def create_gaia_app():
739
  interactive=False
740
  )
741
 
742
- # Process single question
743
  process_btn.click(
744
  fn=single_question_interface,
745
  inputs=[question_input],
746
  outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
747
  )
748
 
749
- # ===============================
750
  # TAB 3: BATCH EVALUATION
751
- # ===============================
752
  with gr.Tab("📊 Batch Evaluation"):
753
  gr.Markdown("## Evaluate Multiple Questions")
754
 
@@ -756,8 +706,7 @@ def create_gaia_app():
756
  dataset_choice = gr.Radio(
757
  choices=["Sample Questions", "GAIA Test Set"],
758
  value="Sample Questions",
759
- label="Dataset Choice",
760
- info="Start with sample questions to test your setup"
761
  )
762
 
763
  max_questions = gr.Slider(
@@ -765,8 +714,7 @@ def create_gaia_app():
765
  maximum=50,
766
  value=5,
767
  step=1,
768
- label="Max Questions",
769
- info="Number of questions to evaluate"
770
  )
771
 
772
  evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
@@ -789,11 +737,9 @@ def create_gaia_app():
789
  value="Run an evaluation to see detailed results"
790
  )
791
 
792
- # Batch evaluation
793
  def batch_eval_with_download(*args):
794
  summary, detailed, jsonl_content = batch_evaluate_interface(*args)
795
 
796
- # Save JSONL for download
797
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
798
  filename = f"gaia_results_{timestamp}.jsonl"
799
 
@@ -811,47 +757,29 @@ def create_gaia_app():
811
  outputs=[download_output]
812
  )
813
 
814
- # ===============================
815
- # TAB 4: FULL BENCHMARK (ENHANCED FOR 300 QUESTIONS)
816
- # ===============================
817
  with gr.Tab("🏆 Full Benchmark"):
818
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
819
 
820
  with gr.Row():
821
  with gr.Column():
822
- gr.Markdown(get_leaderboard_info())
823
-
824
- with gr.Column():
825
- # Test questions preview
826
  test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
827
  test_preview_output = gr.Markdown(
828
  value="Click above to preview official test questions"
829
  )
830
 
831
- # Dataset structure preview (NEW)
832
  dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
833
  dataset_structure_output = gr.Markdown(
834
  value="Click above to see actual GAIA dataset structure"
835
  )
836
-
837
- # Quick benchmark options
838
- gr.Markdown("### 🎯 Quick Benchmark Options")
839
-
840
- with gr.Row():
841
- # Preset buttons for common configurations
842
- quick_test_btn = gr.Button("🚀 Quick Test (20 questions)", variant="secondary")
843
- medium_test_btn = gr.Button("📊 Medium Test (50 questions)", variant="secondary")
844
- full_benchmark_btn = gr.Button("🏆 FULL BENCHMARK (300 questions)", variant="primary", size="lg")
845
-
846
- # Advanced configuration (collapsible)
847
- with gr.Accordion("🎛️ Advanced Configuration", open=False):
848
- with gr.Row():
849
- custom_count = gr.Slider(
850
  minimum=10,
851
- maximum=300,
852
- value=50,
853
  step=10,
854
- label="Custom Question Count"
855
  )
856
 
857
  selection_strategy = gr.Dropdown(
@@ -859,22 +787,9 @@ def create_gaia_app():
859
  value="balanced",
860
  label="Selection Strategy"
861
  )
862
-
863
- custom_benchmark_btn = gr.Button("🎯 Run Custom Benchmark", variant="secondary")
864
-
865
- # Show selection info
866
- selection_info = gr.Markdown(get_question_selection_info())
867
-
868
- # Warning message for full benchmark
869
- gr.Markdown("""
870
- **⚠️ Full 300-Question Benchmark Warning**:
871
- - **Time**: 1-3 hours depending on model and hardware
872
- - **Cost**: ~$1-3 on GPU (T4 Small recommended)
873
- - **Purpose**: Official leaderboard submission
874
- - **Recommendation**: Test with smaller batches first
875
- """)
876
 
877
- # Results section
878
  benchmark_status = gr.Textbox(
879
  label="📊 Benchmark Status",
880
  value="Ready to run benchmark",
@@ -889,7 +804,6 @@ def create_gaia_app():
889
  )
890
 
891
  with gr.Column():
892
- # Download files
893
  submission_file = gr.File(
894
  label="💾 Download Submission File (JSONL)",
895
  visible=False
@@ -899,20 +813,8 @@ def create_gaia_app():
899
  label="📋 Download Metadata File",
900
  visible=False
901
  )
902
-
903
- gr.Markdown("""
904
- ### 📤 Leaderboard Submission Steps
905
- 1. **Download** the JSONL file above
906
- 2. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
907
- 3. **Upload** your submission file
908
- 4. **View** your model's ranking!
909
- """)
910
 
911
- # ================================
912
- # EVENT HANDLERS (FIXED FUNCTION CALLS)
913
- # ================================
914
-
915
- # Preview functions
916
  test_preview_btn.click(
917
  fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
918
  outputs=[test_preview_output]
@@ -923,240 +825,29 @@ def create_gaia_app():
923
  outputs=[dataset_structure_output]
924
  )
925
 
926
- # Quick benchmark functions
927
- def run_quick_test(progress=gr.Progress()):
928
- return run_custom_benchmark_interface(20, "balanced", progress)
929
-
930
- def run_medium_test(progress=gr.Progress()):
931
- return run_custom_benchmark_interface(50, "balanced", progress)
932
-
933
- def run_full_300_benchmark(progress=gr.Progress()):
934
- return run_custom_benchmark_interface(300, "balanced", progress)
935
-
936
- def run_custom_benchmark_wrapper(count, strategy, progress=gr.Progress()):
937
  return run_custom_benchmark_interface(count, strategy, progress)
938
 
939
- # Helper function to show download files
940
  def show_download_files(status, report, sub_file, meta_file):
941
  return (
942
  status,
943
  report,
944
  sub_file,
945
  meta_file,
946
- gr.update(visible=True), # Show submission file
947
- gr.update(visible=True) # Show metadata file
948
  )
949
 
950
- # Quick test events
951
- quick_test_btn.click(
952
- fn=run_quick_test,
953
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
954
  ).then(
955
  fn=show_download_files,
956
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
957
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
958
  )
959
-
960
- medium_test_btn.click(
961
- fn=run_medium_test,
962
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
963
- ).then(
964
- fn=show_download_files,
965
- inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
966
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
967
- )
968
-
969
- # FULL 300-question benchmark
970
- full_benchmark_btn.click(
971
- fn=run_full_300_benchmark,
972
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
973
- ).then(
974
- fn=show_download_files,
975
- inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
976
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
977
- )
978
-
979
- # Custom benchmark
980
- custom_benchmark_btn.click(
981
- fn=run_custom_benchmark_wrapper,
982
- inputs=[custom_count, selection_strategy],
983
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
984
- ).then(
985
- fn=show_download_files,
986
- inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
987
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
988
- )
989
-
990
- # ===============================
991
- # TAB 5: INFORMATION (UPDATED)
992
- # ===============================
993
- with gr.Tab("ℹ️ Information"):
994
- gr.Markdown("""
995
- # 🧠 GAIA Benchmark AI Agent
996
-
997
- ## What is GAIA?
998
- GAIA (General AI Assistant) is a benchmark designed to test AI assistants on real-world questions that require:
999
- - **Reasoning**: Multi-step logical thinking
1000
- - **Multi-modality**: Handling text, images, and other file types
1001
- - **Web browsing**: Finding and using external information
1002
- - **Tool use**: Calculator, code execution, etc.
1003
-
1004
- ## 🏆 GAIA Public Leaderboard
1005
- GAIA provides a **public leaderboard hosted on Hugging Face** where you can:
1006
- - Test your models against **300 official testing questions**
1007
- - Compare performance with state-of-the-art systems
1008
- - Track progress in AI reasoning capabilities
1009
- - Contribute to research community benchmarks
1010
-
1011
- **Leaderboard URL**: [https://huggingface.co/spaces/gaia-benchmark/leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
1012
-
1013
- ## 🎯 How to Use This Space
1014
-
1015
- ### 1. Model Setup
1016
- - Choose a model based on your needs (speed vs quality)
1017
- - Load the model (this may take a few minutes)
1018
- - Wait for "Model loaded successfully" message
1019
-
1020
- ### 2. Test Single Questions
1021
- - Start with the "Single Question" tab
1022
- - Try example questions to verify everything works
1023
- - Enter your own questions to test model capabilities
1024
-
1025
- ### 3. Batch Evaluation
1026
- - Use "Sample Questions" first to test your setup
1027
- - Then try "GAIA Test Set" for real benchmark evaluation
1028
- - Download results in JSONL format for submission
1029
-
1030
- ### 4. Full Benchmark (Enhanced!)
1031
- - **Quick Tests**: 20 or 50 questions for rapid iteration
1032
- - **Custom Configuration**: Choose exact question count and strategy
1033
- - **Full 300-Question Benchmark**: Complete official evaluation
1034
- - **Leaderboard Ready**: Automatic JSONL generation for submission
1035
-
1036
- ## 📊 Model Recommendations
1037
-
1038
- | Model | Best For | Memory | Speed | Quality | 300Q Time | Cost (T4) |
1039
- |-------|----------|---------|-------|---------|-----------|-----------|
1040
- | Fast & Light | Quick testing | Low | Fast | Good | 45-75 min | ~$0.60-1.00 |
1041
- | Balanced | General use | Medium | Medium | Better | 60-120 min | ~$1.00-2.00 |
1042
- | High Quality | Best results | High | Slow | Best | 90-180 min | ~$1.50-3.00 |
1043
- | Instruction Following | Complex reasoning | High | Medium | Excellent | 75-150 min | ~$1.25-2.50 |
1044
-
1045
- ## 🏅 Benchmark Performance Expectations
1046
-
1047
- Based on current leaderboard standings, expect these performance ranges:
1048
-
1049
- | Difficulty Level | Top Models | Good Models | Baseline Models |
1050
- |------------------|------------|-------------|-----------------|
1051
- | **Level 1** (Basic) | 85-95% | 70-85% | 50-70% |
1052
- | **Level 2** (Intermediate) | 65-80% | 45-65% | 25-45% |
1053
- | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
1054
- | **Overall Average** | 65-75% | 45-65% | 30-45% |
1055
-
1056
- ## 🚀 Flexible Benchmarking Features
1057
-
1058
- ### 🎯 **Custom Question Selection**
1059
- - **Question Count**: Choose 10-300 questions
1060
- - **Selection Strategies**: Balanced, Random, Sequential
1061
- - **Level Distribution**: Automatic balancing across difficulties
1062
- - **Reproducible**: Consistent results with same settings
1063
-
1064
- ### 📊 **Smart Sampling**
1065
- - **Balanced**: Realistic distribution (40% L1, 35% L2, 25% L3)
1066
- - **Representative**: Questions from all difficulty levels
1067
- - **Efficient**: Test fewer questions while maintaining quality
1068
-
1069
- ### ⚡ **Quick Options**
1070
- - **Quick Test (20Q)**: 5-15 minutes, ~$0.10-0.25
1071
- - **Medium Test (50Q)**: 15-30 minutes, ~$0.25-0.50
1072
- - **Full Benchmark (300Q)**: 1-3 hours, ~$1-3
1073
-
1074
- ## 🔄 Continuous Benchmarking Workflow
1075
-
1076
- 1. **Development**: Start with Quick Test (20 questions)
1077
- 2. **Validation**: Use Medium Test (50 questions) for validation
1078
- 3. **Optimization**: Iterate on model improvements
1079
- 4. **Benchmarking**: Run Full Benchmark (300 questions) when ready
1080
- 5. **Submission**: Upload to official GAIA leaderboard
1081
- 6. **Analysis**: Compare with other models and iterate
1082
-
1083
- ## 📋 Official Dataset Integration
1084
-
1085
- ### **Metadata.jsonl Structure**
1086
- - **Questions**: Stored in `2023/validation/metadata.jsonl` and `2023/test/metadata.jsonl`
1087
- - **Additional Files**: Some questions reference images, documents, or data files
1088
- - **Format**: Each line contains one question in JSON format
1089
- - **Fields**: `task_id`, `Question`, `Level`, `file_name` (optional), `Final answer` (validation only)
1090
-
1091
- ### **Submission Format**
1092
- Results are saved in official GAIA leaderboard format:
1093
- ```json
1094
- {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
1095
- {"task_id": "gaia_002", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
1096
- ```
1097
-
1098
- ## ⚡ Pro Tips for Best Results
1099
-
1100
- ### **Performance Optimization**
1101
- 1. **Start Small**: Always test with Quick Test first
1102
- 2. **Choose Wisely**: Balance speed vs quality based on your goals
1103
- 3. **Monitor Resources**: Use GPU acceleration for larger models
1104
- 4. **Validate Format**: Ensure JSONL files are properly formatted
1105
-
1106
- ### **Leaderboard Strategy**
1107
- 1. **Baseline First**: Get initial results with Quick Test
1108
- 2. **Iterate Quickly**: Test improvements on Medium Test
1109
- 3. **Full Benchmark**: Run complete evaluation when ready
1110
- 4. **Compare Results**: Analyze performance across difficulty levels
1111
- 5. **Document Approach**: Include model details and methodology
1112
-
1113
- ### **Cost Management**
1114
- - **Development**: Use Quick Test (20Q) for rapid iteration (~$0.10-0.25)
1115
- - **Validation**: Use Medium Test (50Q) for validation (~$0.25-0.50)
1116
- - **Production**: Use Full Benchmark (300Q) for final submission (~$1-3)
1117
- - **Hardware**: T4 Small GPU recommended for best price/performance
1118
-
1119
- ### **Common Pitfalls to Avoid**
1120
- - Don't run full benchmark on untested models
1121
- - Ensure stable internet connection for long evaluations
1122
- - Verify submission file format before uploading
1123
- - Check GPU memory usage for large models
1124
- - Save intermediate results during long runs
1125
-
1126
- ## 🎯 Getting Started Checklist
1127
-
1128
- - [ ] **Load Model**: Choose and load a model in "Model Setup"
1129
- - [ ] **Test Single**: Try example questions in "Single Question"
1130
- - [ ] **Quick Test**: Run 20-question benchmark to verify setup
1131
- - [ ] **Preview Dataset**: Check "Preview Test Questions" in Full Benchmark
1132
- - [ ] **Medium Test**: Run 50-question validation benchmark
1133
- - [ ] **Full Benchmark**: Run complete 300-question evaluation when ready
1134
- - [ ] **Download Files**: Get JSONL submission and metadata files
1135
- - [ ] **Submit**: Upload to GAIA leaderboard
1136
- - [ ] **Compare**: Analyze your results against other models!
1137
-
1138
- ## 🔗 Resources
1139
- - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
1140
- - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
1141
- - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Official dataset repository
1142
- - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
1143
-
1144
- ---
1145
-
1146
- **Ready to start benchmarking?** Begin with the Model Setup tab, then progress through Quick Test → Medium Test → Full Benchmark. Good luck climbing the leaderboard! 🚀
1147
- """)
1148
 
1149
  return app
1150
 
1151
- # ================================
1152
- # MAIN APPLICATION
1153
- # ================================
1154
-
1155
- if __name__ == "__main__":
1156
- # Create and launch the Gradio app
1157
- app = create_gaia_app()
1158
- app.launch(
1159
- server_name="0.0.0.0",
1160
- server_port=7860,
1161
- share=False
1162
- )
 
30
  from datasets import load_dataset
31
  from huggingface_hub import HfApi, hf_hub_download
32
 
33
+ # Import leaderboard integration
34
  from gaia_leaderboard_integration import (
35
  enhanced_gaia_agent,
36
+ run_custom_benchmark_interface,
37
  load_test_questions_interface,
38
+ preview_dataset_structure_interface,
39
  get_leaderboard_info,
40
+ get_question_selection_info
41
  )
42
 
43
  # Setup logging
 
45
  logger = logging.getLogger(__name__)
46
 
47
  # ================================
48
+ # MAIN APPLICATION
49
+ # ================================
50
+
51
+ if __name__ == "__main__":
52
+ app = create_gaia_app()
53
+ app.launch(
54
+ server_name="0.0.0.0",
55
+ server_port=7860,
56
+ share=False
57
+ )
58
  # CORE DATA STRUCTURES
59
  # ================================
60
 
 
124
  class HFSpaceModelManager:
125
  """Hugging Face Spaces optimized model manager"""
126
 
 
127
  SPACE_MODELS = {
128
  "Fast & Light": {
129
  "name": "microsoft/DialoGPT-medium",
 
169
  if progress_callback:
170
  progress_callback(0.1, "Loading tokenizer...")
171
 
 
172
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
173
  if self.tokenizer.pad_token is None:
174
  self.tokenizer.pad_token = self.tokenizer.eos_token
 
176
  if progress_callback:
177
  progress_callback(0.3, "Configuring model...")
178
 
 
179
  quantization_config = None
180
  if self.device == "cuda" and "7b" in self.model_name.lower():
181
  quantization_config = BitsAndBytesConfig(
 
188
  if progress_callback:
189
  progress_callback(0.6, "Loading model weights...")
190
 
 
191
  self.model = AutoModelForCausalLM.from_pretrained(
192
  self.model_name,
193
  quantization_config=quantization_config,
 
199
  if progress_callback:
200
  progress_callback(0.9, "Creating pipeline...")
201
 
 
202
  self.pipeline = pipeline(
203
  "text-generation",
204
  model=self.model,
 
226
  return "❌ Model not loaded. Please load a model first."
227
 
228
  try:
 
229
  max_input_length = 1000
230
  if len(prompt) > max_input_length:
231
  prompt = prompt[:max_input_length] + "..."
 
355
  self.model_manager = HFSpaceModelManager(model_choice)
356
  self.current_model = model_choice
357
 
 
358
  def progress_callback(value, desc):
359
  progress(value, desc=desc)
360
 
361
  result = self.model_manager.load_model(progress_callback)
 
 
362
  self.evaluation_results = []
363
 
364
  return result
 
375
 
376
  try:
377
  progress(0.2, desc="Creating GAIA prompt...")
 
 
378
  prompt = self.prompt_manager.create_gaia_prompt(question_text)
379
 
380
  progress(0.4, desc="Generating response...")
 
 
381
  raw_response = self.model_manager.generate_response(prompt)
382
 
383
  progress(0.8, desc="Extracting final answer...")
 
 
384
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
385
 
386
  processing_time = time.time() - start_time
 
387
  progress(1.0, desc="Complete!")
388
 
389
  return final_answer, raw_response, reasoning, processing_time
 
409
  desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
410
 
411
  start_time = time.time()
 
 
412
  prompt = self.prompt_manager.create_gaia_prompt(question.question)
413
  raw_response = self.model_manager.generate_response(prompt)
 
 
414
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
 
415
  processing_time = time.time() - start_time
416
 
 
417
  response = GAIAResponse(
418
  task_id=question.task_id,
419
  model_answer=raw_response,
 
437
  results.append(error_response)
438
  self.evaluation_results.append(error_response)
439
 
 
440
  summary = self._generate_summary(results)
 
 
441
  detailed_results = self._generate_detailed_results(results, questions)
 
 
442
  jsonl_content = self._generate_jsonl(results)
443
 
444
  return summary, detailed_results, jsonl_content
 
513
  # GLOBAL AGENT INSTANCE
514
  # ================================
515
 
 
516
  gaia_agent = GAIASpaceAgent()
517
 
518
  # ================================
 
544
 
545
  progress(0.1, desc="Loading dataset...")
546
 
 
547
  if dataset_choice == "Sample Questions":
548
  questions = GAIADatasetManager.get_sample_questions()
549
  status_msg = f"✅ Loaded {len(questions)} sample questions"
550
  else:
551
  questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)
552
 
 
553
  if max_questions and len(questions) > max_questions:
554
  questions = questions[:max_questions]
555
 
556
  progress(0.2, desc=f"{status_msg}. Starting evaluation...")
557
 
 
558
  summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
559
 
560
  return summary, detailed, jsonl
 
581
 
582
  with gr.Blocks(
583
  title="GAIA Benchmark AI Agent",
584
+ theme=gr.themes.Soft()
 
 
 
 
585
  ) as app:
586
 
 
587
  gr.HTML("""
588
+ <div style="text-align: center; font-size: 2.5em; font-weight: bold; margin-bottom: 20px;">
589
  🧠 GAIA Benchmark AI Agent
590
  </div>
591
  <p style="text-align: center; font-size: 1.2em; color: #666;">
 
595
 
596
  with gr.Tabs():
597
 
 
598
  # TAB 1: MODEL SETUP
 
599
  with gr.Tab("🔧 Model Setup"):
600
  gr.Markdown("## Choose and Load Your Model")
601
 
 
604
  model_dropdown = gr.Dropdown(
605
  choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
606
  value="Fast & Light",
607
+ label="Select Model"
 
608
  )
609
 
610
  model_info = gr.Markdown(
 
616
 
617
  with gr.Column(scale=1):
618
  gpu_info = gr.Markdown(f"""
619
+ ### System Info
620
  **CUDA Available**: {torch.cuda.is_available()}
621
  {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
622
  """)
 
627
  interactive=False
628
  )
629
 
 
630
  model_dropdown.change(
631
  fn=get_model_info,
632
  inputs=[model_dropdown],
633
  outputs=[model_info]
634
  )
635
 
 
636
  load_btn.click(
637
  fn=load_model_interface,
638
  inputs=[model_dropdown],
639
  outputs=[model_status]
640
  )
641
 
 
642
  # TAB 2: SINGLE QUESTION
 
643
  with gr.Tab("❓ Single Question"):
644
  gr.Markdown("## Test Individual Questions")
645
 
 
653
 
654
  process_btn = gr.Button("🤔 Process Question", variant="primary")
655
 
656
+ gr.Markdown("### Example Questions:")
 
657
  example_questions = [
658
  "What is the capital of France?",
659
  "Calculate 144 divided by 12",
 
661
  "Convert 100 degrees Celsius to Fahrenheit"
662
  ]
663
 
664
+ for example in example_questions:
665
+ gr.Button(f"📝 {example}", size="sm").click(
 
 
 
666
  lambda x=example: x,
667
  outputs=[question_input]
668
  )
 
692
  interactive=False
693
  )
694
 
 
695
  process_btn.click(
696
  fn=single_question_interface,
697
  inputs=[question_input],
698
  outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
699
  )
700
 
 
701
  # TAB 3: BATCH EVALUATION
 
702
  with gr.Tab("📊 Batch Evaluation"):
703
  gr.Markdown("## Evaluate Multiple Questions")
704
 
 
706
  dataset_choice = gr.Radio(
707
  choices=["Sample Questions", "GAIA Test Set"],
708
  value="Sample Questions",
709
+ label="Dataset Choice"
 
710
  )
711
 
712
  max_questions = gr.Slider(
 
714
  maximum=50,
715
  value=5,
716
  step=1,
717
+ label="Max Questions"
 
718
  )
719
 
720
  evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
 
737
  value="Run an evaluation to see detailed results"
738
  )
739
 
 
740
  def batch_eval_with_download(*args):
741
  summary, detailed, jsonl_content = batch_evaluate_interface(*args)
742
 
 
743
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
744
  filename = f"gaia_results_{timestamp}.jsonl"
745
 
 
757
  outputs=[download_output]
758
  )
759
 
760
+ # TAB 4: FULL BENCHMARK
 
 
761
  with gr.Tab("🏆 Full Benchmark"):
762
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
763
 
764
  with gr.Row():
765
  with gr.Column():
 
 
766
  test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
767
  test_preview_output = gr.Markdown(
768
  value="Click above to preview official test questions"
769
  )
770
 
 
771
  dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
772
  dataset_structure_output = gr.Markdown(
773
  value="Click above to see actual GAIA dataset structure"
774
  )
775
+
776
+ with gr.Column():
777
+ question_count = gr.Slider(
 
 
 
 
778
  minimum=10,
779
+ maximum=300,
780
+ value=20,
781
  step=10,
782
+ label="Number of Questions"
783
  )
784
 
785
  selection_strategy = gr.Dropdown(
 
787
  value="balanced",
788
  label="Selection Strategy"
789
  )
790
+
791
+ benchmark_btn = gr.Button("🎯 Run Benchmark", variant="primary", size="lg")
 
 
 
 
792
 
 
793
  benchmark_status = gr.Textbox(
794
  label="📊 Benchmark Status",
795
  value="Ready to run benchmark",
 
804
  )
805
 
806
  with gr.Column():
 
807
  submission_file = gr.File(
808
  label="💾 Download Submission File (JSONL)",
809
  visible=False
 
813
  label="📋 Download Metadata File",
814
  visible=False
815
  )
 
 
 
 
816
 
817
+ # Event handlers
 
 
818
  test_preview_btn.click(
819
  fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
820
  outputs=[test_preview_output]
 
825
  outputs=[dataset_structure_output]
826
  )
827
 
828
+ def run_benchmark_wrapper(count, strategy, progress=gr.Progress()):
 
 
 
 
829
  return run_custom_benchmark_interface(count, strategy, progress)
830
 
 
831
  def show_download_files(status, report, sub_file, meta_file):
832
  return (
833
  status,
834
  report,
835
  sub_file,
836
  meta_file,
837
+ gr.update(visible=True),
838
+ gr.update(visible=True)
839
  )
840
 
841
+ benchmark_btn.click(
842
+ fn=run_benchmark_wrapper,
843
+ inputs=[question_count, selection_strategy],
844
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
845
  ).then(
846
  fn=show_download_files,
847
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
848
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
849
  )
 
 
 
 
 
850
 
851
  return app
852
 
853
+ # ================================