Ashokdll committed on
Commit 310a013 · verified · 1 Parent(s): ba63dbb

Update app.py

Files changed (1)
  1. app.py +40 -349
app.py CHANGED
@@ -30,14 +30,14 @@ from transformers import (
30
  from datasets import load_dataset
31
  from huggingface_hub import HfApi, hf_hub_download
32
 
33
- # Import leaderboard integration (CORRECTED IMPORTS)
34
  from gaia_leaderboard_integration import (
35
  enhanced_gaia_agent,
36
- run_custom_benchmark_interface, # ← FIXED: was run_leaderboard_benchmark_interface
37
  load_test_questions_interface,
38
- preview_dataset_structure_interface, # ← NEW FUNCTION
39
  get_leaderboard_info,
40
- get_question_selection_info # ← NEW FUNCTION
41
  )
42
 
43
  # Setup logging
@@ -45,6 +45,16 @@ logging.basicConfig(level=logging.INFO)
45
  logger = logging.getLogger(__name__)
46
 
47
  # ================================
 
 
48
  # CORE DATA STRUCTURES
49
  # ================================
50
 
@@ -114,7 +124,6 @@ YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma sepa
114
  class HFSpaceModelManager:
115
  """Hugging Face Spaces optimized model manager"""
116
 
117
- # Space-friendly models with different capabilities
118
  SPACE_MODELS = {
119
  "Fast & Light": {
120
  "name": "microsoft/DialoGPT-medium",
@@ -160,7 +169,6 @@ class HFSpaceModelManager:
160
  if progress_callback:
161
  progress_callback(0.1, "Loading tokenizer...")
162
 
163
- # Load tokenizer
164
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
165
  if self.tokenizer.pad_token is None:
166
  self.tokenizer.pad_token = self.tokenizer.eos_token
@@ -168,7 +176,6 @@ class HFSpaceModelManager:
168
  if progress_callback:
169
  progress_callback(0.3, "Configuring model...")
170
 
171
- # Configure quantization for GPU spaces
172
  quantization_config = None
173
  if self.device == "cuda" and "7b" in self.model_name.lower():
174
  quantization_config = BitsAndBytesConfig(
@@ -181,7 +188,6 @@ class HFSpaceModelManager:
181
  if progress_callback:
182
  progress_callback(0.6, "Loading model weights...")
183
 
184
- # Load model
185
  self.model = AutoModelForCausalLM.from_pretrained(
186
  self.model_name,
187
  quantization_config=quantization_config,
@@ -193,7 +199,6 @@ class HFSpaceModelManager:
193
  if progress_callback:
194
  progress_callback(0.9, "Creating pipeline...")
195
 
196
- # Create pipeline
197
  self.pipeline = pipeline(
198
  "text-generation",
199
  model=self.model,
@@ -221,7 +226,6 @@ class HFSpaceModelManager:
221
  return "❌ Model not loaded. Please load a model first."
222
 
223
  try:
224
- # Truncate prompt if too long
225
  max_input_length = 1000
226
  if len(prompt) > max_input_length:
227
  prompt = prompt[:max_input_length] + "..."
@@ -351,13 +355,10 @@ class GAIASpaceAgent:
351
  self.model_manager = HFSpaceModelManager(model_choice)
352
  self.current_model = model_choice
353
 
354
- # Load model with progress updates
355
  def progress_callback(value, desc):
356
  progress(value, desc=desc)
357
 
358
  result = self.model_manager.load_model(progress_callback)
359
-
360
- # Clear any previous results when changing models
361
  self.evaluation_results = []
362
 
363
  return result
@@ -374,22 +375,15 @@ class GAIASpaceAgent:
374
 
375
  try:
376
  progress(0.2, desc="Creating GAIA prompt...")
377
-
378
- # Create GAIA prompt
379
  prompt = self.prompt_manager.create_gaia_prompt(question_text)
380
 
381
  progress(0.4, desc="Generating response...")
382
-
383
- # Generate response
384
  raw_response = self.model_manager.generate_response(prompt)
385
 
386
  progress(0.8, desc="Extracting final answer...")
387
-
388
- # Extract final answer and reasoning
389
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
390
 
391
  processing_time = time.time() - start_time
392
-
393
  progress(1.0, desc="Complete!")
394
 
395
  return final_answer, raw_response, reasoning, processing_time
@@ -415,17 +409,11 @@ class GAIASpaceAgent:
415
  desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
416
 
417
  start_time = time.time()
418
-
419
- # Create prompt and generate response
420
  prompt = self.prompt_manager.create_gaia_prompt(question.question)
421
  raw_response = self.model_manager.generate_response(prompt)
422
-
423
- # Extract final answer
424
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
425
-
426
  processing_time = time.time() - start_time
427
 
428
- # Create response object
429
  response = GAIAResponse(
430
  task_id=question.task_id,
431
  model_answer=raw_response,
@@ -449,13 +437,8 @@ class GAIASpaceAgent:
449
  results.append(error_response)
450
  self.evaluation_results.append(error_response)
451
 
452
- # Generate summary
453
  summary = self._generate_summary(results)
454
-
455
- # Generate detailed results
456
  detailed_results = self._generate_detailed_results(results, questions)
457
-
458
- # Generate downloadable JSONL
459
  jsonl_content = self._generate_jsonl(results)
460
 
461
  return summary, detailed_results, jsonl_content
@@ -530,7 +513,6 @@ class GAIASpaceAgent:
530
  # GLOBAL AGENT INSTANCE
531
  # ================================
532
 
533
- # Initialize global agent
534
  gaia_agent = GAIASpaceAgent()
535
 
536
  # ================================
@@ -562,20 +544,17 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=g
562
 
563
  progress(0.1, desc="Loading dataset...")
564
 
565
- # Load questions based on choice
566
  if dataset_choice == "Sample Questions":
567
  questions = GAIADatasetManager.get_sample_questions()
568
  status_msg = f"✅ Loaded {len(questions)} sample questions"
569
  else:
570
  questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)
571
 
572
- # Limit questions
573
  if max_questions and len(questions) > max_questions:
574
  questions = questions[:max_questions]
575
 
576
  progress(0.2, desc=f"{status_msg}. Starting evaluation...")
577
 
578
- # Run evaluation
579
  summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
580
 
581
  return summary, detailed, jsonl
@@ -602,26 +581,11 @@ def create_gaia_app():
602
 
603
  with gr.Blocks(
604
  title="GAIA Benchmark AI Agent",
605
- theme=gr.themes.Soft(),
606
- css="""
607
- .gradio-container {
608
- font-family: 'Arial', sans-serif;
609
- }
610
- .main-header {
611
- text-align: center;
612
- background: linear-gradient(45deg, #2196F3, #21CBF3);
613
- -webkit-background-clip: text;
614
- -webkit-text-fill-color: transparent;
615
- font-size: 2.5em;
616
- font-weight: bold;
617
- margin-bottom: 20px;
618
- }
619
- """
620
  ) as app:
621
 
622
- # Header
623
  gr.HTML("""
624
- <div class="main-header">
625
  🧠 GAIA Benchmark AI Agent
626
  </div>
627
  <p style="text-align: center; font-size: 1.2em; color: #666;">
@@ -631,9 +595,7 @@ def create_gaia_app():
631
 
632
  with gr.Tabs():
633
 
634
- # ===============================
635
  # TAB 1: MODEL SETUP
636
- # ===============================
637
  with gr.Tab("🔧 Model Setup"):
638
  gr.Markdown("## Choose and Load Your Model")
639
 
@@ -642,8 +604,7 @@ def create_gaia_app():
642
  model_dropdown = gr.Dropdown(
643
  choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
644
  value="Fast & Light",
645
- label="Select Model",
646
- info="Choose based on your quality vs speed preference"
647
  )
648
 
649
  model_info = gr.Markdown(
@@ -655,7 +616,7 @@ def create_gaia_app():
655
 
656
  with gr.Column(scale=1):
657
  gpu_info = gr.Markdown(f"""
658
- ### 🖥️ System Info
659
  **CUDA Available**: {torch.cuda.is_available()}
660
  {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
661
  """)
@@ -666,23 +627,19 @@ def create_gaia_app():
666
  interactive=False
667
  )
668
 
669
- # Update model info when selection changes
670
  model_dropdown.change(
671
  fn=get_model_info,
672
  inputs=[model_dropdown],
673
  outputs=[model_info]
674
  )
675
 
676
- # Load model when button clicked
677
  load_btn.click(
678
  fn=load_model_interface,
679
  inputs=[model_dropdown],
680
  outputs=[model_status]
681
  )
682
 
683
- # ===============================
684
  # TAB 2: SINGLE QUESTION
685
- # ===============================
686
  with gr.Tab("❓ Single Question"):
687
  gr.Markdown("## Test Individual Questions")
688
 
@@ -696,8 +653,7 @@ def create_gaia_app():
696
 
697
  process_btn = gr.Button("🤔 Process Question", variant="primary")
698
 
699
- # Example questions
700
- gr.Markdown("### 💡 Example Questions:")
701
  example_questions = [
702
  "What is the capital of France?",
703
  "Calculate 144 divided by 12",
@@ -705,11 +661,8 @@ def create_gaia_app():
705
  "Convert 100 degrees Celsius to Fahrenheit"
706
  ]
707
 
708
- for i, example in enumerate(example_questions):
709
- gr.Button(
710
- f"📝 {example}",
711
- size="sm"
712
- ).click(
713
  lambda x=example: x,
714
  outputs=[question_input]
715
  )
@@ -739,16 +692,13 @@ def create_gaia_app():
739
  interactive=False
740
  )
741
 
742
- # Process single question
743
  process_btn.click(
744
  fn=single_question_interface,
745
  inputs=[question_input],
746
  outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
747
  )
748
 
749
- # ===============================
750
  # TAB 3: BATCH EVALUATION
751
- # ===============================
752
  with gr.Tab("📊 Batch Evaluation"):
753
  gr.Markdown("## Evaluate Multiple Questions")
754
 
@@ -756,8 +706,7 @@ def create_gaia_app():
756
  dataset_choice = gr.Radio(
757
  choices=["Sample Questions", "GAIA Test Set"],
758
  value="Sample Questions",
759
- label="Dataset Choice",
760
- info="Start with sample questions to test your setup"
761
  )
762
 
763
  max_questions = gr.Slider(
@@ -765,8 +714,7 @@ def create_gaia_app():
765
  maximum=50,
766
  value=5,
767
  step=1,
768
- label="Max Questions",
769
- info="Number of questions to evaluate"
770
  )
771
 
772
  evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
@@ -789,11 +737,9 @@ def create_gaia_app():
789
  value="Run an evaluation to see detailed results"
790
  )
791
 
792
- # Batch evaluation
793
  def batch_eval_with_download(*args):
794
  summary, detailed, jsonl_content = batch_evaluate_interface(*args)
795
 
796
- # Save JSONL for download
797
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
798
  filename = f"gaia_results_{timestamp}.jsonl"
799
 
@@ -811,47 +757,29 @@ def create_gaia_app():
811
  outputs=[download_output]
812
  )
813
 
814
- # ===============================
815
- # TAB 4: FULL BENCHMARK (ENHANCED FOR 300 QUESTIONS)
816
- # ===============================
817
  with gr.Tab("🏆 Full Benchmark"):
818
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
819
 
820
  with gr.Row():
821
  with gr.Column():
822
- gr.Markdown(get_leaderboard_info())
823
-
824
- with gr.Column():
825
- # Test questions preview
826
  test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
827
  test_preview_output = gr.Markdown(
828
  value="Click above to preview official test questions"
829
  )
830
 
831
- # Dataset structure preview (NEW)
832
  dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
833
  dataset_structure_output = gr.Markdown(
834
  value="Click above to see actual GAIA dataset structure"
835
  )
836
-
837
- # Quick benchmark options
838
- gr.Markdown("### 🎯 Quick Benchmark Options")
839
-
840
- with gr.Row():
841
- # Preset buttons for common configurations
842
- quick_test_btn = gr.Button("🚀 Quick Test (20 questions)", variant="secondary")
843
- medium_test_btn = gr.Button("📊 Medium Test (50 questions)", variant="secondary")
844
- full_benchmark_btn = gr.Button("🏆 FULL BENCHMARK (300 questions)", variant="primary", size="lg")
845
-
846
- # Advanced configuration (collapsible)
847
- with gr.Accordion("🎛️ Advanced Configuration", open=False):
848
- with gr.Row():
849
- custom_count = gr.Slider(
850
  minimum=10,
851
- maximum=300,
852
- value=50,
853
  step=10,
854
- label="Custom Question Count"
855
  )
856
 
857
  selection_strategy = gr.Dropdown(
@@ -859,22 +787,9 @@ def create_gaia_app():
859
  value="balanced",
860
  label="Selection Strategy"
861
  )
862
-
863
- custom_benchmark_btn = gr.Button("🎯 Run Custom Benchmark", variant="secondary")
864
-
865
- # Show selection info
866
- selection_info = gr.Markdown(get_question_selection_info())
867
-
868
- # Warning message for full benchmark
869
- gr.Markdown("""
870
- **⚠️ Full 300-Question Benchmark Warning**:
871
- - **Time**: 1-3 hours depending on model and hardware
872
- - **Cost**: ~$1-3 on GPU (T4 Small recommended)
873
- - **Purpose**: Official leaderboard submission
874
- - **Recommendation**: Test with smaller batches first
875
- """)
876
 
877
- # Results section
878
  benchmark_status = gr.Textbox(
879
  label="📊 Benchmark Status",
880
  value="Ready to run benchmark",
@@ -889,7 +804,6 @@ def create_gaia_app():
889
  )
890
 
891
  with gr.Column():
892
- # Download files
893
  submission_file = gr.File(
894
  label="💾 Download Submission File (JSONL)",
895
  visible=False
@@ -899,20 +813,8 @@ def create_gaia_app():
899
  label="📋 Download Metadata File",
900
  visible=False
901
  )
902
-
903
- gr.Markdown("""
904
- ### 📤 Leaderboard Submission Steps
905
- 1. **Download** the JSONL file above
906
- 2. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
907
- 3. **Upload** your submission file
908
- 4. **View** your model's ranking!
909
- """)
910
 
911
- # ================================
912
- # EVENT HANDLERS (FIXED FUNCTION CALLS)
913
- # ================================
914
-
915
- # Preview functions
916
  test_preview_btn.click(
917
  fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
918
  outputs=[test_preview_output]
@@ -923,240 +825,29 @@ def create_gaia_app():
923
  outputs=[dataset_structure_output]
924
  )
925
 
926
- # Quick benchmark functions
927
- def run_quick_test(progress=gr.Progress()):
928
- return run_custom_benchmark_interface(20, "balanced", progress)
929
-
930
- def run_medium_test(progress=gr.Progress()):
931
- return run_custom_benchmark_interface(50, "balanced", progress)
932
-
933
- def run_full_300_benchmark(progress=gr.Progress()):
934
- return run_custom_benchmark_interface(300, "balanced", progress)
935
-
936
- def run_custom_benchmark_wrapper(count, strategy, progress=gr.Progress()):
937
  return run_custom_benchmark_interface(count, strategy, progress)
938
 
939
- # Helper function to show download files
940
  def show_download_files(status, report, sub_file, meta_file):
941
  return (
942
  status,
943
  report,
944
  sub_file,
945
  meta_file,
946
- gr.update(visible=True), # Show submission file
947
- gr.update(visible=True) # Show metadata file
948
  )
949
 
950
- # Quick test events
951
- quick_test_btn.click(
952
- fn=run_quick_test,
953
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
954
  ).then(
955
  fn=show_download_files,
956
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
957
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
958
  )
959
-
960
- medium_test_btn.click(
961
- fn=run_medium_test,
962
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
963
- ).then(
964
- fn=show_download_files,
965
- inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
966
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
967
- )
968
-
969
- # FULL 300-question benchmark
970
- full_benchmark_btn.click(
971
- fn=run_full_300_benchmark,
972
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
973
- ).then(
974
- fn=show_download_files,
975
- inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
976
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
977
- )
978
-
979
- # Custom benchmark
980
- custom_benchmark_btn.click(
981
- fn=run_custom_benchmark_wrapper,
982
- inputs=[custom_count, selection_strategy],
983
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
984
- ).then(
985
- fn=show_download_files,
986
- inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
987
- outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
988
- )
989
-
990
- # ===============================
991
- # TAB 5: INFORMATION (UPDATED)
992
- # ===============================
993
- with gr.Tab("ℹ️ Information"):
994
- gr.Markdown("""
995
- # 🧠 GAIA Benchmark AI Agent
996
-
997
- ## What is GAIA?
998
- GAIA (General AI Assistant) is a benchmark designed to test AI assistants on real-world questions that require:
999
- - **Reasoning**: Multi-step logical thinking
1000
- - **Multi-modality**: Handling text, images, and other file types
1001
- - **Web browsing**: Finding and using external information
1002
- - **Tool use**: Calculator, code execution, etc.
1003
-
1004
- ## 🏆 GAIA Public Leaderboard
1005
- GAIA provides a **public leaderboard hosted on Hugging Face** where you can:
1006
- - Test your models against **300 official testing questions**
1007
- - Compare performance with state-of-the-art systems
1008
- - Track progress in AI reasoning capabilities
1009
- - Contribute to research community benchmarks
1010
-
1011
- **Leaderboard URL**: [https://huggingface.co/spaces/gaia-benchmark/leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
1012
-
1013
- ## 🎯 How to Use This Space
1014
-
1015
- ### 1. Model Setup
1016
- - Choose a model based on your needs (speed vs quality)
1017
- - Load the model (this may take a few minutes)
1018
- - Wait for "Model loaded successfully" message
1019
-
1020
- ### 2. Test Single Questions
1021
- - Start with the "Single Question" tab
1022
- - Try example questions to verify everything works
1023
- - Enter your own questions to test model capabilities
1024
-
1025
- ### 3. Batch Evaluation
1026
- - Use "Sample Questions" first to test your setup
1027
- - Then try "GAIA Test Set" for real benchmark evaluation
1028
- - Download results in JSONL format for submission
1029
-
1030
- ### 4. Full Benchmark (Enhanced!)
1031
- - **Quick Tests**: 20 or 50 questions for rapid iteration
1032
- - **Custom Configuration**: Choose exact question count and strategy
1033
- - **Full 300-Question Benchmark**: Complete official evaluation
1034
- - **Leaderboard Ready**: Automatic JSONL generation for submission
1035
-
1036
- ## 📊 Model Recommendations
1037
-
1038
- | Model | Best For | Memory | Speed | Quality | 300Q Time | Cost (T4) |
1039
- |-------|----------|---------|-------|---------|-----------|-----------|
1040
- | Fast & Light | Quick testing | Low | Fast | Good | 45-75 min | ~$0.60-1.00 |
1041
- | Balanced | General use | Medium | Medium | Better | 60-120 min | ~$1.00-2.00 |
1042
- | High Quality | Best results | High | Slow | Best | 90-180 min | ~$1.50-3.00 |
1043
- | Instruction Following | Complex reasoning | High | Medium | Excellent | 75-150 min | ~$1.25-2.50 |
1044
-
1045
- ## 🏅 Benchmark Performance Expectations
1046
-
1047
- Based on current leaderboard standings, expect these performance ranges:
1048
-
1049
- | Difficulty Level | Top Models | Good Models | Baseline Models |
1050
- |------------------|------------|-------------|-----------------|
1051
- | **Level 1** (Basic) | 85-95% | 70-85% | 50-70% |
1052
- | **Level 2** (Intermediate) | 65-80% | 45-65% | 25-45% |
1053
- | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
1054
- | **Overall Average** | 65-75% | 45-65% | 30-45% |
1055
-
1056
- ## 🚀 Flexible Benchmarking Features
1057
-
1058
- ### 🎯 **Custom Question Selection**
1059
- - **Question Count**: Choose 10-300 questions
1060
- - **Selection Strategies**: Balanced, Random, Sequential
1061
- - **Level Distribution**: Automatic balancing across difficulties
1062
- - **Reproducible**: Consistent results with same settings
1063
-
1064
- ### 📊 **Smart Sampling**
1065
- - **Balanced**: Realistic distribution (40% L1, 35% L2, 25% L3)
1066
- - **Representative**: Questions from all difficulty levels
1067
- - **Efficient**: Test fewer questions while maintaining quality
1068
-
1069
- ### ⚡ **Quick Options**
1070
- - **Quick Test (20Q)**: 5-15 minutes, ~$0.10-0.25
1071
- - **Medium Test (50Q)**: 15-30 minutes, ~$0.25-0.50
1072
- - **Full Benchmark (300Q)**: 1-3 hours, ~$1-3
1073
-
1074
- ## 🔄 Continuous Benchmarking Workflow
1075
-
1076
- 1. **Development**: Start with Quick Test (20 questions)
1077
- 2. **Validation**: Use Medium Test (50 questions) for validation
1078
- 3. **Optimization**: Iterate on model improvements
1079
- 4. **Benchmarking**: Run Full Benchmark (300 questions) when ready
1080
- 5. **Submission**: Upload to official GAIA leaderboard
1081
- 6. **Analysis**: Compare with other models and iterate
1082
-
1083
- ## 📋 Official Dataset Integration
1084
-
1085
- ### **Metadata.jsonl Structure**
1086
- - **Questions**: Stored in `2023/validation/metadata.jsonl` and `2023/test/metadata.jsonl`
1087
- - **Additional Files**: Some questions reference images, documents, or data files
1088
- - **Format**: Each line contains one question in JSON format
1089
- - **Fields**: `task_id`, `Question`, `Level`, `file_name` (optional), `Final answer` (validation only)
1090
-
1091
- ### **Submission Format**
1092
- Results are saved in official GAIA leaderboard format:
1093
- ```json
1094
- {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
1095
- {"task_id": "gaia_002", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
1096
- ```
1097
-
1098
- ## ⚡ Pro Tips for Best Results
1099
-
1100
- ### **Performance Optimization**
1101
- 1. **Start Small**: Always test with Quick Test first
1102
- 2. **Choose Wisely**: Balance speed vs quality based on your goals
1103
- 3. **Monitor Resources**: Use GPU acceleration for larger models
1104
- 4. **Validate Format**: Ensure JSONL files are properly formatted
1105
-
1106
- ### **Leaderboard Strategy**
1107
- 1. **Baseline First**: Get initial results with Quick Test
1108
- 2. **Iterate Quickly**: Test improvements on Medium Test
1109
- 3. **Full Benchmark**: Run complete evaluation when ready
1110
- 4. **Compare Results**: Analyze performance across difficulty levels
1111
- 5. **Document Approach**: Include model details and methodology
1112
-
1113
- ### **Cost Management**
1114
- - **Development**: Use Quick Test (20Q) for rapid iteration (~$0.10-0.25)
1115
- - **Validation**: Use Medium Test (50Q) for validation (~$0.25-0.50)
1116
- - **Production**: Use Full Benchmark (300Q) for final submission (~$1-3)
1117
- - **Hardware**: T4 Small GPU recommended for best price/performance
1118
-
1119
- ### **Common Pitfalls to Avoid**
1120
- - Don't run full benchmark on untested models
1121
- - Ensure stable internet connection for long evaluations
1122
- - Verify submission file format before uploading
1123
- - Check GPU memory usage for large models
1124
- - Save intermediate results during long runs
1125
-
1126
- ## 🎯 Getting Started Checklist
1127
-
1128
- - [ ] **Load Model**: Choose and load a model in "Model Setup"
1129
- - [ ] **Test Single**: Try example questions in "Single Question"
1130
- - [ ] **Quick Test**: Run 20-question benchmark to verify setup
1131
- - [ ] **Preview Dataset**: Check "Preview Test Questions" in Full Benchmark
1132
- - [ ] **Medium Test**: Run 50-question validation benchmark
1133
- - [ ] **Full Benchmark**: Run complete 300-question evaluation when ready
1134
- - [ ] **Download Files**: Get JSONL submission and metadata files
1135
- - [ ] **Submit**: Upload to GAIA leaderboard
1136
- - [ ] **Compare**: Analyze your results against other models!
1137
-
1138
- ## 🔗 Resources
1139
- - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
1140
- - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
1141
- - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Official dataset repository
1142
- - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
1143
-
1144
- ---
1145
-
1146
- **Ready to start benchmarking?** Begin with the Model Setup tab, then progress through Quick Test → Medium Test → Full Benchmark. Good luck climbing the leaderboard! 🚀
1147
- """)
1148
 
1149
  return app
1150
 
1151
- # ================================
1152
- # MAIN APPLICATION
1153
- # ================================
1154
-
1155
- if __name__ == "__main__":
1156
- # Create and launch the Gradio app
1157
- app = create_gaia_app()
1158
- app.launch(
1159
- server_name="0.0.0.0",
1160
- server_port=7860,
1161
- share=False
1162
- )
 
30
  from datasets import load_dataset
31
  from huggingface_hub import HfApi, hf_hub_download
32
 
33
+ # Import leaderboard integration
34
  from gaia_leaderboard_integration import (
35
  enhanced_gaia_agent,
36
+ run_custom_benchmark_interface,
37
  load_test_questions_interface,
38
+ preview_dataset_structure_interface,
39
  get_leaderboard_info,
40
+ get_question_selection_info
41
  )
42
 
43
  # Setup logging
 
45
  logger = logging.getLogger(__name__)
46
 
47
  # ================================
48
+ # MAIN APPLICATION
49
+ # ================================
50
+
51
+ if __name__ == "__main__":
52
+ app = create_gaia_app()
53
+ app.launch(
54
+ server_name="0.0.0.0",
55
+ server_port=7860,
56
+ share=False
57
+ )
58
  # CORE DATA STRUCTURES
59
  # ================================
60
 
 
124
  class HFSpaceModelManager:
125
  """Hugging Face Spaces optimized model manager"""
126
 
 
127
  SPACE_MODELS = {
128
  "Fast & Light": {
129
  "name": "microsoft/DialoGPT-medium",
 
169
  if progress_callback:
170
  progress_callback(0.1, "Loading tokenizer...")
171
 
 
172
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
173
  if self.tokenizer.pad_token is None:
174
  self.tokenizer.pad_token = self.tokenizer.eos_token
 
176
  if progress_callback:
177
  progress_callback(0.3, "Configuring model...")
178
 
 
179
  quantization_config = None
180
  if self.device == "cuda" and "7b" in self.model_name.lower():
181
  quantization_config = BitsAndBytesConfig(
 
188
  if progress_callback:
189
  progress_callback(0.6, "Loading model weights...")
190
 
 
191
  self.model = AutoModelForCausalLM.from_pretrained(
192
  self.model_name,
193
  quantization_config=quantization_config,
 
199
  if progress_callback:
200
  progress_callback(0.9, "Creating pipeline...")
201
 
 
202
  self.pipeline = pipeline(
203
  "text-generation",
204
  model=self.model,
 
226
  return "❌ Model not loaded. Please load a model first."
227
 
228
  try:
 
229
  max_input_length = 1000
230
  if len(prompt) > max_input_length:
231
  prompt = prompt[:max_input_length] + "..."
 
355
  self.model_manager = HFSpaceModelManager(model_choice)
356
  self.current_model = model_choice
357
 
 
358
  def progress_callback(value, desc):
359
  progress(value, desc=desc)
360
 
361
  result = self.model_manager.load_model(progress_callback)
 
 
362
  self.evaluation_results = []
363
 
364
  return result
 
375
 
376
  try:
377
  progress(0.2, desc="Creating GAIA prompt...")
 
 
378
  prompt = self.prompt_manager.create_gaia_prompt(question_text)
379
 
380
  progress(0.4, desc="Generating response...")
 
 
381
  raw_response = self.model_manager.generate_response(prompt)
382
 
383
  progress(0.8, desc="Extracting final answer...")
 
 
384
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
385
 
386
  processing_time = time.time() - start_time
 
387
  progress(1.0, desc="Complete!")
388
 
389
  return final_answer, raw_response, reasoning, processing_time
 
409
  desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
410
 
411
  start_time = time.time()
 
 
412
  prompt = self.prompt_manager.create_gaia_prompt(question.question)
413
  raw_response = self.model_manager.generate_response(prompt)
 
 
414
  final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
 
415
  processing_time = time.time() - start_time
416
 
 
417
  response = GAIAResponse(
418
  task_id=question.task_id,
419
  model_answer=raw_response,
 
437
  results.append(error_response)
438
  self.evaluation_results.append(error_response)
439
 
 
440
  summary = self._generate_summary(results)
 
 
441
  detailed_results = self._generate_detailed_results(results, questions)
 
 
442
  jsonl_content = self._generate_jsonl(results)
443
 
444
  return summary, detailed_results, jsonl_content
 
513
  # GLOBAL AGENT INSTANCE
514
  # ================================
515
 
 
516
  gaia_agent = GAIASpaceAgent()
517
 
518
  # ================================
 
544
 
545
  progress(0.1, desc="Loading dataset...")
546
 
 
547
  if dataset_choice == "Sample Questions":
548
  questions = GAIADatasetManager.get_sample_questions()
549
  status_msg = f"✅ Loaded {len(questions)} sample questions"
550
  else:
551
  questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)
552
 
 
553
  if max_questions and len(questions) > max_questions:
554
  questions = questions[:max_questions]
555
 
556
  progress(0.2, desc=f"{status_msg}. Starting evaluation...")
557
 
 
558
  summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
559
 
560
  return summary, detailed, jsonl
 
581
 
582
  with gr.Blocks(
583
  title="GAIA Benchmark AI Agent",
584
+ theme=gr.themes.Soft()
 
 
 
 
585
  ) as app:
586
 
 
587
  gr.HTML("""
588
+ <div style="text-align: center; font-size: 2.5em; font-weight: bold; margin-bottom: 20px;">
589
  🧠 GAIA Benchmark AI Agent
590
  </div>
591
  <p style="text-align: center; font-size: 1.2em; color: #666;">
 
595
 
596
  with gr.Tabs():
597
 
 
598
  # TAB 1: MODEL SETUP
 
599
  with gr.Tab("🔧 Model Setup"):
600
  gr.Markdown("## Choose and Load Your Model")
601
 
 
604
  model_dropdown = gr.Dropdown(
605
  choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
606
  value="Fast & Light",
607
+ label="Select Model"
 
608
  )
609
 
610
  model_info = gr.Markdown(
 
616
 
617
  with gr.Column(scale=1):
618
  gpu_info = gr.Markdown(f"""
619
+ ### System Info
620
  **CUDA Available**: {torch.cuda.is_available()}
621
  {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
622
  """)
 
627
  interactive=False
628
  )
629
 
 
630
  model_dropdown.change(
631
  fn=get_model_info,
632
  inputs=[model_dropdown],
633
  outputs=[model_info]
634
  )
635
 
 
636
  load_btn.click(
637
  fn=load_model_interface,
638
  inputs=[model_dropdown],
639
  outputs=[model_status]
640
  )
641
 
 
642
  # TAB 2: SINGLE QUESTION
 
643
  with gr.Tab("❓ Single Question"):
644
  gr.Markdown("## Test Individual Questions")
645
 
 
653
 
654
  process_btn = gr.Button("🤔 Process Question", variant="primary")
655
 
656
+ gr.Markdown("### Example Questions:")
 
657
  example_questions = [
658
  "What is the capital of France?",
659
  "Calculate 144 divided by 12",
 
661
  "Convert 100 degrees Celsius to Fahrenheit"
662
  ]
663
 
664
+ for example in example_questions:
665
+ gr.Button(f"📝 {example}", size="sm").click(
 
 
 
666
  lambda x=example: x,
667
  outputs=[question_input]
668
  )
 
692
  interactive=False
693
  )
694
 
 
695
  process_btn.click(
696
  fn=single_question_interface,
697
  inputs=[question_input],
698
  outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
699
  )
700
 
 
701
  # TAB 3: BATCH EVALUATION
 
702
  with gr.Tab("📊 Batch Evaluation"):
703
  gr.Markdown("## Evaluate Multiple Questions")
704
 
 
706
  dataset_choice = gr.Radio(
707
  choices=["Sample Questions", "GAIA Test Set"],
708
  value="Sample Questions",
709
+ label="Dataset Choice"
 
710
  )
711
 
712
  max_questions = gr.Slider(
 
714
  maximum=50,
715
  value=5,
716
  step=1,
717
+ label="Max Questions"
 
718
  )
719
 
720
  evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
 
737
  value="Run an evaluation to see detailed results"
738
  )
739
 
 
740
  def batch_eval_with_download(*args):
741
  summary, detailed, jsonl_content = batch_evaluate_interface(*args)
742
 
 
743
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
744
  filename = f"gaia_results_{timestamp}.jsonl"
745
 
 
757
  outputs=[download_output]
758
  )
759
 
760
+ # TAB 4: FULL BENCHMARK
 
 
761
  with gr.Tab("🏆 Full Benchmark"):
762
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
763
 
764
  with gr.Row():
765
  with gr.Column():
 
 
766
  test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
767
  test_preview_output = gr.Markdown(
768
  value="Click above to preview official test questions"
769
  )
770
 
 
771
  dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
772
  dataset_structure_output = gr.Markdown(
773
  value="Click above to see actual GAIA dataset structure"
774
  )
775
+
776
+ with gr.Column():
777
+ question_count = gr.Slider(
 
 
 
 
778
  minimum=10,
779
+ maximum=300,
780
+ value=20,
781
  step=10,
782
+ label="Number of Questions"
783
  )
784
 
785
  selection_strategy = gr.Dropdown(
 
787
  value="balanced",
788
  label="Selection Strategy"
789
  )
790
+
791
+ benchmark_btn = gr.Button("🎯 Run Benchmark", variant="primary", size="lg")
 
 
 
 
792
 
 
793
  benchmark_status = gr.Textbox(
794
  label="📊 Benchmark Status",
795
  value="Ready to run benchmark",
 
804
  )
805
 
806
  with gr.Column():
 
807
  submission_file = gr.File(
808
  label="💾 Download Submission File (JSONL)",
809
  visible=False
 
813
  label="📋 Download Metadata File",
814
  visible=False
815
  )
 
 
 
 
816
 
817
+ # Event handlers
 
 
818
  test_preview_btn.click(
819
  fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
820
  outputs=[test_preview_output]
 
825
  outputs=[dataset_structure_output]
826
  )
827
 
828
+ def run_benchmark_wrapper(count, strategy, progress=gr.Progress()):
 
 
 
 
829
  return run_custom_benchmark_interface(count, strategy, progress)
830
 
 
831
  def show_download_files(status, report, sub_file, meta_file):
832
  return (
833
  status,
834
  report,
835
  sub_file,
836
  meta_file,
837
+ gr.update(visible=True),
838
+ gr.update(visible=True)
839
  )
840
 
841
+ benchmark_btn.click(
842
+ fn=run_benchmark_wrapper,
843
+ inputs=[question_count, selection_strategy],
844
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
845
  ).then(
846
  fn=show_download_files,
847
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
848
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
849
  )
 
 
 
 
 
850
 
851
  return app
852
 
853
+ # ================================