Update app.py

app.py CHANGED
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 """
-GAIA Benchmark AI Agent -
-
+GAIA Benchmark AI Agent - Complete Standalone Version
+===================================================
 A Gradio-based web interface for running GAIA benchmark evaluations
-
+with built-in dataset access and authentication.
 """

 import gradio as gr
@@ -27,22 +27,54 @@ from transformers import (
     pipeline
 )
 from datasets import load_dataset
-from huggingface_hub import HfApi, hf_hub_download
-
-# Import leaderboard integration
-from gaia_leaderboard_integration import (
-    enhanced_gaia_agent,
-    run_custom_benchmark_interface,
-    load_test_questions_interface,
-    preview_dataset_structure_interface,
-    get_leaderboard_info,
-    get_question_selection_info
-)
+from huggingface_hub import HfApi, hf_hub_download, list_repo_files

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+# ================================
+# AUTHENTICATION SETUP
+# ================================
+
+def setup_hf_authentication():
+    """Setup HuggingFace authentication for GAIA dataset access"""
+    token = None
+
+    # Method 1: Environment variable
+    token = os.environ.get('HF_TOKEN')
+    if token:
+        logger.info("✅ Found HF_TOKEN in environment")
+        return token
+
+    # Method 2: HuggingFace CLI token
+    try:
+        from huggingface_hub import HfFolder
+        token = HfFolder.get_token()
+        if token:
+            logger.info("✅ Found token from HuggingFace CLI")
+            return token
+    except:
+        pass
+
+    # Method 3: Manual token file
+    token_path = os.path.expanduser("~/.cache/huggingface/token")
+    if os.path.exists(token_path):
+        try:
+            with open(token_path, 'r') as f:
+                token = f.read().strip()
+            if token:
+                logger.info("✅ Found token in cache file")
+                return token
+        except:
+            pass
+
+    logger.warning("⚠️ No HuggingFace token found - GAIA dataset access limited")
+    return None
+
+# Initialize authentication
+HF_TOKEN = setup_hf_authentication()
+
 # ================================
 # CORE DATA STRUCTURES
 # ================================
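Aside (not part of the commit): the three-step token lookup above can be smoke-tested outside the Space. A minimal sketch, assuming only `huggingface_hub` (which app.py already imports); `check_token` is a hypothetical helper name:

```python
import os
from typing import Optional

from huggingface_hub import HfApi

def check_token(token: Optional[str]) -> None:
    """Validate a HuggingFace token by asking the Hub whose it is."""
    if not token:
        print("No token found - gated datasets such as GAIA stay inaccessible")
        return
    try:
        user = HfApi().whoami(token=token)  # raises if the token is invalid or expired
        print(f"Token OK - authenticated as {user['name']}")
    except Exception as exc:
        print(f"Token rejected: {exc}")

check_token(os.environ.get("HF_TOKEN"))
```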
@@ -79,7 +111,9 @@ class GAIAPromptManager:
     """Manages GAIA-specific prompting and formatting"""

     GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:
+
 FINAL ANSWER: [YOUR FINAL ANSWER]
+
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

     @staticmethod
@@ -233,42 +267,137 @@ class HFSpaceModelManager:
             return f"❌ Error generating response: {str(e)}"

 # ================================
-# DATASET MANAGEMENT
+# ENHANCED DATASET MANAGEMENT WITH GAIA ACCESS
 # ================================

 class GAIADatasetManager:
-    """Manages GAIA dataset loading and
+    """Manages GAIA dataset loading with authentication and fallbacks"""

     @staticmethod
-    def
-        """
+    def test_gaia_access() -> Tuple[bool, str]:
+        """Test if we can access the GAIA dataset"""
+        if not HF_TOKEN:
+            return False, "No authentication token found"
+
         try:
-
+            # Try to load just one item to test access
+            dataset = load_dataset(
+                "gaia-benchmark/GAIA",
+                split="validation",
+                token=HF_TOKEN,
+                trust_remote_code=True
+            )
+            if len(dataset) > 0:
+                return True, f"✅ GAIA dataset accessible ({len(dataset)} validation questions)"
+            else:
+                return False, "Dataset empty"
+        except Exception as e:
+            return False, f"Access failed: {str(e)}"
+
+    @staticmethod
+    def get_gaia_splits() -> List[str]:
+        """Get available GAIA dataset splits"""
+        if not HF_TOKEN:
+            return []
+
+        try:
+            from datasets import get_dataset_config_names, get_dataset_split_names
+            splits = get_dataset_split_names("gaia-benchmark/GAIA", token=HF_TOKEN)
+            return splits
+        except:
+            # Common GAIA splits based on documentation
+            return ["validation", "test"]
+
+    @staticmethod
+    def load_gaia_dataset(split: str = "validation", max_questions: int = None) -> Tuple[List[GAIAQuestion], str]:
+        """Load GAIA dataset from Hugging Face Hub with robust error handling"""
+        try:
+            logger.info(f"Attempting to load GAIA dataset split: {split}")
+
+            if not HF_TOKEN:
+                logger.warning("No HF_TOKEN found, falling back to sample questions")
+                questions = GAIADatasetManager.get_sample_questions()
+                return questions[:max_questions] if max_questions else questions, "⚠️ No authentication - using sample questions"
+
+            # Test access first
+            has_access, access_msg = GAIADatasetManager.test_gaia_access()
+            if not has_access:
+                logger.warning(f"GAIA access test failed: {access_msg}")
+                questions = GAIADatasetManager.get_sample_questions()
+                return questions[:max_questions] if max_questions else questions, f"⚠️ {access_msg} - using sample questions"
+
+            # Load the actual dataset
+            dataset = load_dataset(
+                "gaia-benchmark/GAIA",
+                split=split,
+                token=HF_TOKEN,
+                trust_remote_code=True
+            )
+
+            logger.info(f"Successfully loaded GAIA dataset: {len(dataset)} items")

             questions = []
             items = dataset[:max_questions] if max_questions else dataset

             for i, item in enumerate(items):
+                # Handle different possible field names in GAIA dataset
+                task_id = (item.get('task_id') or
+                           item.get('Task ID') or
+                           item.get('id') or
+                           f'gaia_{split}_{i:03d}')
+
+                question_text = (item.get('Question') or
+                                 item.get('question') or
+                                 item.get('input') or
+                                 'No question text available')
+
+                level = (item.get('Level') or
+                         item.get('level') or
+                         item.get('difficulty') or
+                         1)
+
+                final_answer = (item.get('Final answer') or
+                                item.get('final_answer') or
+                                item.get('answer') or
+                                item.get('target') or
+                                None)
+
+                file_name = (item.get('file_name') or
+                             item.get('File name') or
+                             item.get('files') or
+                             None)
+
+                annotator_metadata = (item.get('Annotator Metadata') or
+                                      item.get('annotator_metadata') or
+                                      item.get('metadata') or
+                                      None)
+
                 question = GAIAQuestion(
-                    task_id=
-                    question=
-                    level=
-                    final_answer=
-                    file_name=
-                    annotator_metadata=
+                    task_id=str(task_id),
+                    question=str(question_text),
+                    level=int(level),
+                    final_answer=str(final_answer) if final_answer else None,
+                    file_name=str(file_name) if file_name else None,
+                    annotator_metadata=annotator_metadata
                 )
                 questions.append(question)

             status = f"✅ Loaded {len(questions)} questions from GAIA {split} split"
+            logger.info(status)
             return questions, status

         except Exception as e:
             error_msg = f"❌ Error loading GAIA dataset: {str(e)}"
-
+            logger.error(error_msg)
+
+            # Fallback to sample questions
+            logger.info("Falling back to sample questions")
+            questions = GAIADatasetManager.get_sample_questions()
+            return questions[:max_questions] if max_questions else questions, f"{error_msg} (Using sample questions instead)"

     @staticmethod
     def get_sample_questions() -> List[GAIAQuestion]:
-        """Get sample questions for testing"""
+        """Get sample questions for testing when GAIA dataset is not accessible"""
         sample_data = [
             {
                 "task_id": "sample_001",
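Aside (not part of the commit): the six chained-`or` fallback blocks above repeat one pattern, and they collapse into a single helper. A sketch under that observation (`first_present` is my name, not the app's):

```python
def first_present(item: dict, keys: list, default=None):
    """Return the first truthy value among several candidate field names."""
    for key in keys:
        value = item.get(key)
        if value:
            return value
    return default

# Equivalent to two of the chained `or` expressions in the hunk above:
record = {"Question": "How many continents are there?", "Level": 1}
question_text = first_present(record, ["Question", "question", "input"],
                              "No question text available")
level = first_present(record, ["Level", "level", "difficulty"], 1)
print(question_text, level)
```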
@@ -317,10 +446,133 @@ class GAIADatasetManager:
                 "question": "How many continents are there?",
                 "level": 1,
                 "final_answer": "7"
+            },
+            {
+                "task_id": "sample_009",
+                "question": "What is 25% of 200?",
+                "level": 1,
+                "final_answer": "50"
+            },
+            {
+                "task_id": "sample_010",
+                "question": "In which year did World War II end?",
+                "level": 1,
+                "final_answer": "1945"
+            },
+            {
+                "task_id": "sample_011",
+                "question": "What is the square root of 144?",
+                "level": 2,
+                "final_answer": "12"
+            },
+            {
+                "task_id": "sample_012",
+                "question": "Name the three primary colors.",
+                "level": 1,
+                "final_answer": "red, blue, yellow"
             }
         ]

         return [GAIAQuestion.from_dict(data) for data in sample_data]
+
+    @staticmethod
+    def preview_gaia_dataset() -> str:
+        """Preview GAIA dataset structure and content"""
+        if not HF_TOKEN:
+            return """
+## ⚠️ GAIA Dataset Preview - Authentication Required
+
+To access the GAIA dataset, you need:
+
+1. **Request Access**: https://huggingface.co/datasets/gaia-benchmark/GAIA
+2. **Get Token**: https://huggingface.co/settings/tokens
+3. **Set Token**: `export HF_TOKEN=your_token_here`
+
+### 📋 Sample Questions Available:
+We provide 12 sample questions for testing your setup without authentication.
+Use "Sample Questions" in the evaluation tabs to get started!
+"""
+
+        try:
+            # Test access and get basic info
+            has_access, access_msg = GAIADatasetManager.test_gaia_access()
+
+            if not has_access:
+                return f"""
+## ❌ GAIA Dataset Access Failed
+
+**Error**: {access_msg}
+
+### 🔧 Troubleshooting:
+1. Check your HF_TOKEN is valid
+2. Ensure you have access to GAIA dataset
+3. Try refreshing your token
+
+### 🔄 Alternative:
+Use "Sample Questions" for testing without authentication.
+"""
+
+            # Try to get some preview data
+            dataset = load_dataset(
+                "gaia-benchmark/GAIA",
+                split="validation",
+                token=HF_TOKEN,
+                trust_remote_code=True
+            )
+
+            # Analyze the dataset
+            total_questions = len(dataset)
+
+            # Get level distribution
+            levels = {}
+            sample_questions = []
+
+            for i, item in enumerate(dataset):
+                level = item.get('Level', 1)
+                levels[level] = levels.get(level, 0) + 1
+
+                # Collect a few sample questions
+                if len(sample_questions) < 3:
+                    question_text = item.get('Question', 'No question')
+                    if len(question_text) > 100:
+                        question_text = question_text[:100] + "..."
+                    sample_questions.append(f"- **Level {level}**: {question_text}")
+
+            level_dist = "\n".join([f"- **Level {k}**: {v} questions" for k, v in sorted(levels.items())])
+            sample_text = "\n".join(sample_questions)
+
+            return f"""
+## ✅ GAIA Dataset Preview - Access Confirmed
+
+### 📊 Dataset Statistics:
+- **Total Questions**: {total_questions}
+- **Available Split**: validation (development set)
+
+### 📈 Level Distribution:
+{level_dist}
+
+### 📋 Sample Questions:
+{sample_text}
+
+### 🎯 Ready for Evaluation!
+You can now use "GAIA Test Set" in the evaluation tabs to test your model on real GAIA questions.
+"""
+
+        except Exception as e:
+            return f"""
+## ❌ Error Previewing GAIA Dataset
+
+**Error**: {str(e)}
+
+### 🔄 Recommendations:
+1. Use "Sample Questions" for immediate testing
+2. Check your authentication setup
+3. Try again in a few minutes
+
+### 📞 Need Help?
+- GAIA Dataset: https://huggingface.co/datasets/gaia-benchmark/GAIA
+- HF Authentication: https://huggingface.co/docs/hub/security-tokens
+"""

 # ================================
 # MAIN GAIA AGENT FOR HF SPACES
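Aside (not part of the commit): the manual `levels` histogram in `preview_gaia_dataset` is exactly what `collections.Counter` computes. A sketch with made-up records rather than real GAIA rows:

```python
from collections import Counter

records = [{"Level": 1}, {"Level": 2}, {"Level": 1}, {"Level": 3}]
levels = Counter(item.get("Level", 1) for item in records)
for level, count in sorted(levels.items()):
    print(f"- **Level {level}**: {count} questions")
```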
@@ -440,18 +692,22 @@ class GAIASpaceAgent:

         summary = f"""
 # 📊 GAIA Evaluation Summary
+
 ## Overall Statistics
 - **Total Questions**: {total}
 - **Successful**: {successful}
 - **Errors**: {errors}
 - **Success Rate**: {(successful/total*100):.1f}%
+
 ## Performance Metrics
 - **Average Processing Time**: {avg_time:.2f}s
 - **Total Processing Time**: {total_time:.2f}s
 - **Questions per Minute**: {(total/(total_time/60)):.1f}
+
 ## Model Information
 - **Model**: {self.current_model}
 - **Device**: {self.model_manager.device.upper() if self.model_manager else 'Unknown'}
+- **Authentication**: {'✅ GAIA Access' if HF_TOKEN else '⚠️ Sample Data Only'}
 """
         return summary

@@ -464,11 +720,17 @@ class GAIASpaceAgent:

         detailed += f"""
 ## Question {i}: {question.task_id} {status}
+
 **Question**: {question.question}
+
 **Model Answer**: {result.final_answer}
+
 **Expected Answer**: {question.final_answer if question.final_answer else 'N/A'}
+
 **Processing Time**: {result.processing_time:.2f}s
+
 **Level**: {question.level}
+
 ---
 """

@@ -526,7 +788,7 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=gr.Progress()):
         questions = GAIADatasetManager.get_sample_questions()
         status_msg = f"✅ Loaded {len(questions)} sample questions"
     else:
-        questions, status_msg = GAIADatasetManager.load_gaia_dataset("
+        questions, status_msg = GAIADatasetManager.load_gaia_dataset("validation", max_questions)

     if max_questions and len(questions) > max_questions:
         questions = questions[:max_questions]
@@ -537,6 +799,138 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=gr.Progress()):

     return summary, detailed, jsonl

+def run_full_benchmark_interface(progress=gr.Progress()):
+    """Run full benchmark on GAIA test set"""
+    try:
+        if gaia_agent.model_manager is None:
+            return (
+                "❌ No model loaded. Please load a model first.",
+                "Load a model in the 'Model Setup' tab before running benchmarks.",
+                None,
+                None
+            )
+
+        progress(0.1, desc="Loading GAIA test dataset...")
+
+        # Try to load the test set (or validation if test is not available)
+        test_questions, test_status = GAIADatasetManager.load_gaia_dataset("test", None)
+
+        if "Error" in test_status or not test_questions:
+            # Fallback to validation set
+            progress(0.15, desc="Test set not available, using validation set...")
+            test_questions, test_status = GAIADatasetManager.load_gaia_dataset("validation", None)
+
+        if not test_questions:
+            return (
+                "❌ No questions available for benchmarking",
+                "Unable to load GAIA dataset. Check your authentication and try 'Sample Questions' first.",
+                None,
+                None
+            )
+
+        progress(0.2, desc=f"Starting full benchmark on {len(test_questions)} questions...")
+
+        # Run the full evaluation
+        summary, detailed, jsonl_content = gaia_agent.batch_evaluate(test_questions, progress)
+
+        # Generate submission files
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # Create submission file for leaderboard
+        submission_filename = f"gaia_submission_{timestamp}.jsonl"
+        with open(submission_filename, 'w') as f:
+            f.write(jsonl_content)
+
+        # Create metadata file
+        metadata = {
+            "submission_type": "full_benchmark",
+            "model_name": gaia_agent.current_model,
+            "timestamp": timestamp,
+            "num_questions": len(test_questions),
+            "dataset_split": "test" if "test" in test_status else "validation",
+            "dataset_status": test_status,
+            "device": gaia_agent.model_manager.device if gaia_agent.model_manager else "unknown",
+            "authentication": "authenticated" if HF_TOKEN else "sample_data"
+        }
+
+        metadata_filename = f"gaia_metadata_{timestamp}.json"
+        with open(metadata_filename, 'w') as f:
+            json.dump(metadata, f, indent=2)
+
+        # Enhanced benchmark report
+        enhanced_summary = f"""
+# 🏆 GAIA Full Benchmark Results
+
+## 📊 Dataset Information
+{test_status}
+
+## 🎯 Benchmark Configuration
+- **Type**: Full GAIA Benchmark
+- **Questions**: {len(test_questions)}
+- **Model**: {gaia_agent.current_model}
+- **Dataset Split**: {"Test" if "test" in test_status else "Validation"}
+- **Timestamp**: {timestamp}
+
+{summary}
+
+## 📤 Leaderboard Submission Ready!
+Your benchmark is complete and submission files are ready:
+
+### 📁 Generated Files:
+- **Submission JSONL**: `{submission_filename}`
+- **Metadata JSON**: `{metadata_filename}`
+
+### 🚀 Next Steps:
+1. **Download** the JSONL file above
+2. **Visit** the GAIA Leaderboard: https://huggingface.co/spaces/gaia-benchmark/leaderboard
+3. **Upload** your submission file
+4. **View** your model's official ranking!
+
+## 🎯 Performance Context
+Your model will be ranked against:
+- **Top Models**: GPT-4 + plugins (~15-20%)
+- **Strong Models**: Claude-3, Gemini Pro (~10-18%)
+- **Human Performance**: ~92% accuracy
+- **Community Average**: ~5-15%
+
+Congratulations on completing the full GAIA benchmark! 🎉
+"""
+
+        return (
+            f"✅ Full benchmark completed! Evaluated {len(test_questions)} questions.",
+            enhanced_summary,
+            submission_filename,
+            metadata_filename
+        )
+
+    except Exception as e:
+        error_msg = f"❌ Full benchmark failed: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+
+        return (
+            error_msg,
+            f"""
+# ❌ Benchmark Error
+
+**Error**: {str(e)}
+
+## 🔧 Troubleshooting Steps:
+1. **Load a model** in "Model Setup" tab first
+2. **Test with small batch** in "Batch Evaluation"
+3. **Use "Sample Questions"** to verify setup
+4. **Check authentication** if using GAIA dataset
+
+## 🔄 Alternative Approach:
+Try "Batch Evaluation" → "GAIA Test Set" → 10-20 questions first.
+""",
+            None,
+            None
+        )
+
+def preview_gaia_interface():
+    """Interface for previewing GAIA dataset"""
+    return GAIADatasetManager.preview_gaia_dataset()
+
 def get_model_info(model_choice: str):
     """Get information about selected model"""
     if model_choice in HFSpaceModelManager.SPACE_MODELS:
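Aside (not part of the commit): before uploading, the generated submission file can be checked line by line. A minimal validator sketch; the required keys follow the JSONL format shown in the Help tab below and are my assumption, not the leaderboard's official schema:

```python
import json

def validate_submission(path: str) -> bool:
    """Check that every line parses as JSON and carries the expected keys."""
    ok = True
    with open(path) as f:
        for lineno, line in enumerate(f, start=1):
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                print(f"line {lineno}: invalid JSON ({exc})")
                ok = False
                continue
            for key in ("task_id", "model_answer"):
                if key not in record:
                    print(f"line {lineno}: missing '{key}'")
                    ok = False
    return ok

# validate_submission("gaia_submission_20240604_143022.jsonl")
```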
@@ -550,6 +944,17 @@ def get_model_info(model_choice: str):
     """
     return "Model information not available"

+def get_auth_status():
+    """Get current authentication status"""
+    if HF_TOKEN:
+        has_access, msg = GAIADatasetManager.test_gaia_access()
+        if has_access:
+            return f"✅ **Authenticated & GAIA Access Confirmed**\n{msg}"
+        else:
+            return f"⚠️ **Authenticated but GAIA Access Failed**\n{msg}"
+    else:
+        return "❌ **Not Authenticated** - Using sample questions only\n\nTo access GAIA dataset:\n1. Get access: https://huggingface.co/datasets/gaia-benchmark/GAIA\n2. Set HF_TOKEN environment variable"
+
 # ================================
 # GRADIO APP CREATION
 # ================================
@@ -559,11 +964,26 @@ def create_gaia_app():

     with gr.Blocks(
         title="GAIA Benchmark AI Agent",
-        theme=gr.themes.Soft()
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {
+            font-family: 'Arial', sans-serif;
+        }
+        .main-header {
+            text-align: center;
+            background: linear-gradient(45deg, #2196F3, #21CBF3);
+            -webkit-background-clip: text;
+            -webkit-text-fill-color: transparent;
+            font-size: 2.5em;
+            font-weight: bold;
+            margin-bottom: 20px;
+        }
+        """
     ) as app:

+        # Header
         gr.HTML("""
-        <div
+        <div class="main-header">
             🧠 GAIA Benchmark AI Agent
         </div>
         <p style="text-align: center; font-size: 1.2em; color: #666;">
@@ -571,9 +991,14 @@ def create_gaia_app():
         </p>
         """)

+        # Authentication status at the top
+        auth_status_display = gr.Markdown(value=get_auth_status())
+
         with gr.Tabs():

+            # ===============================
             # TAB 1: MODEL SETUP
+            # ===============================
             with gr.Tab("🔧 Model Setup"):
                 gr.Markdown("## Choose and Load Your Model")

@@ -582,7 +1007,8 @@ def create_gaia_app():
                         model_dropdown = gr.Dropdown(
                             choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                             value="Fast & Light",
-                            label="Select Model"
+                            label="Select Model",
+                            info="Choose based on your quality vs speed preference"
                         )

                         model_info = gr.Markdown(
@@ -594,9 +1020,12 @@ def create_gaia_app():

                     with gr.Column(scale=1):
                         gpu_info = gr.Markdown(f"""
-### System Info
+### 🖥️ System Info
 **CUDA Available**: {torch.cuda.is_available()}
 {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
+
+### 🔐 Dataset Access
+{get_auth_status()}
 """)

                 model_status = gr.Textbox(
@@ -605,19 +1034,23 @@ def create_gaia_app():
                     interactive=False
                 )

+                # Update model info when selection changes
                 model_dropdown.change(
                     fn=get_model_info,
                     inputs=[model_dropdown],
                     outputs=[model_info]
                 )

+                # Load model when button clicked
                 load_btn.click(
                     fn=load_model_interface,
                     inputs=[model_dropdown],
                     outputs=[model_status]
                 )

+            # ===============================
             # TAB 2: SINGLE QUESTION
+            # ===============================
             with gr.Tab("❓ Single Question"):
                 gr.Markdown("## Test Individual Questions")

@@ -631,7 +1064,8 @@ def create_gaia_app():

                     process_btn = gr.Button("🤔 Process Question", variant="primary")

-
+                    # Example questions
+                    gr.Markdown("### 💡 Example Questions:")
                     example_questions = [
                         "What is the capital of France?",
                         "Calculate 144 divided by 12",
@@ -670,13 +1104,16 @@ def create_gaia_app():
                     interactive=False
                 )

+                # Process single question
                 process_btn.click(
                     fn=single_question_interface,
                     inputs=[question_input],
                     outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                 )

+            # ===============================
             # TAB 3: BATCH EVALUATION
+            # ===============================
             with gr.Tab("📊 Batch Evaluation"):
                 gr.Markdown("## Evaluate Multiple Questions")

@@ -684,15 +1121,17 @@ def create_gaia_app():
                     dataset_choice = gr.Radio(
                         choices=["Sample Questions", "GAIA Test Set"],
                         value="Sample Questions",
-                        label="Dataset Choice"
+                        label="Dataset Choice",
+                        info="Sample Questions work without authentication"
                     )

                     max_questions = gr.Slider(
                         minimum=1,
-                        maximum=
-                        value=
+                        maximum=100,
+                        value=10,
                         step=1,
-                        label="Max Questions"
+                        label="Max Questions",
+                        info="Number of questions to evaluate"
                     )

                     evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
@@ -715,9 +1154,11 @@ def create_gaia_app():
                         value="Run an evaluation to see detailed results"
                     )

+                # Batch evaluation
                 def batch_eval_with_download(*args):
                     summary, detailed, jsonl_content = batch_evaluate_interface(*args)

+                    # Save JSONL for download
                     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                     filename = f"gaia_results_{timestamp}.jsonl"

@@ -735,96 +1176,250 @@ def create_gaia_app():
                     outputs=[download_output]
                 )

+            # ===============================
             # TAB 4: FULL BENCHMARK
+            # ===============================
             with gr.Tab("🏆 Full Benchmark"):
-                gr.Markdown("## Official GAIA Leaderboard
+                gr.Markdown("## Official GAIA Benchmark & Leaderboard Submission")

                 with gr.Row():
                     with gr.Column():
-
-
-
+                        # GAIA dataset preview
+                        preview_btn = gr.Button("🔍 Preview GAIA Dataset", variant="secondary")
+                        preview_output = gr.Markdown(
+                            value="Click above to preview the GAIA dataset structure and access status"
                         )

-                        dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
-                        dataset_structure_output = gr.Markdown(
-                            value="Click above to see actual GAIA dataset structure"
-                        )
-
                     with gr.Column():
-
-
-                            maximum=300,
-                            value=20,
-                            step=10,
-                            label="Number of Questions"
-                        )
+                        gr.Markdown("""
+### 🏆 GAIA Leaderboard Info

-
-
-
-
-
+**What is GAIA?**
+- 450+ real-world assistant questions
+- 3 difficulty levels (basic → advanced)
+- Requires reasoning, tool use, multi-modality
+
+**Current Leaderboard:**
+- **Best Models**: ~15-20% accuracy
+- **Human Performance**: ~92% accuracy
+- **Your Goal**: Beat the current best!

-
+**Official Leaderboard:**
+https://huggingface.co/spaces/gaia-benchmark/leaderboard
+""")
+
+                gr.Markdown("### 🚀 Run Full Benchmark")
+
+                gr.Markdown("""
+**⚠️ Important Notes:**
+- This will evaluate your model on the complete GAIA dataset
+- May take 1-3 hours depending on model and hardware
+- Generates official leaderboard submission files
+- Test with smaller batches first to verify your setup
+""")

+                full_benchmark_btn = gr.Button(
+                    "🏆 Start Full GAIA Benchmark",
+                    variant="primary",
+                    size="lg"
+                )
+
+                # Results section
                 benchmark_status = gr.Textbox(
                     label="📊 Benchmark Status",
-                    value="Ready to run benchmark",
+                    value="Ready to run full benchmark",
                     interactive=False
                 )

                 with gr.Row():
                     with gr.Column():
                         benchmark_report = gr.Markdown(
-                            label="📈 Benchmark Report",
-                            value="Run benchmark to see detailed results"
+                            label="📈 Benchmark Report",
+                            value="Run benchmark to see detailed results and leaderboard submission files"
                         )

                     with gr.Column():
+                        # Download files
                         submission_file = gr.File(
                             label="💾 Download Submission File (JSONL)",
                             visible=False
                         )

                         metadata_file = gr.File(
-                            label="📋 Download Metadata File",
+                            label="📋 Download Metadata File",
                             visible=False
                         )
+
+                        gr.Markdown("""
+### 📤 Leaderboard Submission Steps
+1. **Run** the full benchmark above
+2. **Download** the JSONL submission file
+3. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
+4. **Upload** your submission file
+5. **View** your official ranking!
+""")

                 # Event handlers
-
-                    fn=
-                    outputs=[
-                )
-
-                dataset_structure_btn.click(
-                    fn=preview_dataset_structure_interface,
-                    outputs=[dataset_structure_output]
+                preview_btn.click(
+                    fn=preview_gaia_interface,
+                    outputs=[preview_output]
                 )

-                def
-
-
-                def show_download_files(status, report, sub_file, meta_file):
+                def full_benchmark_with_files(*args):
+                    status, report, sub_file, meta_file = run_full_benchmark_interface(*args)
                     return (
-                        status,
+                        status,
                         report,
-                        sub_file,
+                        sub_file,
                         meta_file,
-                        gr.update(visible=True),
-                        gr.update(visible=True)
+                        gr.update(visible=True if sub_file else False),
+                        gr.update(visible=True if meta_file else False)
                     )

-
-                    fn=
-
-
-
-
-
-
+                full_benchmark_btn.click(
+                    fn=full_benchmark_with_files,
+                    outputs=[
+                        benchmark_status,
+                        benchmark_report,
+                        submission_file,
+                        metadata_file,
+                        submission_file,  # Update visibility
+                        metadata_file  # Update visibility
+                    ]
                 )
+
+            # ===============================
+            # TAB 5: HELP & INFO
+            # ===============================
+            with gr.Tab("ℹ️ Help & Info"):
+                gr.Markdown("""
+# 🧠 GAIA Benchmark AI Agent - Complete Guide
+
+## 🎯 Quick Start Guide
+
+### 1. **Model Setup** (Required First!)
+- Choose a model based on your needs
+- **Fast & Light**: Good for testing, works on CPU
+- **High Quality**: Best results, requires GPU
+- Click "Load Model" and wait for success message
+
+### 2. **Test Your Setup**
+- Go to "Single Question" tab
+- Try example questions like "What is the capital of France?"
+- Verify your model responds correctly
+
+### 3. **Small Batch Test**
+- Go to "Batch Evaluation" tab
+- Select "Sample Questions" (works without authentication)
+- Start with 5-10 questions
+- Check that evaluation completes and files download
+
+### 4. **GAIA Dataset Access** (Optional but Recommended)
+```bash
+# Get your token from https://huggingface.co/settings/tokens
+export HF_TOKEN=hf_your_token_here
+
+# Or login via CLI
+huggingface-cli login
+```
+
+### 5. **Full Benchmark** (Advanced)
+- Go to "Full Benchmark" tab
+- Preview GAIA dataset to confirm access
+- Run complete evaluation for leaderboard submission
+
+## 📊 What is GAIA?
+
+**GAIA (General AI Assistant)** tests AI on real-world tasks requiring:
+- **Multi-step reasoning**: Complex logical thinking
+- **Tool use**: Web browsing, calculations, file processing
+- **Multi-modality**: Text, images, PDFs, spreadsheets
+- **Real-world knowledge**: Current events, specialized domains
+
+## 🏆 Performance Expectations
+
+| Model Type | Expected Accuracy | Notes |
+|------------|------------------|-------|
+| **Top Commercial** | 15-20% | GPT-4 + plugins, Claude-3 |
+| **Good Open Source** | 8-15% | Llama-2-70B, Mixtral-8x7B |
+| **Smaller Models** | 3-8% | 7B parameter models |
+| **Basic Models** | 1-5% | 3B parameter models |
+| **Humans** | ~92% | Average human performance |
+
+## 🔧 Troubleshooting
+
+### Model Loading Issues
+- **Out of Memory**: Try "Fast & Light" model
+- **CUDA Errors**: Restart and use CPU mode
+- **Download Fails**: Check internet connection
+
+### Dataset Access Issues
+- **401 Unauthorized**: Set HF_TOKEN environment variable
+- **403 Forbidden**: Request GAIA dataset access first
+- **No Results**: Use "Sample Questions" to test setup
+
+### Evaluation Issues
+- **No Progress**: Ensure model is loaded first
+- **Errors**: Check model compatibility and memory
+- **Slow Performance**: Normal for larger models/datasets
+
+## 📁 File Formats
+
+**Submission JSONL Format:**
+```json
+{"task_id": "gaia_001", "model_answer": "Full response...", "reasoning_trace": "Step by step..."}
+{"task_id": "gaia_002", "model_answer": "Full response...", "reasoning_trace": "Step by step..."}
+```
+
+**Metadata JSON Format:**
+```json
+{
+  "model_name": "High Quality",
+  "timestamp": "20240604_143022",
+  "num_questions": 450,
+  "dataset_split": "test"
+}
+```
+
+## 🚀 Pro Tips
+
+### For Best Results:
+1. **Start Small**: Always test with sample questions first
+2. **Monitor Resources**: Check GPU memory during evaluation
+3. **Save Progress**: Download intermediate results frequently
+4. **Quality Over Speed**: Use better models for leaderboard submissions
+5. **Analyze Failures**: Review reasoning traces to understand errors
+
+### For Leaderboard Success:
+1. **Test Thoroughly**: Verify setup with small batches
+2. **Use Best Model**: Don't compromise on model quality
+3. **Check Format**: Ensure JSONL files are valid
+4. **Include Metadata**: Helps with debugging and analysis
+5. **Document Approach**: Note any special techniques used
+
+## 🔗 Important Links
+
+- **GAIA Dataset**: https://huggingface.co/datasets/gaia-benchmark/GAIA
+- **GAIA Leaderboard**: https://huggingface.co/spaces/gaia-benchmark/leaderboard
+- **GAIA Paper**: https://arxiv.org/abs/2311.12983
+- **HuggingFace Tokens**: https://huggingface.co/settings/tokens
+- **Authentication Guide**: https://huggingface.co/docs/hub/security-tokens
+
+## 🎉 Success Checklist
+
+- [ ] Model loads successfully
+- [ ] Single question works
+- [ ] Batch evaluation completes
+- [ ] Files download properly
+- [ ] GAIA dataset access (optional)
+- [ ] Full benchmark completes
+- [ ] Submission files ready
+- [ ] Uploaded to leaderboard
+
+---
+
+**Ready to benchmark?** Start with Model Setup and work through each tab systematically. Good luck! 🚀
+""")

     return app

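Aside (not part of the commit): the event handler above lists `submission_file` and `metadata_file` twice in `outputs` so the same components receive both a file path and a `gr.update(visible=...)`. A single update object can carry both at once; a self-contained sketch with toy component names:

```python
import gradio as gr

def produce_file():
    path = "demo_output.txt"
    with open(path, "w") as f:
        f.write("hello")
    # One update sets the file value and reveals the hidden component.
    return gr.update(value=path, visible=True)

with gr.Blocks() as demo:
    btn = gr.Button("Generate")
    out = gr.File(label="Download", visible=False)
    btn.click(fn=produce_file, outputs=[out])

# demo.launch()
```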
@@ -833,6 +1428,18 @@ def create_gaia_app():
 # ================================

 if __name__ == "__main__":
+    # Print startup information
+    print("🧠 GAIA Benchmark AI Agent Starting...")
+    print(f"🔐 Authentication: {'✅ Found HF_TOKEN' if HF_TOKEN else '⚠️ No HF_TOKEN (sample questions only)'}")
+    print(f"🖥️ CUDA Available: {'✅ Yes' if torch.cuda.is_available() else '❌ No (CPU only)'}")
+    if torch.cuda.is_available():
+        print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
+
+    # Test GAIA access if token available
+    if HF_TOKEN:
+        has_access, access_msg = GAIADatasetManager.test_gaia_access()
+        print(f"📊 GAIA Dataset: {'✅ Accessible' if has_access else '⚠️ ' + access_msg}")
+
     app = create_gaia_app()
     app.launch(
         server_name="0.0.0.0",
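Aside (not part of the commit): binding to `0.0.0.0` is what makes the server reachable from outside the Space container. A minimal local-run sketch; the port is my assumption (7860 is Gradio's conventional default, and Spaces inject their own):

```python
import gradio as gr

def echo(text: str) -> str:
    return text

with gr.Blocks(title="Smoke test") as app:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Echo")
    box.submit(fn=echo, inputs=[box], outputs=[out])

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
```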