#!/usr/bin/env python3
"""
GAIA Benchmark AI Agent - With HF Token Input Interface
========================================================
Enhanced version with user token input for GAIA dataset access
"""
import gradio as gr
import torch
import json
import os
import logging
import time
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import pandas as pd
from pathlib import Path

# Core ML libraries
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download, list_repo_files

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ================================
# ENHANCED AUTHENTICATION SETUP
# ================================

class HFTokenManager:
    """Manages HuggingFace token for GAIA dataset access"""

    def __init__(self):
        self.current_token = None
        self.token_status = "No token set"
        self.gaia_access_status = "Not tested"

    def set_token(self, token: str) -> Tuple[str, str]:
        """Set and validate HF token"""
        if not token or not token.strip():
            self.current_token = None
            self.token_status = "❌ No token provided"
            self.gaia_access_status = "Not tested"
            return self.token_status, self.gaia_access_status

        token = token.strip()

        # Basic token format validation
        if not token.startswith('hf_'):
            self.current_token = None
            self.token_status = "❌ Invalid token format (should start with 'hf_')"
            self.gaia_access_status = "Not tested"
            return self.token_status, self.gaia_access_status

        try:
            # Test token validity
            api = HfApi(token=token)
            user_info = api.whoami()
            self.current_token = token
            self.token_status = f"✅ Valid token for user: {user_info['name']}"

            # Test GAIA dataset access; the returned info object does not
            # always expose split metadata, so fall back to a plain confirmation
            try:
                dataset_info = api.dataset_info("gaia-benchmark/GAIA", token=token)
                splits = getattr(dataset_info, "splits", None)
                if splits:
                    self.gaia_access_status = f"✅ GAIA access confirmed (splits: {', '.join(splits.keys())})"
                else:
                    self.gaia_access_status = "✅ GAIA access confirmed"
            except Exception as e:
                if "401" in str(e) or "403" in str(e):
                    self.gaia_access_status = "❌ GAIA access denied - request access at: https://huggingface.co/datasets/gaia-benchmark/GAIA"
                else:
                    self.gaia_access_status = f"⚠️ GAIA access test failed: {str(e)}"

            return self.token_status, self.gaia_access_status
        except Exception as e:
            self.current_token = None
            if "401" in str(e):
                self.token_status = "❌ Invalid token - check your token is correct"
            else:
                self.token_status = f"❌ Token validation failed: {str(e)}"
            self.gaia_access_status = "Not tested"
            return self.token_status, self.gaia_access_status

    def get_token(self) -> Optional[str]:
        """Get current valid token"""
        return self.current_token

    def test_gaia_access(self) -> Tuple[bool, str]:
        """Test GAIA dataset access with current token"""
        if not self.current_token:
            return False, "No valid token set"
        try:
            # Try to load a small sample from the validation split; GAIA
            # requires an explicit config name (per the dataset card)
            dataset = load_dataset(
                "gaia-benchmark/GAIA",
                "2023_all",
                split="validation",
                token=self.current_token,
                trust_remote_code=True
            )
            if len(dataset) > 0:
                return True, f"✅ GAIA dataset accessible ({len(dataset)} validation questions)"
            else:
                return False, "Dataset appears empty"
        except Exception as e:
            return False, f"Access failed: {str(e)}"


# Global token manager
token_manager = HFTokenManager()
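
# Example (illustrative): validate a token, then probe GAIA access
#   status, gaia_status = token_manager.set_token("hf_...your-token...")
#   if token_manager.get_token():
#       ok, msg = token_manager.test_gaia_access()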

# Legacy HF_TOKEN setup with fallback

def setup_hf_authentication():
    """Setup HuggingFace authentication with environment fallback"""
    env_token = os.environ.get('HF_TOKEN')
    if env_token:
        token_manager.set_token(env_token)
        logger.info("✅ Found HF_TOKEN in environment")
        return env_token

    # Try HuggingFace CLI token
    try:
        from huggingface_hub import HfFolder
        cli_token = HfFolder.get_token()
        if cli_token:
            token_manager.set_token(cli_token)
            logger.info("✅ Found token from HuggingFace CLI")
            return cli_token
    except Exception:
        pass

    # Try manual token file
    token_path = os.path.expanduser("~/.cache/huggingface/token")
    if os.path.exists(token_path):
        try:
            with open(token_path, 'r') as f:
                file_token = f.read().strip()
            if file_token:
                token_manager.set_token(file_token)
                logger.info("✅ Found token in cache file")
                return file_token
        except Exception:
            pass

    logger.warning("⚠️ No HuggingFace token found - use interface to set token")
    return None

# Initialize with environment token if available
INITIAL_TOKEN = setup_hf_authentication()

# ================================
# CORE DATA STRUCTURES (unchanged)
# ================================

@dataclass
class GAIAQuestion:
    """Structure for GAIA benchmark questions"""
    task_id: str
    question: str
    level: int
    final_answer: Optional[str] = None
    file_name: Optional[str] = None
    annotator_metadata: Optional[Dict] = None

    @classmethod
    def from_dict(cls, data: dict):
        return cls(**{k: v for k, v in data.items() if k in cls.__annotations__})


@dataclass
class GAIAResponse:
    """Structure for GAIA responses"""
    task_id: str
    model_answer: str
    reasoning_trace: str
    final_answer: str
    processing_time: float = 0.0
    confidence_score: float = 0.0

# ================================
# GAIA PROMPT MANAGEMENT (unchanged)
# ================================

class GAIAPromptManager:
    """Manages GAIA-specific prompting and formatting"""

    GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:
FINAL ANSWER: [YOUR FINAL ANSWER]
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    @staticmethod
    def create_gaia_prompt(question: str) -> str:
        """Create properly formatted GAIA prompt"""
        return f"{GAIAPromptManager.GAIA_SYSTEM_PROMPT}\n\nQuestion: {question}\n\nLet me think step by step:"

    @staticmethod
    def extract_final_answer(response: str) -> Tuple[str, str]:
        """Extract final answer and reasoning from model response"""
        final_answer_pattern = r"FINAL ANSWER:\s*(.+?)(?:\n|$)"
        match = re.search(final_answer_pattern, response, re.IGNORECASE | re.DOTALL)
        if match:
            final_answer = match.group(1).strip()
            reasoning_end = match.start()
            reasoning = response[:reasoning_end].strip()
        else:
            # No template found: fall back to treating the last line as the answer
            lines = response.strip().split('\n')
            final_answer = lines[-1].strip() if lines else ""
            reasoning = '\n'.join(lines[:-1]) if len(lines) > 1 else response
        return final_answer, reasoning
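
# Example (illustrative): extract_final_answer on a templated response such as
#   "Paris is the capital of France.\nFINAL ANSWER: Paris"
# returns ("Paris", "Paris is the capital of France.")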

# ================================
# MODEL MANAGER (unchanged)
# ================================

class HFSpaceModelManager:
    """Hugging Face Spaces optimized model manager"""

    SPACE_MODELS = {
        "Fast & Light": {
            "name": "microsoft/DialoGPT-medium",
            "size": "~345MB",
            "speed": "Fast",
            "quality": "Good",
            "gpu_required": False
        },
        "Balanced": {
            "name": "stabilityai/stablelm-zephyr-3b",
            "size": "~3GB",
            "speed": "Medium",
            "quality": "Better",
            "gpu_required": True
        },
        "High Quality": {
            "name": "HuggingFaceH4/zephyr-7b-beta",
            "size": "~7GB",
            "speed": "Slower",
            "quality": "Best",
            "gpu_required": True
        },
        "Instruction Following": {
            "name": "mistralai/Mistral-7B-Instruct-v0.1",
            "size": "~7GB",
            "speed": "Medium",
            "quality": "Excellent",
            "gpu_required": True
        }
    }

    def __init__(self, model_choice: str = "Fast & Light"):
        self.model_config = self.SPACE_MODELS[model_choice]
        self.model_name = self.model_config["name"]
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, progress_callback=None) -> str:
        """Load model with progress updates"""
        try:
            if progress_callback:
                progress_callback(0.1, "Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            if progress_callback:
                progress_callback(0.3, "Configuring model...")
            quantization_config = None
            if self.device == "cuda" and "7b" in self.model_name.lower():
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4"
                )

            if progress_callback:
                progress_callback(0.6, "Loading model weights...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto" if self.device == "cuda" else None,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                trust_remote_code=True
            )

            if progress_callback:
                progress_callback(0.9, "Creating pipeline...")
            # When device_map="auto" has already dispatched the model, the
            # pipeline must not be given an explicit device; pass one only on CPU
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=384,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                device=-1 if self.device == "cpu" else None
            )

            if progress_callback:
                progress_callback(1.0, "Model loaded successfully!")
            return f"✅ Model '{self.model_name}' loaded successfully on {self.device.upper()}"
        except Exception as e:
            error_msg = f"❌ Error loading model: {str(e)}"
            logger.error(error_msg)
            return error_msg

    def generate_response(self, prompt: str, max_tokens: int = 384) -> str:
        """Generate response with error handling"""
        if self.pipeline is None:
            return "❌ Model not loaded. Please load a model first."
        try:
            # Crude character-level truncation to keep inputs within budget
            max_input_length = 1000
            if len(prompt) > max_input_length:
                prompt = prompt[:max_input_length] + "..."

            outputs = self.pipeline(
                prompt,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                return_full_text=False,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response = outputs[0]['generated_text'].strip()
            return response
        except Exception as e:
            return f"❌ Error generating response: {str(e)}"

# ================================
# ENHANCED DATASET MANAGEMENT WITH TOKEN SUPPORT
# ================================

class GAIADatasetManager:
    """Manages GAIA dataset loading with user token support"""

    @staticmethod
    def load_gaia_dataset(split: str = "validation", max_questions: Optional[int] = None, use_token: bool = True) -> Tuple[List[GAIAQuestion], str]:
        """Load GAIA dataset with token support"""
        try:
            logger.info(f"Attempting to load GAIA dataset split: {split}")
            current_token = token_manager.get_token() if use_token else None

            if use_token and not current_token:
                logger.warning("No valid token found, falling back to sample questions")
                questions = GAIADatasetManager.get_sample_questions()
                return questions[:max_questions] if max_questions else questions, "⚠️ No authentication token - using sample questions"

            # Test access first if using token
            if use_token:
                has_access, access_msg = token_manager.test_gaia_access()
                if not has_access:
                    logger.warning(f"GAIA access test failed: {access_msg}")
                    questions = GAIADatasetManager.get_sample_questions()
                    return questions[:max_questions] if max_questions else questions, f"⚠️ {access_msg} - using sample questions"

            # Load the actual dataset (config name per the GAIA dataset card)
            dataset = load_dataset(
                "gaia-benchmark/GAIA",
                "2023_all",
                split=split,
                token=current_token,
                trust_remote_code=True
            )
            logger.info(f"Successfully loaded GAIA dataset: {len(dataset)} items")

            # Slicing a datasets.Dataset returns columns, not rows, so use
            # select() to take the first max_questions examples
            if max_questions:
                dataset = dataset.select(range(min(max_questions, len(dataset))))

            questions = []
            for i, item in enumerate(dataset):
                # Handle different possible field names in GAIA dataset
                task_id = (item.get('task_id') or
                           item.get('Task ID') or
                           item.get('id') or
                           f'gaia_{split}_{i:03d}')
                question_text = (item.get('Question') or
                                 item.get('question') or
                                 item.get('input') or
                                 'No question text available')
                level = (item.get('Level') or
                         item.get('level') or
                         item.get('difficulty') or
                         1)
                final_answer = (item.get('Final answer') or
                                item.get('final_answer') or
                                item.get('answer') or
                                item.get('target') or
                                None)
                file_name = (item.get('file_name') or
                             item.get('File name') or
                             item.get('files') or
                             None)
                annotator_metadata = (item.get('Annotator Metadata') or
                                      item.get('annotator_metadata') or
                                      item.get('metadata') or
                                      None)
                question = GAIAQuestion(
                    task_id=str(task_id),
                    question=str(question_text),
                    level=int(level),
                    final_answer=str(final_answer) if final_answer else None,
                    file_name=str(file_name) if file_name else None,
                    annotator_metadata=annotator_metadata
                )
                questions.append(question)

            status = f"✅ Loaded {len(questions)} questions from GAIA {split} split"
            logger.info(status)
            return questions, status
        except Exception as e:
            error_msg = f"❌ Error loading GAIA dataset: {str(e)}"
            logger.error(error_msg)
            # Fallback to sample questions
            logger.info("Falling back to sample questions")
            questions = GAIADatasetManager.get_sample_questions()
            return questions[:max_questions] if max_questions else questions, f"{error_msg} (Using sample questions instead)"
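
    # Example (illustrative): ten validation questions, or the bundled
    # samples if no token is configured
    #   questions, status = GAIADatasetManager.load_gaia_dataset("validation", 10)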

    @staticmethod
    def get_sample_questions() -> List[GAIAQuestion]:
        """Get sample questions for testing when GAIA dataset is not accessible"""
        sample_data = [
            {
                "task_id": "sample_001",
                "question": "What is the capital of France?",
                "level": 1,
                "final_answer": "Paris"
            },
            {
                "task_id": "sample_002",
                "question": "Calculate 144 divided by 12.",
                "level": 1,
                "final_answer": "12"
            },
            {
                "task_id": "sample_003",
                "question": "What is the largest planet in our solar system?",
                "level": 1,
                "final_answer": "Jupiter"
            },
            {
                "task_id": "sample_004",
                "question": "Convert 100 degrees Celsius to Fahrenheit.",
                "level": 2,
                "final_answer": "212"
            },
            {
                "task_id": "sample_005",
                "question": "List the first three even numbers greater than zero.",
                "level": 1,
                "final_answer": "2, 4, 6"
            },
            {
                "task_id": "sample_006",
                "question": "What year did the Berlin Wall fall?",
                "level": 1,
                "final_answer": "1989"
            },
            {
                "task_id": "sample_007",
                "question": "What is the chemical formula for water?",
                "level": 1,
                "final_answer": "H2O"
            },
            {
                "task_id": "sample_008",
                "question": "How many continents are there?",
                "level": 1,
                "final_answer": "7"
            },
            {
                "task_id": "sample_009",
                "question": "What is 25% of 200?",
                "level": 1,
                "final_answer": "50"
            },
            {
                "task_id": "sample_010",
                "question": "In which year did World War II end?",
                "level": 1,
                "final_answer": "1945"
            },
            {
                "task_id": "sample_011",
                "question": "What is the square root of 144?",
                "level": 2,
                "final_answer": "12"
            },
            {
                "task_id": "sample_012",
                "question": "Name the three primary colors.",
                "level": 1,
                "final_answer": "red, blue, yellow"
            }
        ]
        return [GAIAQuestion.from_dict(data) for data in sample_data]

# ================================
# MAIN GAIA AGENT (updated with token support)
# ================================

class GAIASpaceAgent:
    """Main GAIA agent with token support"""

    def __init__(self):
        self.model_manager = None
        self.prompt_manager = GAIAPromptManager()
        self.current_model = None
        self.evaluation_results: List[GAIAResponse] = []

    def initialize_model(self, model_choice: str, progress=gr.Progress()) -> str:
        """Initialize model with progress tracking"""
        try:
            progress(0, desc="Initializing model manager...")
            self.model_manager = HFSpaceModelManager(model_choice)
            self.current_model = model_choice

            def progress_callback(value, desc):
                progress(value, desc=desc)

            result = self.model_manager.load_model(progress_callback)
            self.evaluation_results = []
            return result
        except Exception as e:
            return f"❌ Failed to initialize model: {str(e)}"

    def process_single_question(self, question_text: str, progress=gr.Progress()) -> Tuple[str, str, str, float]:
        """Process a single question with detailed output"""
        if self.model_manager is None or self.model_manager.pipeline is None:
            return "❌ No model loaded", "", "", 0.0
        start_time = time.time()
        try:
            progress(0.2, desc="Creating GAIA prompt...")
            prompt = self.prompt_manager.create_gaia_prompt(question_text)

            progress(0.4, desc="Generating response...")
            raw_response = self.model_manager.generate_response(prompt)

            progress(0.8, desc="Extracting final answer...")
            final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)

            processing_time = time.time() - start_time
            progress(1.0, desc="Complete!")
            return final_answer, raw_response, reasoning, processing_time
        except Exception as e:
            processing_time = time.time() - start_time
            error_msg = f"❌ Error processing question: {str(e)}"
            return error_msg, "", "", processing_time

    def batch_evaluate(self, questions: List[GAIAQuestion], progress=gr.Progress()) -> Tuple[str, str, str]:
        """Evaluate multiple questions with progress tracking"""
        if self.model_manager is None:
            return "❌ No model loaded", "", ""

        results = []
        total_questions = len(questions)
        progress(0, desc=f"Starting evaluation of {total_questions} questions...")

        for i, question in enumerate(questions):
            try:
                progress((i + 1) / total_questions,
                         desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
                start_time = time.time()
                prompt = self.prompt_manager.create_gaia_prompt(question.question)
                raw_response = self.model_manager.generate_response(prompt)
                final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
                processing_time = time.time() - start_time

                response = GAIAResponse(
                    task_id=question.task_id,
                    model_answer=raw_response,
                    reasoning_trace=reasoning,
                    final_answer=final_answer,
                    processing_time=processing_time
                )
                results.append(response)
                self.evaluation_results.append(response)
            except Exception as e:
                logger.error(f"Error processing {question.task_id}: {e}")
                error_response = GAIAResponse(
                    task_id=question.task_id,
                    model_answer=f"Error: {str(e)}",
                    reasoning_trace="Processing failed",
                    final_answer="ERROR",
                    processing_time=0.0
                )
                results.append(error_response)
                self.evaluation_results.append(error_response)

        summary = self._generate_summary(results)
        detailed_results = self._generate_detailed_results(results, questions)
        jsonl_content = self._generate_jsonl(results)
        return summary, detailed_results, jsonl_content

    def _generate_summary(self, results: List[GAIAResponse]) -> str:
        """Generate evaluation summary"""
        total = len(results)
        errors = sum(1 for r in results if r.final_answer == "ERROR")
        successful = total - errors
        total_time = sum(r.processing_time for r in results)
        # Guard against division by zero for empty or instant runs
        avg_time = total_time / total if total > 0 else 0.0
        success_rate = (successful / total * 100) if total > 0 else 0.0
        questions_per_minute = (total / (total_time / 60)) if total_time > 0 else 0.0
        auth_status = "✅ GAIA Access" if token_manager.get_token() else "⚠️ Sample Data Only"
        summary = f"""
# 📊 GAIA Evaluation Summary

## Overall Statistics
- **Total Questions**: {total}
- **Successful**: {successful}
- **Errors**: {errors}
- **Success Rate**: {success_rate:.1f}%

## Performance Metrics
- **Average Processing Time**: {avg_time:.2f}s
- **Total Processing Time**: {total_time:.2f}s
- **Questions per Minute**: {questions_per_minute:.1f}

## Model Information
- **Model**: {self.current_model}
- **Device**: {self.model_manager.device.upper() if self.model_manager else 'Unknown'}
- **Authentication**: {auth_status}
"""
        return summary

    def _generate_detailed_results(self, results: List[GAIAResponse], questions: List[GAIAQuestion]) -> str:
        """Generate detailed results breakdown"""
        detailed = "# 📋 Detailed Results\n\n"
        for i, (result, question) in enumerate(zip(results, questions), 1):
            status = "✅" if result.final_answer != "ERROR" else "❌"
            detailed += f"""
## Question {i}: {question.task_id} {status}
**Question**: {question.question}
**Model Answer**: {result.final_answer}
**Expected Answer**: {question.final_answer if question.final_answer else 'N/A'}
**Processing Time**: {result.processing_time:.2f}s
**Level**: {question.level}

---
"""
        return detailed

    def _generate_jsonl(self, results: List[GAIAResponse]) -> str:
        """Generate JSONL format for download"""
        jsonl_lines = []
        for result in results:
            line = {
                "task_id": result.task_id,
                "model_answer": result.model_answer,
                "reasoning_trace": result.reasoning_trace
            }
            jsonl_lines.append(json.dumps(line))
        return '\n'.join(jsonl_lines)
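
    # Example (illustrative): each JSONL line matches the GAIA submission
    # shape, e.g.
    #   {"task_id": "sample_001", "model_answer": "...", "reasoning_trace": "..."}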

# ================================
# GLOBAL AGENT INSTANCE
# ================================
gaia_agent = GAIASpaceAgent()

# ================================
# ENHANCED GRADIO INTERFACE FUNCTIONS
# ================================

def set_hf_token_interface(token: str):
    """Interface function for setting HF token"""
    token_status, gaia_status = token_manager.set_token(token)
    return token_status, gaia_status, update_auth_status()


def update_auth_status():
    """Update authentication status display"""
    if token_manager.get_token():
        return f"""### 🔐 Authentication Status
{token_manager.token_status}

### 📊 GAIA Dataset Access
{token_manager.gaia_access_status}

### 💡 Usage
- ✅ Can access GAIA validation/test sets
- ✅ Can download official benchmark data
- ✅ Results suitable for leaderboard submission"""
    else:
        return """### 🔐 Authentication Status
❌ No valid HF token set

### 📊 GAIA Dataset Access
❌ Cannot access GAIA dataset - using sample questions

### 💡 To Access GAIA Dataset:
1. **Get Access**: Visit https://huggingface.co/datasets/gaia-benchmark/GAIA
2. **Get Token**: Visit https://huggingface.co/settings/tokens
3. **Set Token**: Enter your token in the field above"""

def load_model_interface(model_choice: str, progress=gr.Progress()):
    """Interface function for model loading"""
    return gaia_agent.initialize_model(model_choice, progress)


def single_question_interface(question: str, progress=gr.Progress()):
    """Interface function for single question processing"""
    if not question.strip():
        return "Please enter a question", "", "", "0.00s"
    final_answer, full_response, reasoning, proc_time = gaia_agent.process_single_question(question, progress)
    return (
        final_answer,
        full_response,
        reasoning,
        f"{proc_time:.2f}s"
    )


def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=gr.Progress()):
    """Interface function for batch evaluation"""
    if gaia_agent.model_manager is None:
        return "❌ Please load a model first", "", ""
    progress(0.1, desc="Loading dataset...")
    if dataset_choice == "Sample Questions":
        questions = GAIADatasetManager.get_sample_questions()
        status_msg = f"✅ Loaded {len(questions)} sample questions"
    else:
        use_token = dataset_choice in ["GAIA Validation Set", "GAIA Test Set"]
        split = "test" if dataset_choice == "GAIA Test Set" else "validation"
        questions, status_msg = GAIADatasetManager.load_gaia_dataset(split, max_questions, use_token)
    if max_questions and len(questions) > max_questions:
        questions = questions[:max_questions]
    progress(0.2, desc=f"{status_msg}. Starting evaluation...")
    summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
    return summary, detailed, jsonl

def get_model_info(model_choice: str):
    """Get information about selected model"""
    if model_choice in HFSpaceModelManager.SPACE_MODELS:
        config = HFSpaceModelManager.SPACE_MODELS[model_choice]
        return f"""
**Model**: {config['name']}
**Size**: {config['size']}
**Speed**: {config['speed']}
**Quality**: {config['quality']}
**GPU Required**: {'Yes' if config['gpu_required'] else 'No'}
"""
    return "Model information not available"

def preview_gaia_interface():
    """Interface for previewing GAIA dataset with token support"""
    if not token_manager.get_token():
        return """
## ⚠️ GAIA Dataset Preview - Authentication Required

To access the GAIA dataset, you need:
1. **Request Access**: https://huggingface.co/datasets/gaia-benchmark/GAIA
2. **Get Token**: https://huggingface.co/settings/tokens
3. **Set Token**: Enter your token in the Authentication tab above

### 📋 Sample Questions Available:
We provide 12 sample questions for testing your setup without authentication.
Use "Sample Questions" in the evaluation tabs to get started!
"""
    try:
        # Test access and get basic info
        has_access, access_msg = token_manager.test_gaia_access()
        if not has_access:
            return f"""
## ❌ GAIA Dataset Access Failed

**Error**: {access_msg}

### 🔧 Troubleshooting:
1. Check your HF_TOKEN is valid
2. Ensure you have access to the GAIA dataset
3. Try refreshing your token

### 🔄 Alternative:
Use "Sample Questions" for testing without authentication.
"""
        # Try to get some preview data (config name per the GAIA dataset card)
        dataset = load_dataset(
            "gaia-benchmark/GAIA",
            "2023_all",
            split="validation",
            token=token_manager.get_token(),
            trust_remote_code=True
        )

        # Analyze the dataset
        total_questions = len(dataset)

        # Get level distribution
        levels = {}
        sample_questions = []
        for item in dataset:
            level = item.get('Level', 1)
            levels[level] = levels.get(level, 0) + 1
            # Collect a few sample questions
            if len(sample_questions) < 3:
                question_text = item.get('Question', 'No question')
                if len(question_text) > 100:
                    question_text = question_text[:100] + "..."
                sample_questions.append(f"- **Level {level}**: {question_text}")

        level_dist = "\n".join([f"- **Level {k}**: {v} questions" for k, v in sorted(levels.items())])
        sample_text = "\n".join(sample_questions)

        return f"""
## ✅ GAIA Dataset Preview - Access Confirmed

### 📊 Dataset Statistics:
- **Total Questions**: {total_questions}
- **Available Split**: validation (development set)

### 📈 Level Distribution:
{level_dist}

### 📋 Sample Questions:
{sample_text}

### 🎯 Ready for Evaluation!
You can now use "GAIA Validation Set" or "GAIA Test Set" in the evaluation tabs to test your model on real GAIA questions.
"""
    except Exception as e:
        return f"""
## ❌ Error Previewing GAIA Dataset

**Error**: {str(e)}

### 🔄 Recommendations:
1. Use "Sample Questions" for immediate testing
2. Check your authentication setup
3. Try again in a few minutes

### 📞 Need Help?
- GAIA Dataset: https://huggingface.co/datasets/gaia-benchmark/GAIA
- HF Authentication: https://huggingface.co/docs/hub/security-tokens
"""

# ================================
# ENHANCED GRADIO APP CREATION WITH TOKEN INPUT
# ================================

def create_gaia_app():
    """Create the main Gradio application with token input"""
    with gr.Blocks(
        title="GAIA Benchmark AI Agent",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Arial', sans-serif;
        }
        .main-header {
            text-align: center;
            background: linear-gradient(45deg, #2196F3, #21CBF3);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 20px;
        }
        .auth-section {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 10px;
            border-left: 4px solid #2196F3;
            margin: 10px 0;
        }
        """
    ) as app:
        # Header
        gr.HTML("""
        <div class="main-header">
            🧠 GAIA Benchmark AI Agent
        </div>
        <p style="text-align: center; font-size: 1.2em; color: #666;">
            Evaluate AI models on the GAIA benchmark with step-by-step reasoning
        </p>
        """)

        with gr.Tabs():
            # ===============================
            # TAB 1: AUTHENTICATION
            # ===============================
            with gr.Tab("🔐 Authentication"):
                gr.HTML('<div class="auth-section">')
                gr.Markdown("## HuggingFace Token Setup")
                gr.Markdown("""
**To access the GAIA dataset, you need:**
1. **Request access** to the GAIA dataset
2. **Get your HuggingFace token**
3. **Enter token below**
""")
                gr.HTML('</div>')

                with gr.Row():
                    with gr.Column(scale=2):
                        gr.Markdown("### 🔑 Enter Your HuggingFace Token")
                        hf_token_input = gr.Textbox(
                            label="HuggingFace Token",
                            placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
                            type="password",
                            info="Get your token from https://huggingface.co/settings/tokens",
                            value=""
                        )
                        set_token_btn = gr.Button("🔓 Set Token & Test Access", variant="primary")
                        with gr.Row():
                            token_status = gr.Textbox(
                                label="Token Status",
                                value="No token set",
                                interactive=False,
                                lines=1
                            )
                            gaia_access_status = gr.Textbox(
                                label="GAIA Access Status",
                                value="Not tested",
                                interactive=False,
                                lines=1
                            )
                    with gr.Column(scale=1):
                        auth_status_display = gr.Markdown(
                            value=update_auth_status(),
                            label="Authentication Status"
                        )

                gr.Markdown("""
### 📋 Step-by-Step Setup Guide

#### 1. Request GAIA Dataset Access
- Visit: https://huggingface.co/datasets/gaia-benchmark/GAIA
- Click the **"Request Access"** button
- Fill out the form explaining your use case
- Wait for approval (usually within 24 hours)

#### 2. Get Your HuggingFace Token
- Go to: https://huggingface.co/settings/tokens
- Click **"New token"**
- Choose **"Read"** permissions
- Copy the token (starts with `hf_`)

#### 3. Enter Token Above
- Paste your token in the field above
- Click **"Set Token & Test Access"**
- Verify both token validity and GAIA access

### ⚠️ Token Security
- Your token is only stored in memory during this session
- Never share your token publicly
- You can revoke tokens at any time from HuggingFace settings

### 🔄 Without Authentication
- You can still use **12 sample questions** for testing
- All features work except real GAIA dataset access
- Perfect for getting familiar with the interface
""")

                # Set token event
                set_token_btn.click(
                    fn=set_hf_token_interface,
                    inputs=[hf_token_input],
                    outputs=[token_status, gaia_access_status, auth_status_display]
                )

            # ===============================
            # TAB 2: MODEL SETUP
            # ===============================
            with gr.Tab("🔧 Model Setup"):
                gr.Markdown("## Choose and Load Your Model")

                with gr.Row():
                    with gr.Column(scale=2):
                        model_dropdown = gr.Dropdown(
                            choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                            value="Fast & Light",
                            label="Select Model",
                            info="Choose based on your quality vs speed preference"
                        )
                        model_info = gr.Markdown(
                            value=get_model_info("Fast & Light"),
                            label="Model Information"
                        )
                        load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gpu_info = gr.Markdown(f"""
### 🖥️ System Info
**CUDA Available**: {torch.cuda.is_available()}
{f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}

### 🔐 Authentication Status
{"✅ Token Set" if token_manager.get_token() else "⚠️ No Token - Go to Authentication tab"}
""")

                model_status = gr.Textbox(
                    label="Model Status",
                    value="No model loaded",
                    interactive=False
                )

                # Update model info when selection changes
                model_dropdown.change(
                    fn=get_model_info,
                    inputs=[model_dropdown],
                    outputs=[model_info]
                )

                # Load model when button clicked
                load_btn.click(
                    fn=load_model_interface,
                    inputs=[model_dropdown],
                    outputs=[model_status]
                )

            # ===============================
            # TAB 3: SINGLE QUESTION
            # ===============================
            with gr.Tab("❓ Single Question"):
                gr.Markdown("## Test Individual Questions")

                with gr.Row():
                    with gr.Column():
                        question_input = gr.Textbox(
                            label="Enter your question",
                            placeholder="e.g., What is the capital of France?",
                            lines=3
                        )
                        process_btn = gr.Button("🤔 Process Question", variant="primary")

                        # Example questions
                        gr.Markdown("### 💡 Example Questions:")
                        example_questions = [
                            "What is the capital of France?",
                            "Calculate 144 divided by 12",
                            "What is the largest planet in our solar system?",
                            "Convert 100 degrees Celsius to Fahrenheit"
                        ]
                        for example in example_questions:
                            # Bind each example via a default argument so every
                            # button fills the textbox with its own question
                            gr.Button(f"📝 {example}", size="sm").click(
                                lambda x=example: x,
                                outputs=[question_input]
                            )
                    with gr.Column():
                        final_answer_output = gr.Textbox(
                            label="🎯 Final Answer",
                            interactive=False
                        )
                        processing_time = gr.Textbox(
                            label="⏱️ Processing Time",
                            interactive=False
                        )
                        with gr.Accordion("🧠 Full Response", open=False):
                            full_response = gr.Textbox(
                                label="Complete Model Response",
                                lines=8,
                                interactive=False
                            )
                        with gr.Accordion("🔍 Reasoning Trace", open=False):
                            reasoning_trace = gr.Textbox(
                                label="Step-by-step Reasoning",
                                lines=6,
                                interactive=False
                            )

                # Process single question
                process_btn.click(
                    fn=single_question_interface,
                    inputs=[question_input],
                    outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                )

            # ===============================
            # TAB 4: BATCH EVALUATION
            # ===============================
            with gr.Tab("📊 Batch Evaluation"):
                gr.Markdown("## Evaluate Multiple Questions")

                with gr.Row():
                    dataset_choice = gr.Radio(
                        choices=["Sample Questions", "GAIA Validation Set", "GAIA Test Set"],
                        value="Sample Questions",
                        label="Dataset Choice",
                        info="Sample Questions work without authentication"
                    )
                    max_questions = gr.Slider(
                        minimum=1,
                        maximum=300,
                        value=10,
                        step=1,
                        label="Max Questions",
                        info="Number of questions to evaluate"
                    )

                evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")

                # Dataset info display
                with gr.Row():
                    gr.Markdown("""
### 📊 Dataset Information

**Sample Questions (No Auth Required)**:
- 12 curated questions for testing
- Works without a HuggingFace token
- Perfect for setup verification

**GAIA Validation Set (Auth Required)**:
- ~165 official validation questions
- Good for model development
- May include reference answers

**GAIA Test Set (Auth Required)**:
- ~450 official test questions
- Used for leaderboard submissions
- Answers typically hidden (blind evaluation)
""")

                with gr.Row():
                    with gr.Column():
                        summary_output = gr.Markdown(
                            label="📊 Evaluation Summary",
                            value="No evaluation completed yet"
                        )
                    with gr.Column():
                        download_output = gr.File(
                            label="💾 Download Results (JSONL)",
                            visible=False
                        )

                with gr.Accordion("📋 Detailed Results", open=False):
                    detailed_output = gr.Markdown(
                        value="Run an evaluation to see detailed results"
                    )

                # Batch evaluation with download
                def batch_eval_with_download(*args):
                    summary, detailed, jsonl_content = batch_evaluate_interface(*args)
                    # Save JSONL for download
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    dataset_name = args[0].lower().replace(" ", "_")
                    filename = f"gaia_{dataset_name}_{timestamp}.jsonl"
                    with open(filename, 'w') as f:
                        f.write(jsonl_content)
                    return summary, detailed, filename

                evaluate_btn.click(
                    fn=batch_eval_with_download,
                    inputs=[dataset_choice, max_questions],
                    outputs=[summary_output, detailed_output, download_output]
                ).then(
                    lambda: gr.update(visible=True),
                    outputs=[download_output]
                )

            # ===============================
            # TAB 5: DATASET PREVIEW
            # ===============================
            with gr.Tab("📋 Dataset Preview"):
                gr.Markdown("## GAIA Dataset Information")

                preview_btn = gr.Button("🔍 Preview GAIA Dataset", variant="primary")
                preview_output = gr.Markdown(
                    value="Click above to preview the GAIA dataset structure and your access status"
                )

                gr.Markdown("""
## 🎯 About GAIA Benchmark

**GAIA (General AI Assistant)** is a comprehensive benchmark for evaluating AI assistants on real-world tasks that require:

### 🧠 Key Capabilities Tested:
- **Multi-step reasoning**: Complex logical thinking and problem decomposition
- **Tool use**: Web browsing, calculations, file processing
- **Multi-modality**: Text, images, PDFs, spreadsheets, audio files
- **Real-world knowledge**: Current events, specialized domains
- **Following instructions**: Precise output formatting

### 📊 Dataset Structure:
- **Total Questions**: ~450 in test set, ~165 in validation set
- **Difficulty Levels**:
  - Level 1: Basic questions (≤30 seconds for humans)
  - Level 2: Intermediate (≤5 minutes for humans)
  - Level 3: Advanced (≤30 minutes for humans)
- **Question Types**: Factual, mathematical, reasoning, research tasks

### 🏆 Current Leaderboard (Top Performers):
1. **GPT-4 + plugins**: ~20% accuracy
2. **Claude-3 Opus**: ~15% accuracy
3. **Gemini Pro**: ~12% accuracy
4. **Human Performance**: ~92% accuracy

### 📁 File Types in GAIA:
- Text documents, PDFs
- Images (charts, diagrams, photos)
- Spreadsheets (CSV, Excel)
- Audio files
- Web pages and URLs

### 🎯 Evaluation Criteria:
- **Exact Match**: Final answer must match exactly
- **Case Sensitive**: Proper formatting required
- **No Partial Credit**: Binary scoring (correct/incorrect)
- **Format Specific**: Numbers vs strings vs lists handled differently

### 🔬 Research Impact:
- Used in 50+ research papers
- Standard benchmark for assistant evaluation
- Drives development of reasoning capabilities
- Identifies gaps in current AI systems
""")

                preview_btn.click(
                    fn=preview_gaia_interface,
                    outputs=[preview_output]
                )

            # ===============================
            # TAB 6: HELP & INFO
            # ===============================
            with gr.Tab("ℹ️ Help & Info"):
                gr.Markdown("""
# 🧠 GAIA Benchmark AI Agent - Complete Guide

## 🎯 Quick Start Guide

### 1. **Authentication** (For GAIA Dataset Access)
- Go to the "Authentication" tab
- Get access to the GAIA dataset: https://huggingface.co/datasets/gaia-benchmark/GAIA
- Get an HF token: https://huggingface.co/settings/tokens
- Enter the token and test access

### 2. **Model Setup** (Required!)
- Go to the "Model Setup" tab
- Choose a model based on your needs:
  - **Fast & Light**: Good for testing, works on CPU
  - **High Quality**: Best results, requires GPU
- Click "Load Model" and wait for the success message

### 3. **Test Your Setup**
- Go to the "Single Question" tab
- Try example questions like "What is the capital of France?"
- Verify your model responds correctly

### 4. **Batch Evaluation**
- Go to the "Batch Evaluation" tab
- Start with "Sample Questions" (no auth needed)
- Try 5-10 questions first
- Download results for analysis

### 5. **GAIA Dataset**
- Use "Dataset Preview" to check access
- Try "GAIA Validation Set" for development
- Use "GAIA Test Set" for leaderboard submission

## 📊 Dataset Options Explained

### Sample Questions (Always Available)
- **12 curated questions** for testing
- **No authentication required**
- Perfect for verifying your setup
- Good for debugging and development

### GAIA Validation Set (Requires Auth)
- **~165 official questions** from GAIA
- Good for **model development** and tuning
- May include reference answers for comparison
- Faster to evaluate than the full test set

### GAIA Test Set (Requires Auth)
- **~450 official questions** from GAIA
- Used for **official leaderboard** submissions
- Answers typically hidden (blind evaluation)
- Takes longer but gives an official ranking

## 🏆 Performance Expectations

| Model Type | Expected Accuracy | Use Case |
|------------|-------------------|----------|
| **Top Commercial** | 15-20% | GPT-4 + plugins, research |
| **Strong Models** | 10-15% | Claude-3, Gemini Pro |
| **Good Open Source** | 5-10% | Llama-2-70B, Mixtral |
| **Smaller Models** | 1-5% | 7B parameter models |
| **Humans** | ~92% | Reference performance |

## 🔧 Troubleshooting

### Authentication Issues
- **"Invalid token"**: Check the token format (starts with `hf_`)
- **"Access denied"**: Request GAIA dataset access first
- **"Token not found"**: Get a token from HF settings

### Model Issues
- **Out of Memory**: Try the "Fast & Light" model
- **CUDA Errors**: Restart and use CPU mode
- **Slow loading**: Normal for large models, be patient

### Evaluation Issues
- **No responses**: Ensure a model is loaded first
- **All errors**: Check model compatibility
- **Slow evaluation**: Normal for complex questions

## 📁 Output Files

### JSONL Format (Leaderboard Ready)
```json
{"task_id": "gaia_001", "model_answer": "Complete response...", "reasoning_trace": "Step by step..."}
{"task_id": "gaia_002", "model_answer": "Complete response...", "reasoning_trace": "Step by step..."}
```

### Key Fields:
- **task_id**: Unique question identifier
- **model_answer**: Full model response
- **reasoning_trace**: Step-by-step thinking process
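
### Quick Format Check
A minimal pre-submission sketch (illustrative; assumes your download was saved as `results.jsonl`):
```python
import json

# Every line must parse as JSON and carry the three required fields
with open("results.jsonl") as f:
    for line in f:
        record = json.loads(line)
        assert {"task_id", "model_answer", "reasoning_trace"} <= record.keys()
```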

## 🚀 Best Practices

### For Accuracy:
1. **Use the best model**: Don't compromise on model quality
2. **Test prompts**: Verify the prompt format works
3. **Check reasoning**: Review step-by-step traces
4. **Analyze failures**: Learn from incorrect answers

### For Efficiency:
1. **Start small**: Test with 5-10 questions first
2. **Monitor resources**: Watch GPU/CPU usage
3. **Save progress**: Download results frequently
4. **Use an appropriate model**: Match the model to available hardware

### For Leaderboard:
1. **Use the test set**: Official ranking requires the test set
2. **Validate format**: Check the JSONL is properly formatted
3. **Document approach**: Note any special techniques
4. **Submit promptly**: Upload to the official leaderboard

## 🔗 Important Links
- **GAIA Dataset**: https://huggingface.co/datasets/gaia-benchmark/GAIA
- **GAIA Leaderboard**: https://huggingface.co/spaces/gaia-benchmark/leaderboard
- **GAIA Paper**: https://arxiv.org/abs/2311.12983
- **HuggingFace Tokens**: https://huggingface.co/settings/tokens
- **Authentication Guide**: https://huggingface.co/docs/hub/security-tokens
""")

    return app


# ================================
# MAIN APPLICATION
# ================================
if __name__ == "__main__":
    # Print startup information
    print("🧠 GAIA Benchmark AI Agent Starting...")
    print(f"🔐 Environment Token: {'✅ Found' if INITIAL_TOKEN else '⚠️ Not found'}")
    print(f"🖥️ CUDA Available: {'✅ Yes' if torch.cuda.is_available() else '❌ No (CPU only)'}")
    if torch.cuda.is_available():
        print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    print("""
💡 Token Setup Options:
1. Environment: export HF_TOKEN=hf_your_token
2. Interface: Enter token in Authentication tab
3. CLI: huggingface-cli login
""")
    app = create_gaia_app()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )