#!/usr/bin/env python3
"""
Setup script for Hugging Face Space deployment
Ensures GPQA benchmark can run successfully on HF
"""
import os
import re
import subprocess
import sys
from pathlib import Path
def _requirement_name(line):
    """Return the normalized package name that starts a requirement line.

    Trailing ``#`` comments are ignored, and the name is lower-cased with
    runs of ``-``, ``_`` and ``.`` collapsed to a single ``-`` so that
    ``huggingface_hub`` and ``HuggingFace-Hub`` compare equal.  Returns an
    empty string for blank lines, comment-only lines and lines that do not
    start with a package name (for example pip options such as ``-r``).
    """
    spec = line.split("#", 1)[0].strip()
    match = re.match(r"[A-Za-z0-9][A-Za-z0-9._-]*", spec)
    if match is None:
        return ""
    return re.sub(r"[-_.]+", "-", match.group(0)).lower()


def _merge_requirements(existing, deps):
    """Append every entry of *deps* whose package is not already listed.

    Packages are compared by exact (normalized) name instead of by substring,
    so ``gradio_client`` or a comment that merely mentions ``gradio`` no
    longer prevents ``gradio`` itself from being added.  The existing text is
    kept as-is; the result is stripped and terminated by a single newline.
    """
    present = {_requirement_name(line) for line in existing.splitlines()}
    present.discard("")
    merged = existing
    for dep in deps:
        name = _requirement_name(dep)
        if name not in present:
            merged += f"\n{dep}"
            present.add(name)
    return merged.strip() + "\n"


def _write_text_file(path, content, *, executable=False):
    """Write *content* to *path* as UTF-8, optionally marking it executable."""
    # The templates contain non-ASCII characters, so the encoding must not
    # depend on the platform's locale default.
    Path(path).write_text(content, encoding="utf-8")
    if executable:
        os.chmod(path, 0o755)


def create_deployment_files():
    """Create necessary files for HF deployment.

    Working in the current directory, this:

    1. adds the Hugging Face dependencies to ``requirements.txt`` (creating
       the file if needed) unless the package is already listed,
    2. writes ``.env.example``,
    3. writes the executable entry point ``run_hf_space.py``,
    4. writes ``README_HF.md`` with the Space metadata header,
    5. writes the executable checker ``check_deployment.py``.

    Existing files with those names are overwritten (``requirements.txt`` is
    extended, not replaced).  Progress is printed to stdout; nothing is
    returned.
    """
    # NOTE(review): the non-ASCII status markers in the messages and templates
    # below look like mis-decoded emoji - confirm against the upstream file
    # before shipping; they are kept unchanged here.
    print("π Setting up Hugging Face Space deployment...")

    # 1. Update requirements.txt with HF dependencies
    requirements_path = Path("requirements.txt")
    existing_reqs = (
        requirements_path.read_text(encoding="utf-8")
        if requirements_path.exists()
        else ""
    )
    hf_deps = [
        "huggingface_hub>=0.20.0",
        "gradio>=4.31.0",
        "python-dotenv>=0.19.0",
    ]
    _write_text_file(requirements_path, _merge_requirements(existing_reqs, hf_deps))
    print("β Updated requirements.txt")

    # 2. Create .env.example
    env_example = """# Hugging Face Space Configuration
# Copy this to .env or set in HF Secrets

# Required: Your Grok API key from x.ai
GROK_API_KEY=your_grok_api_key_here

# Required: Your Hugging Face token for GPQA dataset access
# Get it from: https://huggingface.co/settings/tokens
HF_TOKEN=your_hugging_face_token_here

# Optional: OpenAI and Anthropic keys for comparison
# OPENAI_API_KEY=your_openai_key_here
# ANTHROPIC_API_KEY=your_anthropic_key_here
"""
    _write_text_file(".env.example", env_example)
    print("β Created .env.example")

    # 3. Create HF-specific run script
    run_script = """#!/usr/bin/env python3
\"\"\"
Hugging Face Space entry point for GPQA evaluation
\"\"\"
import os
import sys
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set HF token if available
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
    print("β HF Token configured")

# Import and run the app
from app import create_ui, start_evaluation_safe, check_environment

if __name__ == "__main__":
    # Check environment
    issues = check_environment()
    if issues:
        print("\\nβ οΈ Configuration issues:")
        for issue in issues:
            print(f" - {issue}")
        print("\\nThe app will run in demo mode.")
        print("To enable GPQA evaluation, please set the required secrets in HF Space settings.")
    else:
        print("β All environment variables configured")
        # Start evaluation in background
        start_evaluation_safe()

    # Create and launch UI
    ui = create_ui()
    ui.launch()
"""
    _write_text_file("run_hf_space.py", run_script, executable=True)
    print("β Created run_hf_space.py")

    # 4. Create README for HF Space
    readme_content = """---
title: Grok-4 GPQA Evaluation
emoji: π§
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: "4.31.0"
app_file: run_hf_space.py
pinned: false
---

# Grok-4 GPQA Evaluation Dashboard

Real-time evaluation of Grok-4 model on the GPQA (Graduate-Level Google-Proof Q&A) benchmark.

## π§ Configuration

This Space requires the following secrets to be set in your HF Space settings:

1. **GROK_API_KEY** (Required)
   - Get from: https://x.ai
   - Your Grok API key for running evaluations

2. **HF_TOKEN** (Required)
   - Get from: https://huggingface.co/settings/tokens
   - Required for accessing the GPQA dataset
   - Make sure you have requested access to: https://huggingface.co/datasets/Idavidrein/gpqa

## π Features

- Real-time progress tracking
- Accuracy metrics and performance stats
- Detailed results export
- Support for full GPQA dataset (448 questions)

## π Quick Start

1. Fork this Space
2. Set the required secrets in your Space settings
3. The evaluation will start automatically
4. Monitor progress in the dashboard

## β οΈ Known Issues

- GPQA dataset requires access approval (usually 1-2 days)
- Grok-4-0709 uses extensive reasoning tokens (~2500-3000 per question)
- Full evaluation takes ~3-4 hours due to model response times

## π Expected Performance

Based on our testing:
- Accuracy: ~80-90% (excluding timeouts)
- Avg Response Time: ~50s per question
- Total Runtime: ~3-4 hours for full dataset
"""
    _write_text_file("README_HF.md", readme_content)
    print("β Created README_HF.md")

    # 5. Create pre-flight check script
    check_script = """#!/usr/bin/env python3
\"\"\"
Pre-deployment checklist for HF Space
\"\"\"
import os
import sys
from pathlib import Path


def check_deployment_ready():
    \"\"\"Check if everything is ready for HF deployment\"\"\"
    print("π Pre-deployment checklist:\\n")
    checks = []

    # Check files exist
    required_files = [
        "app.py",
        "run_evaluation.py",
        "requirements.txt",
        ".env.example",
        "run_hf_space.py",
        "official_config.yaml"
    ]
    for file in required_files:
        if Path(file).exists():
            checks.append((f"β {file} exists", True))
        else:
            checks.append((f"β {file} missing", False))

    # Check API directories
    if Path("apis").is_dir() and list(Path("apis").glob("*.py")):
        checks.append(("β APIs directory configured", True))
    else:
        checks.append(("β APIs directory missing or empty", False))

    # Check benchmarks directory
    if Path("benchmarks").is_dir() and Path("benchmarks/gpqa_benchmark.py").exists():
        checks.append(("β GPQA benchmark implementation found", True))
    else:
        checks.append(("β GPQA benchmark missing", False))

    # Check for sensitive data
    if Path(".env").exists():
        checks.append(("β οΈ .env file exists - make sure it's in .gitignore!", None))

    # Print results
    for check, status in checks:
        print(check)

    all_good = all(status is not False for _, status in checks)
    if all_good:
        print("\\nβ Ready for deployment!")
        print("\\nNext steps:")
        print("1. Set GROK_API_KEY and HF_TOKEN in HF Space secrets")
        print("2. Make sure you have GPQA dataset access")
        print("3. Push to Hugging Face")
    else:
        print("\\nβ Issues found - please fix before deploying")
    return all_good


if __name__ == "__main__":
    check_deployment_ready()
"""
    _write_text_file("check_deployment.py", check_script, executable=True)
    print("β Created check_deployment.py")

    print("\nπ Deployment files created successfully!")
    print("\nNext steps:")
    print("1. Run: python check_deployment.py")
    print("2. Set your API keys in HF Space secrets")
    print("3. Push to Hugging Face")
if __name__ == "__main__":
    # Generate the deployment files only when run as a script, not on import.
    create_deployment_files()