#!/usr/bin/env python3
"""Download the FutureBench datasets from HuggingFace and build the evaluation queue.

NOTE: this module uses a relative import (``.config``), so it must be executed
as part of its package (``python -m <package>.<module>``) rather than as a
loose script.
"""

import os

import pandas as pd
from huggingface_hub import snapshot_download

from .config import (
    DATA_REPO,
    EVAL_REQUESTS_PATH,
    EVAL_RESULTS_PATH,
    PREDICTIONS_CSV_PATH,
    QUEUE_REPO,
    RESULTS_REPO,
    TOKEN,
)


def _download_repo(repo_id, local_dir, label):
    """Snapshot one HuggingFace dataset repo into ``local_dir``.

    Args:
        repo_id: Repository id passed to ``snapshot_download``.
        local_dir: Local directory the snapshot is written to.
        label: Lower-case human-readable name used in the progress messages
            (e.g. ``"eval requests"``).

    Returns:
        ``True`` if the download finished, ``False`` if it raised.
    """
    try:
        print(f"Downloading {label} to {local_dir}")
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
        print(f"✓ {label.capitalize()} downloaded successfully")
    except Exception as e:
        # Deliberately best-effort: one failing repo must not prevent the
        # remaining downloads from being attempted.
        print(f"Error downloading {label}: {e}")
        return False
    return True


def download_datasets():
    """Download datasets from HuggingFace repositories.

    Fetches the eval-request queue, the eval results and the prediction data,
    in that order. Each download is attempted independently; failures are
    printed and not raised.
    """
    print("Downloading datasets from HuggingFace...")

    targets = (
        (QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests"),
        (RESULTS_REPO, EVAL_RESULTS_PATH, "eval results"),
        (DATA_REPO, PREDICTIONS_CSV_PATH, "prediction data"),
    )
    for repo_id, local_dir, label in targets:
        _download_repo(repo_id, local_dir, label)


def process_data():
    """Load ``data.csv`` from the predictions directory and summarise it.

    Returns:
        ``(df, summary)`` where ``df`` is the loaded DataFrame (with
        ``open_to_bet_until`` and ``prediction_created_at`` parsed as
        datetimes) and ``summary`` is a dict of counts and unique values;
        ``(None, None)`` if ``data.csv`` does not exist.
    """
    print("Processing downloaded data...")

    # Load the main dataset
    csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
    if not os.path.exists(csv_path):
        print(f"Error: data.csv not found at {csv_path}")
        return None, None

    print(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path)

    # Convert date columns
    for column in ("open_to_bet_until", "prediction_created_at"):
        df[column] = pd.to_datetime(df[column])

    print(f"Loaded {len(df)} records")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Unique dates for prediction windows. Missing timestamps (NaT) are
    # dropped first: they cannot be ordered against datetime.date values,
    # so leaving them in would break or corrupt the sort.
    prediction_dates = sorted(df["open_to_bet_until"].dropna().dt.date.unique())
    print(f"Prediction dates: {prediction_dates}")

    # Get unique algorithms/models
    algorithms = df["algorithm_name"].unique()
    print(f"Algorithms: {algorithms}")

    # Get unique event types
    event_types = df["event_type"].unique()
    print(f"Event types: {event_types}")

    # Create a summary of the data
    summary = {
        "total_records": len(df),
        "unique_events": df["event_id"].nunique(),
        "unique_algorithms": len(algorithms),
        "unique_event_types": len(event_types),
        "prediction_dates": prediction_dates,
        "algorithms": algorithms.tolist(),
        "event_types": event_types.tolist(),
    }

    print("\n=== Data Summary ===")
    for key, value in summary.items():
        print(f"{key}: {value}")

    return df, summary


def generate_queue(df):
    """Generate the evaluation queue from processed data.

    Collapses ``df`` to one row per ``event_id`` (taking the ``first``
    aggregate of each kept column), prints pending/resolved counts and writes
    the result to ``evaluation_queue.csv`` inside the predictions directory.

    Args:
        df: DataFrame containing at least ``event_id``, ``question``,
            ``event_type``, ``answer_options``, ``result`` and
            ``open_to_bet_until`` columns.

    Returns:
        The per-event DataFrame that was written to disk (all events, both
        pending and resolved).
    """
    print("Generating evaluation queue...")

    # Get unique events that need evaluation
    kept_columns = ("question", "event_type", "answer_options", "result", "open_to_bet_until")
    unique_events = (
        df.groupby("event_id")
        .agg({column: "first" for column in kept_columns})
        .reset_index()
    )

    # An event counts as pending while it has no recorded result.
    is_pending = unique_events["result"].isna()
    pending_count = int(is_pending.sum())
    resolved_count = len(unique_events) - pending_count

    print(f"Total unique events: {len(unique_events)}")
    print(f"Pending events: {pending_count}")
    print(f"Resolved events: {resolved_count}")

    # Save queue locally; make sure the target directory exists so the write
    # does not fail when this function is called without a prior download.
    os.makedirs(PREDICTIONS_CSV_PATH, exist_ok=True)
    queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
    unique_events.to_csv(queue_path, index=False)
    print(f"✓ Queue saved to {queue_path}")

    return unique_events


def main():
    """Main function to download and process data."""
    print("=== FutureBench Data Download and Processing ===")

    # Download datasets
    download_datasets()

    # Process data
    df, summary = process_data()
    if df is None:
        print("❌ Failed to process data. Exiting.")
        return

    # Generate queue
    queue = generate_queue(df)

    print("\n=== Processing Complete ===")
    print("Data processed and queue generated successfully!")
    print(f"Queue contains {len(queue)} events")


if __name__ == "__main__":
    main()