from datetime import timedelta

import pandas as pd

from .about import Tasks
from .display_utils import format_percentage, make_clickable_model


def clean_model_name(model_name: str) -> str:
    """Clean up model names for better display"""
    if model_name.startswith("smolagents-tavily-web-visit-"):
        return "Agent Baseline " + model_name.removeprefix("smolagents-tavily-web-visit-")
    if model_name.startswith("language-model-"):
        return "Language Model " + model_name.removeprefix("language-model-")
    return model_name


def get_available_weeks(predictions_df):
    """Get list of available weeks from the data"""
    if predictions_df is None or predictions_df.empty:
        return []

    # Get unique dates and convert to weeks
    dates = predictions_df["open_to_bet_until"].dt.date.unique()
    weeks = {}

    for date in dates:
        # Get the Monday of the week for this date
        monday = date - timedelta(days=date.weekday())
        week_end = monday + timedelta(days=6)
        week_key = f"{monday} to {week_end}"
        week_range = (monday, week_end)
        weeks[week_key] = week_range

    # Sort by date
    sorted_weeks = sorted(weeks.items(), key=lambda x: x[1][0])
    return [("All Time", None)] + sorted_weeks


def filter_data_by_week(predictions_df, week_range):
    """Filter predictions data by week range"""
    if predictions_df is None or predictions_df.empty or week_range is None:
        return predictions_df

    start_date, end_date = week_range

    # Filter data where open_to_bet_until falls within the week
    filtered_df = predictions_df[
        (predictions_df["open_to_bet_until"].dt.date >= start_date)
        & (predictions_df["open_to_bet_until"].dt.date <= end_date)
    ]
    return filtered_df


def create_leaderboard_df(predictions_df, week_filter=None):
    """
    Create leaderboard DataFrame from predictions CSV data

    Much simpler than Future-Bench's complex JSON parsing
    """
    if predictions_df is None or predictions_df.empty:
        return pd.DataFrame()

    # Apply week filter if specified
    if week_filter is not None:
        predictions_df = filter_data_by_week(predictions_df, week_filter)
        if predictions_df.empty:
            return pd.DataFrame()

    # Calculate accuracy by algorithm and event type
    results = []

    # Group by algorithm to calculate metrics
    for algorithm in predictions_df["algorithm_name"].unique():
        algo_data = predictions_df[predictions_df["algorithm_name"] == algorithm]

        # Filter out rows where result is null (unresolved events)
        resolved_data = algo_data[algo_data["result"].notna()]

        if len(resolved_data) == 0:
            continue

        # Calculate accuracy for each event type
        cleaned_algorithm = clean_model_name(algorithm)
        algo_scores = {
            "Model": make_clickable_model(cleaned_algorithm),
            "Events": len(resolved_data),
            "Correct Predictions": 0,
        }

        task_scores = []

        for task in Tasks:
            task_data = resolved_data[resolved_data["event_type"] == task.value.benchmark]

            if len(task_data) > 0:
                # Calculate accuracy for this task
                # Handle different prediction formats
                correct = 0
                total = len(task_data)

                for _, row in task_data.iterrows():
                    prediction = row["actual_prediction"]
                    actual = row["result"]

                    # Simple string comparison for now
                    # Could be enhanced for more complex prediction formats
                    if str(prediction).lower().strip() == str(actual).lower().strip():
                        correct += 1

                accuracy = (correct / total) * 100 if total > 0 else 0
                algo_scores[task.value.col_name] = accuracy
                task_scores.append(accuracy)

                # Add to total correct predictions
                algo_scores["Correct Predictions"] += correct
            else:
                algo_scores[task.value.col_name] = None

        # Calculate average accuracy across tasks where model made predictions
        if task_scores:
            algo_scores["Average"] = sum(task_scores) / len(task_scores)
        else:
            algo_scores["Average"] = 0

        results.append(algo_scores)

    # Create DataFrame
    df = pd.DataFrame(results)

    # Sort by average score (descending)
    if "Average" in df.columns:
        df = df.sort_values("Average", ascending=False)

    # Reset index to ensure proper row indexing
    df = df.reset_index(drop=True)

    # Add rank column with medals for top 3 and numbers for rest
    ranks = []
    for i in range(len(df)):
        if i == 0:
            ranks.append("🥇")
        elif i == 1:
            ranks.append("🥈")
        elif i == 2:
            ranks.append("🥉")
        else:
            ranks.append(f"#{i + 1}")

    # Insert rank column at the beginning
    df.insert(0, "Rank", ranks)

    # Format percentage columns
    for task in Tasks:
        if task.value.col_name in df.columns:
            df[task.value.col_name] = df[task.value.col_name].apply(format_percentage)

    if "Average" in df.columns:
        df["Average"] = df["Average"].apply(format_percentage)

    return df


def get_leaderboard_summary(df):
    """Get summary statistics for the leaderboard"""
    if df is None or df.empty:
        return {"total_models": 0, "total_predictions": 0, "avg_accuracy": 0}

    total_models = len(df)
    total_predictions = df["Events"].sum() if "Events" in df.columns else 0

    # Calculate average accuracy across all models
    avg_accuracy = 0
    if "Average" in df.columns:
        # Extract numeric values from percentage strings
        numeric_scores = []
        for score in df["Average"]:
            if score != "N/A":
                try:
                    numeric_scores.append(float(score.replace("%", "")))
                except Exception:
                    pass
        if numeric_scores:
            avg_accuracy = sum(numeric_scores) / len(numeric_scores)

    return {"total_models": total_models, "total_predictions": total_predictions, "avg_accuracy": avg_accuracy}


def filter_leaderboard(df, min_predictions=0):
    """Filter leaderboard by minimum number of predictions"""
    if df is None or df.empty:
        return df

    if "Events" in df.columns:
        return df[df["Events"] >= min_predictions]

    return df