from datetime import timedelta

import pandas as pd

from .about import Tasks
from .display_utils import format_percentage, make_clickable_model


def clean_model_name(model_name: str) -> str:
    """Clean up model names for better display"""
    if model_name.startswith("smolagents-tavily-web-visit-"):
        return "Agent Baseline " + model_name.removeprefix("smolagents-tavily-web-visit-")
    if model_name.startswith("language-model-"):
        return "Language Model " + model_name.removeprefix("language-model-")
    return model_name
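
# Illustrative uses of the prefix mapping above (these model names are
# invented, not from the original file):
#   clean_model_name("language-model-gpt-4o")              -> "Language Model gpt-4o"
#   clean_model_name("smolagents-tavily-web-visit-gpt-4o") -> "Agent Baseline gpt-4o"
#   clean_model_name("some-other-model")                   -> "some-other-model" (unchanged)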


def get_available_weeks(predictions_df):
    """Get list of available weeks from the data"""
    if predictions_df is None or predictions_df.empty:
        return []
    # Get unique dates and convert to weeks
    dates = predictions_df["open_to_bet_until"].dt.date.unique()
    weeks = {}
    for date in dates:
        # Get the Monday of the week for this date
        monday = date - timedelta(days=date.weekday())
        week_end = monday + timedelta(days=6)
        week_key = f"{monday} to {week_end}"
        week_range = (monday, week_end)
        weeks[week_key] = week_range
    # Sort by date
    sorted_weeks = sorted(weeks.items(), key=lambda x: x[1][0])
    return [("All Time", None)] + sorted_weeks


def filter_data_by_week(predictions_df, week_range):
    """Filter predictions data by week range"""
    if predictions_df is None or predictions_df.empty or week_range is None:
        return predictions_df
    start_date, end_date = week_range
    # Filter data where open_to_bet_until falls within the week
    filtered_df = predictions_df[
        (predictions_df["open_to_bet_until"].dt.date >= start_date)
        & (predictions_df["open_to_bet_until"].dt.date <= end_date)
    ]
    return filtered_df


def create_leaderboard_df(predictions_df, week_filter=None):
    """
    Create leaderboard DataFrame from predictions CSV data
    Much simpler than Future-Bench's complex JSON parsing
    """
    if predictions_df is None or predictions_df.empty:
        return pd.DataFrame()
    # Apply week filter if specified
    if week_filter is not None:
        predictions_df = filter_data_by_week(predictions_df, week_filter)
        if predictions_df.empty:
            return pd.DataFrame()
    # Calculate accuracy by algorithm and event type
    results = []
    # Group by algorithm to calculate metrics
    for algorithm in predictions_df["algorithm_name"].unique():
        algo_data = predictions_df[predictions_df["algorithm_name"] == algorithm]
        # Filter out rows where result is null (unresolved events)
        resolved_data = algo_data[algo_data["result"].notna()]
        if len(resolved_data) == 0:
            continue
        # Calculate accuracy for each event type
        cleaned_algorithm = clean_model_name(algorithm)
        algo_scores = {
            "Model": make_clickable_model(cleaned_algorithm),
            "Events": len(resolved_data),
            "Correct Predictions": 0,
        }
        task_scores = []
        for task in Tasks:
            task_data = resolved_data[resolved_data["event_type"] == task.value.benchmark]
            if len(task_data) > 0:
                # Calculate accuracy for this task
                # Handle different prediction formats
                correct = 0
                total = len(task_data)
                for _, row in task_data.iterrows():
                    prediction = row["actual_prediction"]
                    actual = row["result"]
                    # Simple string comparison for now
                    # Could be enhanced for more complex prediction formats
                    if str(prediction).lower().strip() == str(actual).lower().strip():
                        correct += 1
                accuracy = (correct / total) * 100 if total > 0 else 0
                algo_scores[task.value.col_name] = accuracy
                task_scores.append(accuracy)
                # Add to total correct predictions
                algo_scores["Correct Predictions"] += correct
            else:
                algo_scores[task.value.col_name] = None
        # Calculate average accuracy across tasks where model made predictions
        if task_scores:
            algo_scores["Average"] = sum(task_scores) / len(task_scores)
        else:
            algo_scores["Average"] = 0
        results.append(algo_scores)
    # Create DataFrame
    df = pd.DataFrame(results)
    # Sort by average score (descending)
    if "Average" in df.columns:
        df = df.sort_values("Average", ascending=False)
    # Reset index to ensure proper row indexing
    df = df.reset_index(drop=True)
    # Add rank column with medals for top 3 and numbers for rest
    ranks = []
    for i in range(len(df)):
        if i == 0:
            ranks.append("🥇")
        elif i == 1:
            ranks.append("🥈")
        elif i == 2:
            ranks.append("🥉")
        else:
            ranks.append(f"#{i + 1}")
    # Insert rank column at the beginning
    df.insert(0, "Rank", ranks)
    # Format percentage columns
    for task in Tasks:
        if task.value.col_name in df.columns:
            df[task.value.col_name] = df[task.value.col_name].apply(format_percentage)
    if "Average" in df.columns:
        df["Average"] = df["Average"].apply(format_percentage)
    return df
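
# Shape of the frame returned above, per the code: one row per algorithm with
# columns "Rank", "Model", "Events", "Correct Predictions", one accuracy column
# per task (formatted by format_percentage), and "Average"; rows are sorted by
# average accuracy, with medal emoji for the top three ranks.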


def get_leaderboard_summary(df):
    """Get summary statistics for the leaderboard"""
    if df is None or df.empty:
        return {"total_models": 0, "total_predictions": 0, "avg_accuracy": 0}
    total_models = len(df)
    total_predictions = df["Events"].sum() if "Events" in df.columns else 0
    # Calculate average accuracy across all models
    avg_accuracy = 0
    if "Average" in df.columns:
        # Extract numeric values from percentage strings
        numeric_scores = []
        for score in df["Average"]:
            if score != "N/A":
                try:
                    numeric_scores.append(float(score.replace("%", "")))
                except Exception:
                    pass
        if numeric_scores:
            avg_accuracy = sum(numeric_scores) / len(numeric_scores)
    return {"total_models": total_models, "total_predictions": total_predictions, "avg_accuracy": avg_accuracy}


def filter_leaderboard(df, min_predictions=0):
    """Filter leaderboard by minimum number of predictions"""
    if df is None or df.empty:
        return df
    if "Events" in df.columns:
        return df[df["Events"] >= min_predictions]
    return df
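

if __name__ == "__main__":
    # Minimal smoke test, not part of the original module. Because of the
    # relative imports at the top, run it as a module from the package root,
    # e.g. `python -m <package>.<this_module>`. The "algorithm_name" and
    # "open_to_bet_until" column names come from the functions above; the
    # dates and the model name are invented for illustration.
    demo = pd.DataFrame(
        {
            "algorithm_name": ["language-model-demo"] * 2,
            "open_to_bet_until": pd.to_datetime(["2025-01-08", "2025-01-15"]),
        }
    )
    weeks = get_available_weeks(demo)
    print(weeks[0])  # ("All Time", None)
    print(weeks[1][0])  # "2025-01-06 to 2025-01-12"
    first_week = weeks[1][1]
    print(len(filter_data_by_week(demo, first_week)))  # 1 row falls in that week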