# File size: 6,896 Bytes
# 7e0d9e9
"""
Gokul Ramanan
runs_api.py
8/6/2025
Description: RUNS API for extracting data from the Top_85_International_Run_Scorers.csv
file based on various selection parameters.
"""
import pandas as pd
class RUNSAPI:
    """Load yearly batting statistics from a CSV file and query them.

    ``load_runs`` reads and cleans the data into ``self.runs``;
    ``apply_filters`` returns a filtered per-player, per-year aggregate with
    cumulative career columns derived from it.
    """

    # Value of "Year" that flags a row as a prediction ("Is_Prediction").
    PREDICTION_YEAR = 2030
    # Added to a player's highest "Career Year" to position the prediction row.
    PREDICTION_CAREER_OFFSET = 5
    # Number of distinct "Format" values that counts as "all formats".
    ALL_FORMATS_COUNT = 3

    # Cleaned numeric column -> raw CSV column it is parsed from.
    _NUMERIC_COLUMNS = {
        "Runs": "Runs",
        "Innings": "Inns",
        "Not Outs": "NO",
        "Balls Faced": "BF",
        "Matches": "Mat",
        "100s": "100s",
        "50s": "50s",
    }

    # Yearly total column -> name of its per-player running-total column.
    _CUMULATIVE_COLUMNS = {
        "Matches": "cumulative_matches",
        "100s": "cumulative_100s",
        "50s": "cumulative_50s",
        "Runs": "cumulative_format_runs",
        "Innings": "cumulative_innings",
        "Not Outs": "cumulative_NO",
        "Balls Faced": "cumulative_format_BF",
    }

    def __init__(self):
        # Populated by load_runs(); apply_filters() requires it to be set.
        self.runs = None

    def load_runs(self, filename):
        """
        Load and preprocess the dataset from a CSV file.

        Parameters:
            filename (str): Path to the CSV file containing run statistics
                (anything accepted by ``pandas.read_csv``).

        Processing Steps:
            - Loads data into a pandas DataFrame.
            - Parses 'Runs', 'Inns', 'NO', 'BF', 'Mat', '100s' and '50s' as
              numbers, coercing non-numeric entries to NaN, and stores them
              as 'Runs', 'Innings', 'Not Outs', 'Balls Faced', 'Matches',
              '100s' and '50s' (the raw columns are kept).
            - Sorts the DataFrame by player name, format, and year.

        Returns:
            None: Replaces the `self.runs` attribute.
        """
        self.runs = pd.read_csv(filename)

        for clean_name, raw_name in self._NUMERIC_COLUMNS.items():
            self.runs[clean_name] = pd.to_numeric(self.runs[raw_name], errors="coerce")

        self.runs = self.runs.sort_values(by=["Name", "Format", "Year"])

    def apply_filters(self, formats=None, countries=None, year_range=None, top_n_players=None,
                      player_select_value=None, ranking_metric="Runs", career_length_slider=None,
                      only_all_formats=False, not_all_formats=False):
        """
        Filter the loaded data and aggregate it per player and year.

        `load_runs` must have been called first. Every filter is optional;
        a falsy value (None, empty list, 0, False) disables it.

        Parameters:
            formats (list): 'Format' values to keep (e.g., ['odi', 'test']).
            countries (list): 'Country' values to keep.
            year_range (list): [start_year, end_year], both inclusive.
            top_n_players (int): Keep only the top N players according to
                `ranking_metric`. Ignored when `player_select_value` is given.
            player_select_value (list): Player names to keep.
            ranking_metric (str): "Runs" (sum of runs in the filtered data)
                or "Average" (cumulative average in each player's latest
                year). Only consulted when `top_n_players` is applied.
            career_length_slider (list): [min_years, max_years]; keeps players
                whose number of distinct years in the already-filtered data
                lies in that inclusive range.
            only_all_formats (bool): Keep players who appear in exactly
                `ALL_FORMATS_COUNT` formats in the full (unfiltered) data.
            not_all_formats (bool): Keep players who appear in fewer than
                `ALL_FORMATS_COUNT` formats in the full (unfiltered) data.

        Returns:
            pd.DataFrame: One row per (Name, Year, Country) with yearly sums,
            cumulative totals, 'cumulative_format_average', 'cumulative_SR',
            'Is_Prediction', 'Career Year', 'Debut Year' and 'Debut Bin'.
            The previous index is kept as an 'index' column. The average and
            strike rate are NaN where their denominator is not positive.

        Raises:
            ValueError: If `top_n_players` is applied with an unknown
                `ranking_metric`.
        """
        df = self.runs.copy()

        # --- Row-level filters -------------------------------------------
        if formats:
            df = df[df["Format"].isin(formats)]
        if countries:
            df = df[df["Country"].isin(countries)]
        if year_range:
            start_year, end_year = year_range
            df = df[(df["Year"] >= start_year) & (df["Year"] <= end_year)]

        # --- Player-level filters ----------------------------------------
        if career_length_slider:
            min_years, max_years = career_length_slider
            career_lengths = df.groupby("Name")["Year"].nunique()
            in_range = (career_lengths >= min_years) & (career_lengths <= max_years)
            df = df[df["Name"].isin(career_lengths[in_range].index)]

        if only_all_formats or not_all_formats:
            # Format coverage is measured on the full dataset so that the
            # `formats` filter above does not change who qualifies.
            format_counts = self.runs.groupby("Name")["Format"].nunique()
            if only_all_formats:
                keep = format_counts[format_counts == self.ALL_FORMATS_COUNT].index
                df = df[df["Name"].isin(keep)]
            if not_all_formats:
                keep = format_counts[format_counts < self.ALL_FORMATS_COUNT].index
                df = df[df["Name"].isin(keep)]

        # --- Collapse formats into one row per player and year -----------
        df = df.groupby(["Name", "Year", "Country"], as_index=False).agg(
            {column: "sum" for column in self._NUMERIC_COLUMNS}
        )
        df = df.sort_values(by=["Name", "Year"])

        # --- Running career totals (rows are in Name/Year order) ---------
        for yearly, running in self._CUMULATIVE_COLUMNS.items():
            df[running] = df.groupby("Name")[yearly].cumsum()

        # Mask non-positive denominators so an undefined ratio becomes NaN
        # rather than inf; inf would otherwise top the "Average" ranking.
        dismissals = df["cumulative_innings"] - df["cumulative_NO"]
        df["cumulative_format_average"] = (
            df["cumulative_format_runs"] / dismissals.where(dismissals > 0)
        )
        balls_faced = df["cumulative_format_BF"]
        df["cumulative_SR"] = (
            df["cumulative_format_runs"] / balls_faced.where(balls_faced > 0)
        ) * 100

        # --- Career-year axis --------------------------------------------
        df["Is_Prediction"] = df["Year"] == self.PREDICTION_YEAR
        df["Career Year"] = df.groupby("Name").cumcount()
        df.loc[df["Is_Prediction"], "Career Year"] = (
            df.groupby("Name")["Career Year"].transform("max") + self.PREDICTION_CAREER_OFFSET
        )

        # --- Player selection (explicit names win over top-N) ------------
        # .copy() so the column assignments below act on an independent frame.
        if player_select_value:
            df = df[df["Name"].isin(player_select_value)].copy()
        elif top_n_players:
            top_players = self._top_player_names(df, top_n_players, ranking_metric)
            df = df[df["Name"].isin(top_players)].copy()

        # --- Debut year (earliest year remaining after filtering) --------
        debut_years = df.groupby("Name")["Year"].min()
        df["Debut Year"] = df["Name"].map(debut_years)
        df["Debut Bin"] = df["Debut Year"].apply(self._debut_bin)

        return df.reset_index()

    @staticmethod
    def _top_player_names(df, top_n_players, ranking_metric):
        """Return the names of the `top_n_players` best players in `df`."""
        if ranking_metric == "Runs":
            scores = df.groupby("Name")["Runs"].sum()
        elif ranking_metric == "Average":
            # Use final cumulative average for each player (latest year).
            latest_rows = df.sort_values(by=["Year"]).groupby("Name").tail(1)
            scores = latest_rows.set_index("Name")["cumulative_format_average"]
        else:
            raise ValueError(f"Unknown ranking_metric: {ranking_metric}")
        # NaN scores sort last, so players without a defined score rank lowest.
        return scores.sort_values(ascending=False).head(top_n_players).index

    @staticmethod
    def _debut_bin(year):
        """Return the 5-year range label containing `year`, or "Unknown"."""
        if pd.isnull(year):
            return "Unknown"
        start = int((year // 5) * 5)
        return f"{start}–{start + 4}"
def main():
    """Load the run-scorers CSV, apply no filters, and print the number of distinct players."""
    api = RUNSAPI()
    api.load_runs('Top_85_International_Run_Scorers.csv')

    # With every filter left at its default, all loaded players are kept.
    unfiltered = api.apply_filters()
    distinct_names = unfiltered["Name"].unique()
    print(len(distinct_names))
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()