""" Gokul Ramanan runs_api.py 8/6/2025 Description: RUNS API for extracting data from the Top_85_International_Run_Scorers.csv file based on various selection parameters. """ import pandas as pd class RUNSAPI: def __init__(self): self.runs = None def load_runs(self, filename): """ Load and preprocess the dataset from a CSV file. Parameters: filename (str): Path to the CSV file containing run statistics. Processing Steps: - Loads data into a pandas DataFrame. - Converts the 'Runs' column to numeric, coercing non-numeric entries to NaN. - Sorts the DataFrame by player name, format, and year for accurate cumulative calculations. Returns: None: Modifies the `self.runs` attribute in-place. """ self.runs = pd.read_csv(filename) # Clean: Ensure 'Runs' is numeric self.runs["Runs"] = pd.to_numeric(self.runs["Runs"], errors="coerce") self.runs["Innings"] = pd.to_numeric(self.runs["Inns"], errors="coerce") self.runs["Not Outs"] = pd.to_numeric(self.runs["NO"], errors="coerce") self.runs["Balls Faced"] = pd.to_numeric(self.runs["BF"], errors="coerce") self.runs["Matches"] = pd.to_numeric(self.runs["Mat"], errors="coerce") self.runs["100s"] = pd.to_numeric(self.runs["100s"], errors="coerce") self.runs["50s"] = pd.to_numeric(self.runs["50s"], errors="coerce") # Clean: Sort for proper cumulative computation self.runs = self.runs.sort_values(by=["Name", "Format", "Year"]) def apply_filters(self, formats=None, countries=None, year_range=None, top_n_players = None, player_select_value = None, ranking_metric="Runs", career_length_slider = None, only_all_formats=False, not_all_formats=False): """ Filter the dataset based on selected formats, countries, year range, and top N players by run total. Parameters: formats (list): List of formats to filter (e.g., ['odi', 'test']). countries (list): List of country names to include. year_range (list): [start_year, end_year] for filtering. top_n_players (int): Keep only top N run-scorers across filtered data. Returns: pd.DataFrame: Filtered and aggregated DataFrame with cumulative runs. """ df = self.runs.copy() if formats: df = df[df["Format"].isin(formats)] if countries: df = df[df["Country"].isin(countries)] if year_range: start_year, end_year = year_range df = df[(df["Year"] >= start_year) & (df["Year"] <= end_year)] if career_length_slider: min_years, max_years = career_length_slider career_lengths = df.groupby("Name")["Year"].nunique() valid_players = career_lengths[(career_lengths >= min_years) & (career_lengths <= max_years)].index df = df[df["Name"].isin(valid_players)] if only_all_formats: player_format_counts = self.runs.groupby("Name")["Format"].nunique() eligible_players = player_format_counts[player_format_counts == 3].index df = df[df["Name"].isin(eligible_players)] if not_all_formats: player_format_counts = self.runs.groupby("Name")["Format"].nunique() eligible_players = player_format_counts[player_format_counts < 3].index df = df[df["Name"].isin(eligible_players)] df = df.groupby(["Name", "Year", "Country"], as_index=False).agg({"Runs": "sum", "Innings": "sum", "Not Outs": "sum", "Balls Faced": "sum", "Matches": "sum", "100s": "sum", "50s": "sum"}) df = df.sort_values(by=["Name", "Year"]) df["cumulative_matches"] = df.groupby("Name")["Matches"].cumsum() df["cumulative_100s"] = df.groupby("Name")["100s"].cumsum() df["cumulative_50s"] = df.groupby("Name")["50s"].cumsum() df["cumulative_format_runs"] = df.groupby("Name")["Runs"].cumsum() df["cumulative_innings"] = df.groupby("Name")["Innings"].cumsum() df["cumulative_NO"] = df.groupby("Name")["Not Outs"].cumsum() df["cumulative_format_BF"] = df.groupby("Name")["Balls Faced"].cumsum() df["cumulative_format_average"] = df["cumulative_format_runs"] / (df["cumulative_innings"] - df["cumulative_NO"]) df["cumulative_SR"] = df.apply( lambda row: (row["cumulative_format_runs"] / row["cumulative_format_BF"]) * 100 if pd.notnull(row["cumulative_format_BF"]) and row["cumulative_format_BF"] > 0 else None, axis=1 ) df["Is_Prediction"] = df["Year"] == 2030 df = df.sort_values(by=["Name", "Year"]) df["Career Year"] = df.groupby("Name").cumcount() df.loc[df["Is_Prediction"], "Career Year"] = df.groupby("Name")["Career Year"].transform("max") + 5 if player_select_value: df = df[df["Name"].isin(player_select_value)] elif top_n_players: if ranking_metric == "Runs": metric_series = df.groupby("Name")["Runs"].sum() elif ranking_metric == "Average": # Use final cumulative average for each player (latest year) last_year_df = df.sort_values(by=["Year"]).groupby("Name").tail(1) metric_series = last_year_df.set_index("Name")["cumulative_format_average"] else: raise ValueError(f"Unknown ranking_metric: {ranking_metric}") top_players = metric_series.sort_values(ascending=False).head(top_n_players).index df = df[df["Name"].isin(top_players)] # Calculate debut year for each player debut_years = df.groupby("Name")["Year"].min() df["Debut Year"] = df["Name"].map(debut_years) # Bin debut years into 5-year ranges def bin_debut_year(year): if pd.isnull(year): return "Unknown" start = int((year // 5) * 5) return f"{start}–{start + 4}" df["Debut Bin"] = df["Debut Year"].apply(bin_debut_year) return df.reset_index() def main(): high_international_runs = RUNSAPI() high_international_runs.load_runs('Top_85_International_Run_Scorers.csv') filtered_df = high_international_runs.apply_filters() print(len(filtered_df["Name"].unique())) if __name__ == "__main__": main()