"""
Command-line entry point for the UFC data scraping pipeline.

Supports a full scrape (--mode full) and an incremental update of the
latest events (--mode update --num-events N).
"""
import os
import json
import argparse

import pandas as pd

from .scrape_fights import scrape_all_events, scrape_latest_events
from .scrape_fighters import scrape_all_fighters
from .to_csv import json_to_csv, fighters_json_to_csv
from .preprocess import preprocess_fighters_csv
from ..config import (
    OUTPUT_DIR,
    FIGHTERS_JSON_PATH,
    EVENTS_JSON_PATH,
    FIGHTS_CSV_PATH,
    FIGHTERS_CSV_PATH,  # used by run_full_pipeline; was missing from this import
    LAST_EVENT_JSON_PATH,
)


def main():
    """
    Main function to run the scraping and preprocessing pipeline.
    Supports both full scraping and incremental updates.
    """
    parser = argparse.ArgumentParser(description="UFC Data Scraping Pipeline")
    parser.add_argument(
        '--mode',
        type=str,
        default='full',
        choices=['full', 'update'],
        help="Scraping mode: 'full' (complete scraping) or 'update' (latest events + sync from last_event.json)"
    )
    parser.add_argument(
        '--num-events',
        type=int,
        default=5,
        help="Number of latest events to scrape in update mode (default: 5)"
    )
    args = parser.parse_args()

    # Ensure the output directory exists
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created directory: {OUTPUT_DIR}")

    if args.mode == 'full':
        run_full_pipeline()
    elif args.mode == 'update':
        run_update_pipeline(args.num_events)


def run_full_pipeline():
    """
    Runs the complete scraping and preprocessing pipeline.
    """
    print("\n=== Running FULL scraping pipeline ===")

    # --- Step 1: Scrape all data from the website ---
    # This generates fighters.json and events.json.
    scrape_all_fighters(FIGHTERS_JSON_PATH)
    scrape_all_events(EVENTS_JSON_PATH)

    # --- Step 2: Convert the scraped JSON data to CSV format ---
    # This generates fighters.csv and fights.csv.
    json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
    fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)

    # --- Step 3: Run post-processing on the generated CSV files ---
    # This cleans names, converts height, etc.
    print("\n--- Running post-scraping preprocessing ---")
    preprocess_fighters_csv()

    # --- Step 4: Clean up temporary JSON files ---
    print("\n--- Deleting temporary JSON files ---")
    try:
        if os.path.exists(EVENTS_JSON_PATH):
            os.remove(EVENTS_JSON_PATH)
            print(f"Deleted: {EVENTS_JSON_PATH}")
        if os.path.exists(FIGHTERS_JSON_PATH):
            os.remove(FIGHTERS_JSON_PATH)
            print(f"Deleted: {FIGHTERS_JSON_PATH}")
    except OSError as e:
        print(f"Error deleting JSON files: {e}")

    print("\n\n--- Full Scraping and Preprocessing Pipeline Finished ---")


def run_update_pipeline(num_events=5):
    """
    Runs the incremental update pipeline to scrape only the latest events.
    Also adds any events from last_event.json that aren't already in the CSV.

    Args:
        num_events (int): Number of latest events to scrape
    """
    print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")

    # --- Step 1: Scrape latest events only ---
    latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)

    # --- Step 2: Save latest events to last_event.json (skipped if nothing was scraped) ---
    if latest_events:
        with open(LAST_EVENT_JSON_PATH, 'w') as f:
            json.dump(latest_events, f, indent=4)
        print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")

    # --- Step 3: Always check and update from last_event.json ---
    update_fights_csv_from_last_event()

    print("\n--- Update Pipeline Finished ---")


def update_fights_csv_from_last_event():
    """
    Updates the existing fights CSV with any events from last_event.json
    that aren't already present. Ensures latest events are on top and
    preserves data types.
    """
    # Check if last_event.json exists
    if not os.path.exists(LAST_EVENT_JSON_PATH):
        print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
        return

    # Load events from last_event.json
    try:
        with open(LAST_EVENT_JSON_PATH, 'r') as f:
            events_from_json = json.load(f)
        if not events_from_json:
            print("No events found in last_event.json.")
            return
        print(f"Found {len(events_from_json)} events in last_event.json")
    except Exception as e:
        print(f"Error reading last_event.json: {e}")
        return

    try:
        # Check if the main CSV exists
        if os.path.exists(FIGHTS_CSV_PATH):
            existing_df = pd.read_csv(FIGHTS_CSV_PATH)
            existing_event_names = set(existing_df['event_name'].unique())
        else:
            print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
            json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
            return

        # Create a temporary CSV from the events in last_event.json
        temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
        temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')

        with open(temp_json_path, 'w') as f:
            json.dump(events_from_json, f, indent=4)

        json_to_csv(temp_json_path, temp_csv_path)

        # Read the new CSV
        new_df = pd.read_csv(temp_csv_path)

        # Filter out events that already exist
        new_events_df = new_df[~new_df['event_name'].isin(existing_event_names)]

        if len(new_events_df) > 0:
            # Add new events to the TOP of the CSV (latest first)
            combined_df = pd.concat([new_events_df, existing_df], ignore_index=True)

            # Parse the date column so the sort is chronological, not lexicographic
            combined_df['event_date_parsed'] = pd.to_datetime(combined_df['event_date'])

            # Sort by date descending (latest first), then drop the helper column
            combined_df = combined_df.sort_values('event_date_parsed', ascending=False)
            combined_df = combined_df.drop('event_date_parsed', axis=1)

            # Fix data types in place to remove the '.0' suffix from whole numbers
            fix_data_types(combined_df)

            combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
            print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
        else:
            print("No new events found that aren't already in the existing CSV.")

        # Clean up temporary files
        if os.path.exists(temp_json_path):
            os.remove(temp_json_path)
        if os.path.exists(temp_csv_path):
            os.remove(temp_csv_path)

    except Exception as e:
        print(f"Error updating fights CSV: {e}")
        print("Falling back to creating new CSV from last_event.json only.")
        json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)


def fix_data_types(df):
    """
    Fixes data types in the DataFrame (in place) so that whole-number float
    columns don't serialize with a trailing '.0'.

    Args:
        df (pandas.DataFrame): DataFrame to fix
    """
    for col in df.columns:
        if df[col].dtype == 'float64':
            if df[col].notna().all() and (df[col] % 1 == 0).all():
                # No missing values and only whole numbers: safe to cast to int
                df[col] = df[col].astype('int64')
            elif df[col].isna().any():
                # Columns with missing values can't be cast to int64, so keep
                # them as strings to avoid the '.0' suffix; NaN becomes an
                # empty string, which round-trips as an empty CSV cell.
                df[col] = df[col].fillna('').astype(str)
                df[col] = df[col].str.replace(r'\.0$', '', regex=True)
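

# Entry-point guard: because this module uses relative imports, it has to be
# run with `-m` from the package root, e.g.
#   python -m <package>.main --mode full
#   python -m <package>.main --mode update --num-events 3
# (`<package>` is a placeholder for this module's actual package name.)
if __name__ == '__main__':
    main()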