import os
import json
import argparse
import pandas as pd
from .scrape_fights import scrape_all_events, scrape_latest_events
from .scrape_fighters import scrape_all_fighters
from .to_csv import json_to_csv, fighters_json_to_csv
from .preprocess import preprocess_fighters_csv
from ..config import (
    OUTPUT_DIR,
    FIGHTERS_JSON_PATH,
    FIGHTERS_CSV_PATH,
    EVENTS_JSON_PATH,
    FIGHTS_CSV_PATH,
    LAST_EVENT_JSON_PATH
)
def main():
"""
Main function to run the scraping and preprocessing pipeline.
Supports both full scraping and incremental updates.
"""
parser = argparse.ArgumentParser(description="UFC Data Scraping Pipeline")
parser.add_argument(
'--mode',
type=str,
default='full',
choices=['full', 'update'],
help="Scraping mode: 'full' (complete scraping) or 'update' (latest events + sync from last_event.json)"
)
parser.add_argument(
'--num-events',
type=int,
default=5,
help="Number of latest events to scrape in update mode (default: 5)"
)
args = parser.parse_args()
# Ensure the output directory exists
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
print(f"Created directory: {OUTPUT_DIR}")
if args.mode == 'full':
run_full_pipeline()
elif args.mode == 'update':
run_update_pipeline(args.num_events)
def run_full_pipeline():
"""
Runs the complete scraping and preprocessing pipeline.
"""
print("\n=== Running FULL scraping pipeline ===")
# --- Step 1: Scrape all data from the website ---
# This will generate fighters.json and events.json
scrape_all_fighters(FIGHTERS_JSON_PATH)
scrape_all_events(EVENTS_JSON_PATH)
# --- Step 2: Convert the scraped JSON data to CSV format ---
# This will generate fighters.csv and fights.csv
json_to_csv(EVENTS_JSON_PATH, FIGHTS_CSV_PATH)
fighters_json_to_csv(FIGHTERS_JSON_PATH, FIGHTERS_CSV_PATH)
# --- Step 3: Run post-processing on the generated CSV files ---
# This cleans names, converts height, etc.
print("\n--- Running post-scraping preprocessing ---")
preprocess_fighters_csv()
# --- Step 4: Clean up temporary JSON files ---
print("\n--- Deleting temporary JSON files ---")
try:
if os.path.exists(EVENTS_JSON_PATH):
os.remove(EVENTS_JSON_PATH)
print(f"Deleted: {EVENTS_JSON_PATH}")
if os.path.exists(FIGHTERS_JSON_PATH):
os.remove(FIGHTERS_JSON_PATH)
print(f"Deleted: {FIGHTERS_JSON_PATH}")
except OSError as e:
print(f"Error deleting JSON files: {e}")
print("\n\n--- Full Scraping and Preprocessing Pipeline Finished ---")
def run_update_pipeline(num_events=5):
"""
Runs the incremental update pipeline to scrape only the latest events.
Also adds any events from last_event.json that aren't already in the CSV.
Args:
num_events (int): Number of latest events to scrape
"""
print(f"\n=== Running UPDATE pipeline for latest {num_events} events ===")
# --- Step 1: Scrape latest events only ---
latest_events = scrape_latest_events(LAST_EVENT_JSON_PATH, num_events)
    # --- Step 2: Save the scraped events to last_event.json (skipped if nothing was scraped) ---
if latest_events:
with open(LAST_EVENT_JSON_PATH, 'w') as f:
json.dump(latest_events, f, indent=4)
print(f"Latest {len(latest_events)} events saved to {LAST_EVENT_JSON_PATH}")
# --- Step 3: Always check and update from last_event.json ---
update_fights_csv_from_last_event()
print(f"\n--- Update Pipeline Finished ---")
def update_fights_csv_from_last_event():
"""
Updates the existing fights CSV with any events from last_event.json that aren't already present.
Ensures latest events are on top and preserves data types.
"""
# Check if last_event.json exists
if not os.path.exists(LAST_EVENT_JSON_PATH):
print(f"No {LAST_EVENT_JSON_PATH} found. Nothing to update.")
return
# Load events from last_event.json
try:
with open(LAST_EVENT_JSON_PATH, 'r') as f:
events_from_json = json.load(f)
if not events_from_json:
print("No events found in last_event.json.")
return
print(f"Found {len(events_from_json)} events in last_event.json")
    except (json.JSONDecodeError, OSError) as e:
print(f"Error reading last_event.json: {e}")
return
    # Define the temporary paths up front so the cleanup below can always run
    temp_json_path = os.path.join(OUTPUT_DIR, 'temp_latest.json')
    temp_csv_path = os.path.join(OUTPUT_DIR, 'temp_latest.csv')
    try:
        # Check if the main CSV exists
        if os.path.exists(FIGHTS_CSV_PATH):
            existing_df = pd.read_csv(FIGHTS_CSV_PATH)
            existing_event_names = set(existing_df['event_name'].unique())
        else:
            print(f"Main fights CSV ({FIGHTS_CSV_PATH}) not found. Creating new CSV from last_event.json.")
            json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
            return
        # Create a temporary CSV from the events in last_event.json
with open(temp_json_path, 'w') as f:
json.dump(events_from_json, f, indent=4)
json_to_csv(temp_json_path, temp_csv_path)
# Read the new CSV
new_df = pd.read_csv(temp_csv_path)
# Filter out events that already exist
new_events_df = new_df[~new_df['event_name'].isin(existing_event_names)]
if len(new_events_df) > 0:
# Add new events to the TOP of the CSV (latest first)
combined_df = pd.concat([new_events_df, existing_df], ignore_index=True)
# Convert date column to datetime for proper sorting
combined_df['event_date_parsed'] = pd.to_datetime(combined_df['event_date'])
# Sort by date descending (latest first)
combined_df = combined_df.sort_values('event_date_parsed', ascending=False)
# Drop the temporary date column
combined_df = combined_df.drop('event_date_parsed', axis=1)
# Fix data types to remove .0 from numbers
fix_data_types(combined_df)
combined_df.to_csv(FIGHTS_CSV_PATH, index=False)
print(f"Added {len(new_events_df)} new fights from {new_events_df['event_name'].nunique()} events to the TOP of {FIGHTS_CSV_PATH}")
else:
print("No new events found that aren't already in the existing CSV.")
    except Exception as e:
        print(f"Error updating fights CSV: {e}")
        print("Falling back to creating new CSV from last_event.json only.")
        json_to_csv(LAST_EVENT_JSON_PATH, FIGHTS_CSV_PATH)
    finally:
        # Remove the temporary files whether or not the update succeeded
        for temp_path in (temp_json_path, temp_csv_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)
def fix_data_types(df):
"""
    Fix float columns so whole numbers are not written with a trailing '.0'.
    Args:
        df (pandas.DataFrame): DataFrame to fix (modified in place)
"""
    for col in df.columns:
        if df[col].dtype == 'float64':
            # Whole-number columns with no missing values: cast to plain int64
            if df[col].notna().all() and (df[col] % 1 == 0).all():
                df[col] = df[col].astype('int64')
            elif df[col].isna().any():
                # Columns with missing values: render as strings so whole
                # numbers lose the trailing .0 and NaN cells become empty
                df[col] = df[col].fillna('').astype(str)
                df[col] = df[col].str.replace(r'\.0$', '', regex=True)
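
# Entry point guard so the pipeline can be run from the command line. The
# package-relative imports above mean this file must be executed as a module
# (python -m ...), not as a standalone script.
if __name__ == '__main__':
    main()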