import requests
from bs4 import BeautifulSoup
import json
import time
import concurrent.futures

from ..config import EVENTS_JSON_PATH

# --- Configuration ---
# The number of parallel threads to use for scraping fight details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10

# The delay in seconds between each request to a fight's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---

BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"


def get_soup(url):
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes
    return BeautifulSoup(response.text, 'html.parser')
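
# --- Optional hardening (a sketch, not wired into the scraper below) ---
# get_soup() issues a bare requests.get() with no timeout and no retries,
# so a single dropped connection can abort a long scrape. The function
# below is a hypothetical drop-in alternative using requests' Session with
# urllib3's Retry policy; the name and defaults are illustrative.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def get_soup_with_retries(url, timeout=30):
    session = requests.Session()
    # Retry transient failures (connection errors, 429 and 5xx responses)
    # with exponential backoff between attempts.
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
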

def scrape_fight_details(fight_url):
    print(f"  Scraping fight: {fight_url}")
    soup = get_soup(fight_url)

    # Upcoming fight pages carry a specific div. If it exists, skip the page.
    if soup.find('div', class_='b-fight-details__content-abbreviated'):
        print(f"  Upcoming fight, no details available: {fight_url}")
        return None

    tables = soup.find_all('table', class_='b-fight-details__table')
    if not tables:
        print(f"  No stats tables found on {fight_url}")
        return None

    fight_details = {"fighter_1_stats": {}, "fighter_2_stats": {}}

    # Helper to extract stats. The stats for both fighters sit in two <p>
    # tags within a single <td>, one value per fighter.
    def extract_stats_from_cell(cell, col_name):
        ps = cell.find_all('p')
        if len(ps) == 2:
            fight_details["fighter_1_stats"][col_name] = ps[0].text.strip()
            fight_details["fighter_2_stats"][col_name] = ps[1].text.strip()

    # --- Totals Table ---
    # The first table contains overall stats.
    totals_table = tables[0]
    totals_tbody = totals_table.find('tbody')
    if totals_tbody:
        totals_row = totals_tbody.find('tr')
        if totals_row:
            totals_cols = totals_row.find_all('td')
            stat_cols = {
                1: 'kd', 2: 'sig_str', 3: 'sig_str_percent',
                4: 'total_str', 5: 'td', 6: 'td_percent',
                7: 'sub_att', 8: 'rev', 9: 'ctrl'
            }
            for index, name in stat_cols.items():
                if index < len(totals_cols):
                    extract_stats_from_cell(totals_cols[index], name)

    # --- Significant Strikes Table ---
    # The second table contains significant strike details.
    if len(tables) > 1:
        sig_strikes_table = tables[1]
        sig_strikes_tbody = sig_strikes_table.find('tbody')
        if sig_strikes_tbody:
            sig_strikes_row = sig_strikes_tbody.find('tr')
            if sig_strikes_row:
                sig_strikes_cols = sig_strikes_row.find_all('td')
                stat_cols = {
                    2: 'sig_str_head', 3: 'sig_str_body', 4: 'sig_str_leg',
                    5: 'sig_str_distance', 6: 'sig_str_clinch',
                    7: 'sig_str_ground'
                }
                for index, name in stat_cols.items():
                    if index < len(sig_strikes_cols):
                        extract_stats_from_cell(sig_strikes_cols[index], name)

    return fight_details


def fetch_fight_details_worker(fight_url):
    """
    Worker function for the thread pool. Scrapes details for a single fight
    and applies a delay to be polite to the server.
    """
    try:
        details = scrape_fight_details(fight_url)
        time.sleep(REQUEST_DELAY)
        return details
    except Exception as e:
        print(f"  Could not scrape fight details for {fight_url}: {e}")
        time.sleep(REQUEST_DELAY)  # Also sleep on failure to be safe
        return None
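
# For reference, a successful scrape_fight_details() call returns a dict of
# raw strings keyed by the stat_cols names above. The values below are
# illustrative only, not real data:
#
#     {
#         "fighter_1_stats": {"kd": "0", "sig_str": "58 of 112",
#                             "sig_str_percent": "51%", "td": "2 of 5",
#                             "ctrl": "4:13", "sig_str_head": "30 of 70", ...},
#         "fighter_2_stats": {...}
#     }
#
# Compound values such as "58 of 112" are left unparsed, so downstream
# consumers must split landed/attempted counts themselves.
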

def scrape_event_details(event_url):
    print(f"Scraping event: {event_url}")
    soup = get_soup(event_url)
    event_details = {}

    # Extract event name
    event_details['name'] = soup.find('h2', class_='b-content__title').text.strip()

    # Extract event date and location
    info_list = soup.find('ul', class_='b-list__box-list')
    list_items = info_list.find_all('li', class_='b-list__box-list-item')
    event_details['date'] = list_items[0].text.split(':')[1].strip()
    event_details['location'] = list_items[1].text.split(':')[1].strip()

    # Step 1: Gather base info and URLs for all fights on the event page.
    fights_to_process = []
    fight_table = soup.find('table', class_='b-fight-details__table')
    if fight_table:
        rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
        for row in rows:
            cols = row.find_all('td', class_='b-fight-details__table-col')
            fighter1 = cols[1].find_all('p')[0].text.strip()
            fighter2 = cols[1].find_all('p')[1].text.strip()

            # Determine the winner from the W/L column based on the example provided.
            winner = None
            result_ps = cols[0].find_all('p')
            # This logic handles the structure seen in the example file.
            if len(result_ps) == 1:
                result_text = result_ps[0].text.strip().lower()
                if 'win' in result_text:
                    # A single 'win' corresponds to the first fighter listed.
                    winner = fighter1
                elif 'draw' in result_text:
                    winner = "Draw"
                elif 'nc' in result_text:
                    winner = "NC"
            # Defensive case, in case the structure has two <p> tags.
            elif len(result_ps) == 2:
                if 'win' in result_ps[0].text.strip().lower():
                    winner = fighter1
                elif 'win' in result_ps[1].text.strip().lower():
                    winner = fighter2
                elif 'draw' in result_ps[0].text.strip().lower():
                    winner = "Draw"
                elif 'nc' in result_ps[0].text.strip().lower():
                    winner = "NC"

            fight = {
                'fighter_1': fighter1,
                'fighter_2': fighter2,
                'winner': winner,
                'weight_class': cols[6].text.strip(),
                'method': ' '.join(cols[7].stripped_strings),
                'round': cols[8].text.strip(),
                'time': cols[9].text.strip(),
                'url': row['data-link']
            }
            fights_to_process.append(fight)

    # Step 2: Scrape the details for all fights in parallel.
    fight_urls = [fight['url'] for fight in fights_to_process]
    completed_fights = []
    if fight_urls:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # executor.map maintains the order of results.
            fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
            for i, details in enumerate(fight_details_list):
                fight_data = fights_to_process[i]
                del fight_data['url']  # Clean up the temporary URL
                fight_data['details'] = details if details else None
                completed_fights.append(fight_data)

    event_details['fights'] = completed_fights
    return event_details


def scrape_all_events(json_path):
    soup = get_soup(BASE_URL)
    events = []

    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row')
                  if row.find('td')]
    total_events = len(event_rows)
    print(f"Found {total_events} events to scrape.")

    for i, row in enumerate(event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue
        event_url = event_link_tag['href']
        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)
            print(f"Progress: {i+1}/{total_events} events scraped.")
            if (i + 1) % 10 == 0:
                print(f"--- Saving progress: {i + 1} of {total_events} events saved. ---")
                with open(json_path, 'w') as f:
                    json.dump(events, f, indent=4)
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")

    return events


def scrape_latest_events(json_path, num_events=5):
    """
    Scrapes only the latest N events from UFC stats.
    This is useful for incremental updates to avoid re-scraping all data.

    Args:
        json_path (str): Path to save the latest events JSON file
        num_events (int): Number of latest events to scrape (default: 5)

    Returns:
        list: List of scraped event data
    """
    soup = get_soup(BASE_URL)
    events = []

    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row')
                  if row.find('td')]

    # Limit to the latest N events (events are ordered chronologically,
    # most recent first).
    latest_event_rows = event_rows[:num_events]
    total_events = len(latest_event_rows)
    print(f"Found {len(event_rows)} total events. Scraping latest {total_events} events.")

    for i, row in enumerate(latest_event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue
        event_url = event_link_tag['href']
        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)
            print(f"Progress: {i+1}/{total_events} latest events scraped.")
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")

    # Persist the results so the function honours its json_path argument
    # (the docstring promises the latest events are saved to this path).
    with open(json_path, 'w') as f:
        json.dump(events, f, indent=4)

    return events