import requests
from bs4 import BeautifulSoup
import json
import time
import concurrent.futures

from ..config import EVENTS_JSON_PATH

# --- Configuration ---
# The number of parallel threads to use for scraping fight details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10
# The delay in seconds between each request to a fight's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---

BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"


def get_soup(url):
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes.
    return BeautifulSoup(response.text, 'html.parser')
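
# Optional hardening for get_soup: a sketch only, assuming you want retries,
# a request timeout, and an explicit User-Agent. The scraper below does not
# depend on it; the header value and backoff schedule are illustrative choices.
def get_soup_with_retries(url, retries=3, timeout=10):
    session = requests.Session()
    session.headers.update({'User-Agent': 'ufc-stats-scraper/0.1'})
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # Out of retries; surface the original error.
            time.sleep(2 ** attempt)  # Simple exponential backoff.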

def scrape_fight_details(fight_url):
    print(f"  Scraping fight: {fight_url}")
    soup = get_soup(fight_url)

    # On upcoming fight pages, there's a specific div. If it exists, skip.
    if soup.find('div', class_='b-fight-details__content-abbreviated'):
        print(f"  Upcoming fight, no details available: {fight_url}")
        return None

    tables = soup.find_all('table', class_='b-fight-details__table')
    if not tables:
        print(f"  No stats tables found on {fight_url}")
        return None

    fight_details = {"fighter_1_stats": {}, "fighter_2_stats": {}}

    # Helper to extract stats. The stats for both fighters are in <p> tags
    # within a single <td> tag, so each column yields one value per fighter,
    # keyed by the column header.
    def extract_stats(table):
        headers = [th.text.strip() for th in table.find_all('th')]
        body = table.find('tbody')
        data_row = body.find('tr') if body else None
        if not data_row:
            return
        for header, cell in zip(headers, data_row.find_all('td')):
            values = [p.text.strip() for p in cell.find_all('p')]
            if len(values) == 2:
                fight_details["fighter_1_stats"][header] = values[0]
                fight_details["fighter_2_stats"][header] = values[1]

    # The first table on the page holds the fight totals.
    extract_stats(tables[0])
    return fight_details


def fetch_fight_details_worker(fight_url):
    # Thread-pool worker: pause briefly before each request (politeness),
    # then scrape. Returning None on failure lets the caller keep going.
    time.sleep(REQUEST_DELAY)
    try:
        return scrape_fight_details(fight_url)
    except Exception as e:
        print(f"  Could not scrape fight {fight_url}. Error: {e}")
        return None
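
# The event scraper below fans fight URLs out to a thread pool. A minimal
# sketch of that pattern in isolation, assuming you only want (url, details)
# pairs back; the helper name is illustrative, not part of the pipeline.
def scrape_fights_in_order(fight_urls):
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Unlike as_completed, executor.map yields results in input order.
        return list(zip(fight_urls, executor.map(fetch_fight_details_worker, fight_urls)))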

def scrape_event_details(event_url):
    soup = get_soup(event_url)

    # The event name sits in the page title; the first info list item holds the date.
    name_span = soup.find('span', class_='b-content__title-highlight')
    date_li = soup.find('li', class_='b-list__box-list-item')
    event_details = {
        'name': name_span.text.strip() if name_span else None,
        'date': date_li.text.replace('Date:', '').strip() if date_li else None,
    }

    # Step 1: Collect the fights listed on the event page.
    fights_to_process = []
    for row in soup.find_all('tr', class_='b-fight-details__table-row'):
        # Only rows with a data-link attribute point at a fight detail page;
        # this also skips the header row.
        if not row.has_attr('data-link'):
            continue
        cols = row.find_all('td')
        if len(cols) < 10:
            continue

        # Fighter names are the two <p> tags in the second column.
        fighter_ps = cols[1].find_all('p')
        fighter1 = fighter_ps[0].text.strip()
        fighter2 = fighter_ps[1].text.strip()

        # The W/L column holds one or two <p> flags ("win", "draw", "nc").
        result_ps = cols[0].find_all('p')
        winner = None
        if len(result_ps) == 1:
            flag = result_ps[0].text.strip().lower()
            if 'draw' in flag:
                winner = "Draw"
            elif 'nc' in flag:
                winner = "NC"
        elif len(result_ps) == 2:
            if 'win' in result_ps[0].text.strip().lower():
                winner = fighter1
            elif 'win' in result_ps[1].text.strip().lower():
                winner = fighter2
            elif 'draw' in result_ps[0].text.strip().lower():
                winner = "Draw"
            elif 'nc' in result_ps[0].text.strip().lower():
                winner = "NC"

        fight = {
            'fighter_1': fighter1,
            'fighter_2': fighter2,
            'winner': winner,
            'weight_class': cols[6].text.strip(),
            'method': ' '.join(cols[7].stripped_strings),
            'round': cols[8].text.strip(),
            'time': cols[9].text.strip(),
            'url': row['data-link']
        }
        fights_to_process.append(fight)

    # Step 2: Scrape the details for all fights in parallel.
    fight_urls = [fight['url'] for fight in fights_to_process]
    completed_fights = []
    if fight_urls:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # The map function maintains the order of results.
            fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
            for i, details in enumerate(fight_details_list):
                fight_data = fights_to_process[i]
                del fight_data['url']  # Clean up the temporary URL.
                fight_data['details'] = details if details else None
                completed_fights.append(fight_data)

    event_details['fights'] = completed_fights
    return event_details


def scrape_all_events(json_path):
    soup = get_soup(BASE_URL)
    events = []

    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row')
                  if row.find('td')]
    total_events = len(event_rows)
    print(f"Found {total_events} events to scrape.")

    for i, row in enumerate(event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue
        event_url = event_link_tag['href']
        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)
            print(f"Progress: {i+1}/{total_events} events scraped.")
            if (i + 1) % 10 == 0:
                print(f"--- Saving progress: {i + 1} of {total_events} events saved. ---")
                with open(json_path, 'w') as f:
                    json.dump(events, f, indent=4)
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")

    return events


def scrape_latest_events(json_path, num_events=5):
    """
    Scrapes only the latest N events from UFC stats.
    This is useful for incremental updates to avoid re-scraping all data.

    Args:
        json_path (str): Path to save the latest events JSON file.
        num_events (int): Number of latest events to scrape (default: 5).

    Returns:
        list: List of scraped event data.
    """
    soup = get_soup(BASE_URL)
    events = []

    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row')
                  if row.find('td')]

    # Limit to the latest N events (the table is in reverse-chronological
    # order, most recent first).
    latest_event_rows = event_rows[:num_events]
    total_events = len(latest_event_rows)
    print(f"Found {len(event_rows)} total events. Scraping latest {total_events} events.")

    for i, row in enumerate(latest_event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue
        event_url = event_link_tag['href']
        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)
            print(f"Progress: {i+1}/{total_events} latest events scraped.")
        except Exception as e:
Error: {e}") return events