import requests
from bs4 import BeautifulSoup
import json
import time
import concurrent.futures

from ..config import EVENTS_JSON_PATH
# --- Configuration ---
# The number of parallel threads to use for scraping fight details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10

# The delay in seconds that each worker sleeps after fetching a fight's
# detail page. The delay is applied per worker, so the aggregate request
# rate can approach MAX_WORKERS / REQUEST_DELAY; tune the two values together.
REQUEST_DELAY = 0.1
# --- End Configuration ---

BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"
def get_soup(url):
    """Fetch url and return its parsed HTML as a BeautifulSoup object."""
    # A timeout prevents one unresponsive request from hanging the scraper.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
    return BeautifulSoup(response.text, 'html.parser')
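
# A retry-enabled variant, included as a reference sketch only; the scraper
# below does not call it. It uses requests' standard Session/HTTPAdapter/Retry
# machinery, and the retry count, backoff factor, and status list are
# illustrative assumptions, not values from the original code.
def get_soup_with_retries(url, retries=3, backoff=0.5):
    """Like get_soup, but retries transient HTTP failures with backoff."""
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retry = Retry(total=retries, backoff_factor=backoff,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')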

def scrape_fight_details(fight_url):
    """Scrape the per-fighter stat tables from a single fight's detail page.

    Returns a dict with 'fighter_1_stats' and 'fighter_2_stats', or None if
    the fight is upcoming or no stats tables are present.
    """
    print(f"  Scraping fight: {fight_url}")
    soup = get_soup(fight_url)

    # On upcoming fight pages, there's a specific div. If it exists, skip.
    if soup.find('div', class_='b-fight-details__content-abbreviated'):
        print(f"  Upcoming fight, no details available: {fight_url}")
        return None

    tables = soup.find_all('table', class_='b-fight-details__table')
    if not tables:
        print(f"  No stats tables found on {fight_url}")
        return None
    fight_details = {"fighter_1_stats": {}, "fighter_2_stats": {}}

    # Helper to extract stats. The stats for both fighters are in <p> tags
    # within a single <td>.
    def extract_stats_from_cell(cell, col_name):
        ps = cell.find_all('p')
        if len(ps) == 2:
            fight_details["fighter_1_stats"][col_name] = ps[0].text.strip()
            fight_details["fighter_2_stats"][col_name] = ps[1].text.strip()

    # --- Totals Table ---
    # The first table contains overall stats.
    totals_table = tables[0]
    totals_tbody = totals_table.find('tbody')
    if totals_tbody:
        totals_row = totals_tbody.find('tr')
        if totals_row:
            totals_cols = totals_row.find_all('td')
            stat_cols = {
                1: 'kd', 2: 'sig_str', 3: 'sig_str_percent', 4: 'total_str',
                5: 'td', 6: 'td_percent', 7: 'sub_att', 8: 'rev', 9: 'ctrl'
            }
            for index, name in stat_cols.items():
                if index < len(totals_cols):
                    extract_stats_from_cell(totals_cols[index], name)
    # --- Significant Strikes Table ---
    # The second table contains significant strike details.
    if len(tables) > 1:
        sig_strikes_table = tables[1]
        sig_strikes_tbody = sig_strikes_table.find('tbody')
        if sig_strikes_tbody:
            sig_strikes_row = sig_strikes_tbody.find('tr')
            if sig_strikes_row:
                sig_strikes_cols = sig_strikes_row.find_all('td')
                stat_cols = {
                    2: 'sig_str_head', 3: 'sig_str_body', 4: 'sig_str_leg',
                    5: 'sig_str_distance', 6: 'sig_str_clinch', 7: 'sig_str_ground'
                }
                for index, name in stat_cols.items():
                    if index < len(sig_strikes_cols):
                        extract_stats_from_cell(sig_strikes_cols[index], name)

    return fight_details
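
# For reference, a successful scrape_fight_details call returns a dict of this
# shape (stat keys come from the stat_cols mappings above; values shown are
# placeholders, not real data):
#
#   {
#       "fighter_1_stats": {"kd": "...", "sig_str": "...", "ctrl": "..."},
#       "fighter_2_stats": {"kd": "...", "sig_str": "...", "ctrl": "..."},
#   }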

def fetch_fight_details_worker(fight_url):
    """
    Worker function for the thread pool. Scrapes details for a single fight
    and applies a delay to be polite to the server.
    """
    try:
        details = scrape_fight_details(fight_url)
        time.sleep(REQUEST_DELAY)
        return details
    except Exception as e:
        print(f"  Could not scrape fight details for {fight_url}: {e}")
        time.sleep(REQUEST_DELAY)  # Also sleep on failure to be safe
        return None

def scrape_event_details(event_url):
    """Scrape an event page: name, date, location, and every fight on the card."""
    print(f"Scraping event: {event_url}")
    soup = get_soup(event_url)
    event_details = {}

    # Extract event name
    event_details['name'] = soup.find('h2', class_='b-content__title').text.strip()

    # Extract event date and location
    info_list = soup.find('ul', class_='b-list__box-list')
    list_items = info_list.find_all('li', class_='b-list__box-list-item')
    event_details['date'] = list_items[0].text.split(':')[1].strip()
    event_details['location'] = list_items[1].text.split(':')[1].strip()
    # Step 1: Gather base info and URLs for all fights on the event page.
    fights_to_process = []
    fight_table = soup.find('table', class_='b-fight-details__table')
    if fight_table:
        rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
        for row in rows:
            cols = row.find_all('td', class_='b-fight-details__table-col')
            fighter1 = cols[1].find_all('p')[0].text.strip()
            fighter2 = cols[1].find_all('p')[1].text.strip()

            # Determine the winner from the W/L column based on the example provided.
            winner = None
            result_ps = cols[0].find_all('p')
            # This logic handles the structure seen in the example file.
            if len(result_ps) == 1:
                result_text = result_ps[0].text.strip().lower()
                if 'win' in result_text:
                    # When one 'win' is present, it corresponds to the first fighter listed.
                    winner = fighter1
                elif 'draw' in result_text:
                    winner = "Draw"
                elif 'nc' in result_text:
                    winner = "NC"
            # Defensive case for when the structure has two <p> tags.
            elif len(result_ps) == 2:
                if 'win' in result_ps[0].text.strip().lower():
                    winner = fighter1
                elif 'win' in result_ps[1].text.strip().lower():
                    winner = fighter2
                elif 'draw' in result_ps[0].text.strip().lower():
                    winner = "Draw"
                elif 'nc' in result_ps[0].text.strip().lower():
                    winner = "NC"

            fight = {
                'fighter_1': fighter1,
                'fighter_2': fighter2,
                'winner': winner,
                'weight_class': cols[6].text.strip(),
                'method': ' '.join(cols[7].stripped_strings),
                'round': cols[8].text.strip(),
                'time': cols[9].text.strip(),
                'url': row['data-link']
            }
            fights_to_process.append(fight)
    # Step 2: Scrape the details for all fights in parallel.
    fight_urls = [fight['url'] for fight in fights_to_process]
    completed_fights = []
    if fight_urls:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # executor.map preserves the order of results, so index i here
            # matches the i-th entry of fights_to_process.
            fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
            for i, details in enumerate(fight_details_list):
                fight_data = fights_to_process[i]
                del fight_data['url']  # Clean up the temporary URL
                fight_data['details'] = details if details else None
                completed_fights.append(fight_data)

    event_details['fights'] = completed_fights
    return event_details
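
# For reference, scrape_event_details returns a dict of this shape (field names
# come from the code above; values shown are placeholders, not real data):
#
#   {
#       "name": "...", "date": "...", "location": "...",
#       "fights": [
#           {"fighter_1": "...", "fighter_2": "...", "winner": "...",
#            "weight_class": "...", "method": "...", "round": "...",
#            "time": "...", "details": {...}},
#       ],
#   }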

def scrape_all_events(json_path):
    """Scrape every completed event on UFC Stats, checkpointing progress to
    json_path every 10 events and again at the end."""
    soup = get_soup(BASE_URL)
    events = []
    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]
    total_events = len(event_rows)
    print(f"Found {total_events} events to scrape.")
    for i, row in enumerate(event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue
        event_url = event_link_tag['href']
        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)
            print(f"Progress: {i+1}/{total_events} events scraped.")
            if (i + 1) % 10 == 0:
                print(f"--- Saving progress: {i + 1} of {total_events} events saved. ---")
                with open(json_path, 'w') as f:
                    json.dump(events, f, indent=4)
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")

    # Final save, so events scraped after the last 10-event checkpoint are kept.
    with open(json_path, 'w') as f:
        json.dump(events, f, indent=4)
    return events

def scrape_latest_events(json_path, num_events=5):
    """
    Scrapes only the latest N events from UFC Stats and saves them to json_path.

    This is useful for incremental updates to avoid re-scraping all data.

    Args:
        json_path (str): Path to save the latest events JSON file.
        num_events (int): Number of latest events to scrape (default: 5).

    Returns:
        list: List of scraped event data.
    """
    soup = get_soup(BASE_URL)
    events = []
    table = soup.find('table', class_='b-statistics__table-events')
    if not table:
        print("Could not find events table on the page.")
        return []

    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]
    # Limit to the latest N events (the listing is in reverse chronological
    # order, most recent first).
    latest_event_rows = event_rows[:num_events]
    total_events = len(latest_event_rows)
    print(f"Found {len(event_rows)} total events. Scraping latest {total_events} events.")
    for i, row in enumerate(latest_event_rows):
        event_link_tag = row.find('a', class_='b-link b-link_style_black')
        if not event_link_tag or not event_link_tag.has_attr('href'):
            continue
        event_url = event_link_tag['href']
        try:
            event_data = scrape_event_details(event_url)
            if event_data:
                events.append(event_data)
            print(f"Progress: {i+1}/{total_events} latest events scraped.")
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")

    # Persist the scraped events to json_path, as described in the docstring.
    with open(json_path, 'w') as f:
        json.dump(events, f, indent=4)
    return events
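
# --- Usage sketch ---
# A minimal, hypothetical entry point. Because of the relative import at the
# top of this module, it must be run as part of its package (for example,
# `python -m <package>.scraper`); the package and module names here are
# assumptions, not part of the original code.
if __name__ == "__main__":
    # Incremental update: scrape and save only the five most recent events.
    latest = scrape_latest_events(EVENTS_JSON_PATH, num_events=5)
    print(f"Scraped {len(latest)} latest events.")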