import concurrent.futures
import json
import time

import requests
from bs4 import BeautifulSoup

from ..config import EVENTS_JSON_PATH

# --- Configuration ---
# The number of parallel threads to use for scraping fight details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10
# The delay in seconds between each request to a fight's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---
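# Back-of-the-envelope throughput (an estimate, not a measurement): each worker
# sleeps REQUEST_DELAY between requests, so the steady-state request rate is
# roughly MAX_WORKERS / (response_time + REQUEST_DELAY). With the defaults
# above and a ~0.5 s response time, that is about 10 / 0.6, i.e. ~16 requests
# per second.
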
BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"


def get_soup(url):
    # A timeout keeps a stalled connection from hanging the scraper forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx/5xx)
    return BeautifulSoup(response.text, 'html.parser')
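
# A possible hardening of get_soup (a sketch, not wired in anywhere below):
# reusing a single requests.Session adds connection pooling across the many
# per-fight requests, and an explicit User-Agent is politer to the server.
# The header string is an illustrative placeholder, not something the project
# defines.
#
#     _session = requests.Session()
#     _session.headers.update({"User-Agent": "ufc-predictor-scraper/0.1"})
#
#     def get_soup(url):
#         response = _session.get(url, timeout=30)
#         response.raise_for_status()
#         return BeautifulSoup(response.text, 'html.parser')
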
def scrape_fight_details(fight_url):
print(f" Scraping fight: {fight_url}")
soup = get_soup(fight_url)
# On upcoming fight pages, there's a specific div. If it exists, skip.
if soup.find('div', class_='b-fight-details__content-abbreviated'):
print(f" Upcoming fight, no details available: {fight_url}")
return None
tables = soup.find_all('table', class_='b-fight-details__table')
if not tables:
print(f" No stats tables found on {fight_url}")
return None
fight_details = {"fighter_1_stats": {}, "fighter_2_stats": {}}
    # Helper to extract stats. The stats for both fighters live in two <p> tags
    # within a single <td>; cells that do not match this shape are skipped.
    def extract_stats_from_cell(cell, col_name):
        ps = cell.find_all('p')
        if len(ps) == 2:
            fight_details["fighter_1_stats"][col_name] = ps[0].text.strip()
            fight_details["fighter_2_stats"][col_name] = ps[1].text.strip()
# --- Totals Table ---
# The first table contains overall stats
totals_table = tables[0]
totals_tbody = totals_table.find('tbody')
if totals_tbody:
totals_row = totals_tbody.find('tr')
if totals_row:
totals_cols = totals_row.find_all('td')
            # Column 0 holds the fighter names, so the stat columns start at 1.
            stat_cols = {
                1: 'kd', 2: 'sig_str', 3: 'sig_str_percent', 4: 'total_str',
                5: 'td', 6: 'td_percent', 7: 'sub_att', 8: 'rev', 9: 'ctrl'
            }
for index, name in stat_cols.items():
if index < len(totals_cols):
extract_stats_from_cell(totals_cols[index], name)
# --- Significant Strikes Table ---
# The second table contains significant strike details
if len(tables) > 1:
sig_strikes_table = tables[1]
sig_strikes_tbody = sig_strikes_table.find('tbody')
if sig_strikes_tbody:
sig_strikes_row = sig_strikes_tbody.find('tr')
if sig_strikes_row:
sig_strikes_cols = sig_strikes_row.find_all('td')
                # Columns 0 and 1 (fighter names and overall sig. strikes) are
                # already covered by the totals table, so start at index 2.
                stat_cols = {
                    2: 'sig_str_head', 3: 'sig_str_body', 4: 'sig_str_leg',
                    5: 'sig_str_distance', 6: 'sig_str_clinch', 7: 'sig_str_ground'
                }
for index, name in stat_cols.items():
if index < len(sig_strikes_cols):
extract_stats_from_cell(sig_strikes_cols[index], name)
return fight_details
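
# Illustrative shape of the dict returned by scrape_fight_details (the stat
# values here are invented, not taken from a real fight):
#
#     {
#         "fighter_1_stats": {"kd": "1", "sig_str": "20 of 40", "ctrl": "2:15", ...},
#         "fighter_2_stats": {"kd": "0", "sig_str": "15 of 33", "ctrl": "0:45", ...},
#     }
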
def fetch_fight_details_worker(fight_url):
"""
Worker function for the thread pool. Scrapes details for a single fight
and applies a delay to be polite to the server.
"""
try:
details = scrape_fight_details(fight_url)
time.sleep(REQUEST_DELAY)
return details
except Exception as e:
print(f" Could not scrape fight details for {fight_url}: {e}")
time.sleep(REQUEST_DELAY) # Also sleep on failure to be safe
return None


def scrape_event_details(event_url):
print(f"Scraping event: {event_url}")
soup = get_soup(event_url)
event_details = {}
# Extract event name
event_details['name'] = soup.find('h2', class_='b-content__title').text.strip()
    # Extract event date and location. The list items read "Date: ..." and
    # "Location: ..."; split on the first colon only, in case the value itself
    # contains one.
    info_list = soup.find('ul', class_='b-list__box-list')
    list_items = info_list.find_all('li', class_='b-list__box-list-item')
    event_details['date'] = list_items[0].text.split(':', 1)[1].strip()
    event_details['location'] = list_items[1].text.split(':', 1)[1].strip()
# Step 1: Gather base info and URLs for all fights on the event page.
fights_to_process = []
fight_table = soup.find('table', class_='b-fight-details__table')
if fight_table:
rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
for row in rows:
cols = row.find_all('td', class_='b-fight-details__table-col')
fighter1 = cols[1].find_all('p')[0].text.strip()
fighter2 = cols[1].find_all('p')[1].text.strip()
            # Determine the winner from the W/L column. Completed fights show a
            # single flag ('win', 'draw', or 'nc'), which by convention refers
            # to the first fighter listed.
            winner = None
            result_ps = cols[0].find_all('p')
            if len(result_ps) == 1:
                result_text = result_ps[0].text.strip().lower()
                if 'win' in result_text:
                    winner = fighter1
                elif 'draw' in result_text:
                    winner = "Draw"
                elif 'nc' in result_text:
                    winner = "NC"
            # Defensive case: some rows render two <p> tags, one per fighter.
            elif len(result_ps) == 2:
                if 'win' in result_ps[0].text.strip().lower():
                    winner = fighter1
                elif 'win' in result_ps[1].text.strip().lower():
                    winner = fighter2
                elif 'draw' in result_ps[0].text.strip().lower():
                    winner = "Draw"
                elif 'nc' in result_ps[0].text.strip().lower():
                    winner = "NC"
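
            # Representative (heavily simplified) W/L cell for a completed
            # fight; the real markup nests the flag inside more elements:
            #
            #     <td class="b-fight-details__table-col">
            #         <p><a class="b-flag"><i class="b-flag__text">win</i></a></p>
            #     </td>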
fight = {
'fighter_1': fighter1,
'fighter_2': fighter2,
'winner': winner,
'weight_class': cols[6].text.strip(),
'method': ' '.join(cols[7].stripped_strings),
'round': cols[8].text.strip(),
'time': cols[9].text.strip(),
'url': row['data-link']
}
fights_to_process.append(fight)
# Step 2: Scrape the details for all fights in parallel.
fight_urls = [fight['url'] for fight in fights_to_process]
completed_fights = []
if fight_urls:
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
# The map function maintains the order of results.
fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
for i, details in enumerate(fight_details_list):
fight_data = fights_to_process[i]
del fight_data['url'] # Clean up the temporary URL
fight_data['details'] = details if details else None
completed_fights.append(fight_data)
event_details['fights'] = completed_fights
return event_details
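
# Illustrative shape of the dict returned by scrape_event_details (the values
# are examples, not scraped output):
#
#     {
#         "name": "UFC 300: Pereira vs. Hill",
#         "date": "April 13, 2024",
#         "location": "Las Vegas, Nevada, USA",
#         "fights": [
#             {"fighter_1": "...", "fighter_2": "...", "winner": "...",
#              "weight_class": "...", "method": "...", "round": "...",
#              "time": "...", "details": {...}},
#             ...
#         ]
#     }
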
def scrape_all_events(json_path):
soup = get_soup(BASE_URL)
events = []
table = soup.find('table', class_='b-statistics__table-events')
if not table:
print("Could not find events table on the page.")
return []
event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]
total_events = len(event_rows)
print(f"Found {total_events} events to scrape.")
for i, row in enumerate(event_rows):
event_link_tag = row.find('a', class_='b-link b-link_style_black')
if not event_link_tag or not event_link_tag.has_attr('href'):
continue
event_url = event_link_tag['href']
try:
event_data = scrape_event_details(event_url)
if event_data:
events.append(event_data)
print(f"Progress: {i+1}/{total_events} events scraped.")
if (i + 1) % 10 == 0:
print(f"--- Saving progress: {i + 1} of {total_events} events saved. ---")
with open(json_path, 'w') as f:
json.dump(events, f, indent=4)
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")
    # Final save, so events scraped since the last checkpoint are not lost.
    with open(json_path, 'w') as f:
        json.dump(events, f, indent=4)
    return events


def scrape_latest_events(json_path, num_events=5):
"""
Scrapes only the latest N events from UFC stats.
This is useful for incremental updates to avoid re-scraping all data.
Args:
json_path (str): Path to save the latest events JSON file
num_events (int): Number of latest events to scrape (default: 5)
Returns:
list: List of scraped event data
"""
soup = get_soup(BASE_URL)
events = []
table = soup.find('table', class_='b-statistics__table-events')
if not table:
print("Could not find events table on the page.")
return []
event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]
    # Limit to the latest N events (the page lists events in reverse
    # chronological order, most recent first).
    latest_event_rows = event_rows[:num_events]
total_events = len(latest_event_rows)
print(f"Found {len(event_rows)} total events. Scraping latest {total_events} events.")
for i, row in enumerate(latest_event_rows):
event_link_tag = row.find('a', class_='b-link b-link_style_black')
if not event_link_tag or not event_link_tag.has_attr('href'):
continue
event_url = event_link_tag['href']
try:
event_data = scrape_event_details(event_url)
if event_data:
events.append(event_data)
print(f"Progress: {i+1}/{total_events} latest events scraped.")
        except Exception as e:
            print(f"Could not process event {event_url}. Error: {e}")
    # Save to the given path, as the docstring promises; without this the
    # json_path argument would go unused.
    with open(json_path, 'w') as f:
        json.dump(events, f, indent=4)
    return events
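

# A minimal usage sketch (an assumption, not part of the original script): the
# relative config import means this file must run as a module, e.g.
# `python -m src.scrape.scrape_fights` from the project root. scrape_all_events
# checkpoints to disk as it goes and writes a final save before returning.
if __name__ == "__main__":
    all_events = scrape_all_events(EVENTS_JSON_PATH)
    print(f"Finished: scraped {len(all_events)} events to {EVENTS_JSON_PATH}.")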