import requests
from bs4 import BeautifulSoup
import json
import time
import string
import concurrent.futures
import os

from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR

# --- Configuration ---
# The number of parallel threads to use for scraping fighter details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10

# The delay in seconds between each request to a fighter's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---

BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"


def get_soup(url):
    """Fetches and parses a URL into a BeautifulSoup object."""
    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
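

# Optional variant (a sketch, not used by the rest of this script): a thin
# retry wrapper around get_soup for flaky connections. The retry count and
# backoff values are illustrative assumptions, not part of the original code.
def get_soup_with_retries(url, retries=3, backoff=2.0):
    """Calls get_soup up to `retries` times, sleeping longer between attempts."""
    for attempt in range(retries):
        soup = get_soup(url)
        if soup is not None:
            return soup
        time.sleep(backoff * (attempt + 1))
    return None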


def scrape_fighter_details(fighter_url):
    """Scrapes detailed statistics for a single fighter from their page."""
    print(f" Scraping fighter details from: {fighter_url}")
    soup = get_soup(fighter_url)
    if not soup:
        return None

    details = {}
    # Career stats are usually in a list format on the fighter's page.
    # This finds all list items within the career statistics div and extracts the data.
    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
    if career_stats_div:
        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
        for item in stats_list:
            text = item.text.strip()
            if ":" in text:
                parts = text.split(":", 1)
                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
                value = parts[1].strip()
                details[key] = value
    return details
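
# Note: the keys produced above are derived from the career-statistics labels on
# ufcstats.com (e.g. "SLpM:", "Str. Acc.:", "TD Avg.:"); after the lower-case /
# underscore / dot-stripping transform they typically come out as 'slpm',
# 'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg'. The
# exact set depends on the live page and may change if the site's markup changes.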


def process_fighter(fighter_data):
    """
    Worker function for the thread pool. Scrapes details for a single fighter,
    updates the dictionary, and applies a delay.
    """
    fighter_url = fighter_data['url']
    try:
        details = scrape_fighter_details(fighter_url)
        if details:
            fighter_data.update(details)
    except Exception as e:
        print(f" Could not scrape details for {fighter_url}: {e}")
    time.sleep(REQUEST_DELAY)
    return fighter_data


def scrape_all_fighters(json_path):
    """Scrapes all fighters from a-z pages using parallel processing."""
    # Step 1: Sequentially scrape all fighter list pages. This is fast.
    initial_fighter_list = []
    alphabet = string.ascii_lowercase

    print("--- Step 1: Collecting basic fighter info from all list pages ---")
    for char in alphabet:
        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
        print(f"Scanning page: {page_url}")
        soup = get_soup(page_url)
        if not soup:
            continue

        table = soup.find('table', class_='b-statistics__table')
        if not table:
            print(f"Could not find fighters table on page {page_url}")
            continue

        # Skip the first row of the table body, which carries no fighter data.
        fighter_rows = table.find('tbody').find_all('tr')[1:]
        if not fighter_rows:
            continue

        for row in fighter_rows:
            cols = row.find_all('td')
            if len(cols) < 11:
                continue
            fighter_link_tag = cols[0].find('a')
            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
                continue
            initial_fighter_list.append({
                'first_name': cols[0].text.strip(),
                'last_name': cols[1].text.strip(),
                'nickname': cols[2].text.strip(),
                'height': cols[3].text.strip(),
                'weight_lbs': cols[4].text.strip(),
                'reach_in': cols[5].text.strip(),
                'stance': cols[6].text.strip(),
                'wins': cols[7].text.strip(),
                'losses': cols[8].text.strip(),
                'draws': cols[9].text.strip(),
                'belt': bool(cols[10].find('img')),
                'url': fighter_link_tag['href']
            })
print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---") | |
fighters_with_details = [] | |
total_fighters = len(initial_fighter_list) | |
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: | |
results = executor.map(process_fighter, initial_fighter_list) | |
for i, fighter_data in enumerate(results): | |
fighters_with_details.append(fighter_data) | |
print(f"Progress: {i + 1}/{total_fighters} fighters scraped.") | |
if (i + 1) > 0 and (i + 1) % 50 == 0: | |
fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name'])) | |
with open(json_path, 'w') as f: | |
json.dump(fighters_with_details, f, indent=4) | |

    # Final sort and write-out; the checkpoint above only fires every 50 fighters.
    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
    with open(json_path, 'w') as f:
        json.dump(fighters_with_details, f, indent=4)
    return fighters_with_details
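

# A minimal usage sketch. Because of the relative import at the top, this module
# has to be run as part of its package (e.g. `python -m <package>.<this_module>`,
# where the package path is an assumption not shown here), not as a loose script.
if __name__ == "__main__":
    all_fighters = scrape_all_fighters(FIGHTERS_JSON_PATH)
    print(f"Done. Scraped {len(all_fighters)} fighters into {FIGHTERS_JSON_PATH}")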