import requests
from bs4 import BeautifulSoup
import json
import time
import string
import concurrent.futures
import os

from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR

# --- Configuration ---
# The number of parallel threads to use for scraping fighter details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10

# The delay in seconds between each request to a fighter's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---

BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"


def get_soup(url):
    """Fetches and parses a URL into a BeautifulSoup object."""
    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
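

# Optional variant (a sketch, not used by the rest of this script): a thin
# retry wrapper around get_soup for flaky connections. The retry count and
# backoff values are illustrative assumptions, not part of the original code.
def get_soup_with_retries(url, retries=3, backoff=2.0):
    """Calls get_soup up to `retries` times, sleeping longer between attempts."""
    for attempt in range(retries):
        soup = get_soup(url)
        if soup is not None:
            return soup
        time.sleep(backoff * (attempt + 1))
    return None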


def scrape_fighter_details(fighter_url):
    """Scrapes detailed statistics for a single fighter from their page."""
    print(f" Scraping fighter details from: {fighter_url}")
    soup = get_soup(fighter_url)
    if not soup:
        return None

    details = {}
    # Career stats are usually in a list format on the fighter's page.
    # This finds all list items within the career statistics div and extracts the data.
    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
    if career_stats_div:
        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
        for item in stats_list:
            text = item.text.strip()
            if ":" in text:
                parts = text.split(":", 1)
                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
                value = parts[1].strip()
                details[key] = value
    return details
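
# Note: the keys produced above are derived from the career-statistics labels on
# ufcstats.com (e.g. "SLpM:", "Str. Acc.:", "TD Avg.:"); after the lower-case /
# underscore / dot-stripping transform they typically come out as 'slpm',
# 'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg'. The
# exact set depends on the live page and may change if the site's markup changes.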


def process_fighter(fighter_data):
    """
    Worker function for the thread pool. Scrapes details for a single fighter,
    updates the dictionary, and applies a delay.
    """
    fighter_url = fighter_data['url']
    try:
        details = scrape_fighter_details(fighter_url)
        if details:
            fighter_data.update(details)
    except Exception as e:
        print(f" Could not scrape details for {fighter_url}: {e}")
    time.sleep(REQUEST_DELAY)
    return fighter_data


def scrape_all_fighters(json_path):
    """Scrapes all fighters from a-z pages using parallel processing."""
    # Step 1: Sequentially scrape all fighter list pages. This is fast.
    initial_fighter_list = []
    alphabet = string.ascii_lowercase

    print("--- Step 1: Collecting basic fighter info from all list pages ---")
    for char in alphabet:
        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
        print(f"Scanning page: {page_url}")
        soup = get_soup(page_url)
        if not soup:
            continue

        table = soup.find('table', class_='b-statistics__table')
        if not table:
            print(f"Could not find fighters table on page {page_url}")
            continue

        # Skip the first row of the table body, which carries no fighter data.
        fighter_rows = table.find('tbody').find_all('tr')[1:]
        if not fighter_rows:
            continue

        for row in fighter_rows:
            cols = row.find_all('td')
            if len(cols) < 11:
                continue
            fighter_link_tag = cols[0].find('a')
            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
                continue
            initial_fighter_list.append({
                'first_name': cols[0].text.strip(),
                'last_name': cols[1].text.strip(),
                'nickname': cols[2].text.strip(),
                'height': cols[3].text.strip(),
                'weight_lbs': cols[4].text.strip(),
                'reach_in': cols[5].text.strip(),
                'stance': cols[6].text.strip(),
                'wins': cols[7].text.strip(),
                'losses': cols[8].text.strip(),
                'draws': cols[9].text.strip(),
                'belt': bool(cols[10].find('img')),
                'url': fighter_link_tag['href']
            })
print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---") | |
fighters_with_details = [] | |
total_fighters = len(initial_fighter_list) | |
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: | |
results = executor.map(process_fighter, initial_fighter_list) | |
for i, fighter_data in enumerate(results): | |
fighters_with_details.append(fighter_data) | |
print(f"Progress: {i + 1}/{total_fighters} fighters scraped.") | |
if (i + 1) > 0 and (i + 1) % 50 == 0: | |
fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name'])) | |
with open(json_path, 'w') as f: | |
json.dump(fighters_with_details, f, indent=4) | |

    # Final sort and write-out; the checkpoint above only fires every 50 fighters.
    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
    with open(json_path, 'w') as f:
        json.dump(fighters_with_details, f, indent=4)
    return fighters_with_details
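

# A minimal usage sketch. Because of the relative import at the top, this module
# has to be run as part of its package (e.g. `python -m <package>.<this_module>`,
# where the package path is an assumption not shown here), not as a loose script.
if __name__ == "__main__":
    all_fighters = scrape_all_fighters(FIGHTERS_JSON_PATH)
    print(f"Done. Scraped {len(all_fighters)} fighters into {FIGHTERS_JSON_PATH}")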