import requests
from bs4 import BeautifulSoup
import json
import time
import string
import concurrent.futures
import os

from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR

# --- Configuration ---
# The number of parallel threads to use for scraping fighter details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10

# The delay in seconds between each request to a fighter's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---

BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"


def get_soup(url):
    """Fetches and parses a URL into a BeautifulSoup object."""
    try:
        # A timeout keeps a hung connection from stalling a worker thread.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def scrape_fighter_details(fighter_url):
    """Scrapes detailed statistics for a single fighter from their page."""
    print(f"  Scraping fighter details from: {fighter_url}")
    soup = get_soup(fighter_url)
    if not soup:
        return None

    details = {}
    # Career stats are presented as a list on the fighter's page. This finds
    # all list items within the career statistics div and extracts each
    # "Label: value" pair, normalizing the label into a snake_case key.
    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
    if career_stats_div:
        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
        for item in stats_list:
            text = item.text.strip()
            if ":" in text:
                key, value = text.split(":", 1)
                key = key.strip().lower().replace(' ', '_').replace('.', '')
                details[key] = value.strip()
    return details


def process_fighter(fighter_data):
    """
    Worker function for the thread pool. Scrapes details for a single
    fighter, merges them into the fighter's dictionary, and applies the
    politeness delay before returning.
    """
    fighter_url = fighter_data['url']
    try:
        details = scrape_fighter_details(fighter_url)
        if details:
            fighter_data.update(details)
    except Exception as e:
        print(f"  Could not scrape details for {fighter_url}: {e}")
    time.sleep(REQUEST_DELAY)
    return fighter_data


def scrape_all_fighters(json_path):
    """Scrapes all fighters from the a-z list pages using parallel processing."""
    # Step 1: Sequentially scrape all fighter list pages. This is fast.
    initial_fighter_list = []
    alphabet = string.ascii_lowercase

    print("--- Step 1: Collecting basic fighter info from all list pages ---")
    for char in alphabet:
        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
        print(f"Scanning page: {page_url}")
        soup = get_soup(page_url)
        if not soup:
            continue

        table = soup.find('table', class_='b-statistics__table')
        if not table:
            print(f"Could not find fighters table on page {page_url}")
            continue

        tbody = table.find('tbody')
        if not tbody:
            continue
        # Skip the first row, which is an empty spacer on these pages.
        fighter_rows = tbody.find_all('tr')[1:]
        if not fighter_rows:
            continue

        for row in fighter_rows:
            cols = row.find_all('td')
            if len(cols) < 11:
                continue
            fighter_link_tag = cols[0].find('a')
            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
                continue
            initial_fighter_list.append({
                'first_name': cols[0].text.strip(),
                'last_name': cols[1].text.strip(),
                'nickname': cols[2].text.strip(),
                'height': cols[3].text.strip(),
                'weight_lbs': cols[4].text.strip(),
                'reach_in': cols[5].text.strip(),
                'stance': cols[6].text.strip(),
                'wins': cols[7].text.strip(),
                'losses': cols[8].text.strip(),
                'draws': cols[9].text.strip(),
                'belt': bool(cols[10].find('img')),
                'url': fighter_link_tag['href']
            })

    print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters "
          f"in parallel (using up to {MAX_WORKERS} workers) ---")
    fighters_with_details = []
    total_fighters = len(initial_fighter_list)

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # executor.map yields results in input order, so the progress counter
        # and the periodic checkpoints below stay deterministic.
        results = executor.map(process_fighter, initial_fighter_list)
        for i, fighter_data in enumerate(results):
            fighters_with_details.append(fighter_data)
            print(f"Progress: {i + 1}/{total_fighters} fighters scraped.")
            # Checkpoint every 50 fighters so a crash doesn't lose the run.
            if (i + 1) % 50 == 0:
                fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
                with open(json_path, 'w') as f:
                    json.dump(fighters_with_details, f, indent=4)

    # Final sort and write, so the file on disk reflects the complete run
    # rather than only the last 50-fighter checkpoint.
    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
    with open(json_path, 'w') as f:
        json.dump(fighters_with_details, f, indent=4)
    return fighters_with_details
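

# A minimal entry-point sketch showing how this scraper might be invoked
# directly. It assumes FIGHTERS_JSON_PATH points to a file inside OUTPUT_DIR
# (both imported from ..config above); adjust if your config lays paths out
# differently. Because this module uses a relative import, run it as a
# package module (python -m <package>.<module>) rather than as a bare file.
if __name__ == "__main__":
    # Ensure the output directory exists before the periodic checkpoint
    # writes inside scrape_all_fighters try to open the file.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    fighters = scrape_all_fighters(FIGHTERS_JSON_PATH)
    print(f"Done. Scraped {len(fighters)} fighters to {FIGHTERS_JSON_PATH}.")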