import requests
from bs4 import BeautifulSoup
import json
import time
import string
import concurrent.futures
import os
from ..config import FIGHTERS_JSON_PATH, OUTPUT_DIR

# --- Configuration ---
# The number of parallel threads to use for scraping fighter details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10
# The delay in seconds between each request to a fighter's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---

# Base listing URL; the per-letter pages scraped below follow the same pattern.
BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"

def get_soup(url):
    """Fetches and parses a URL into a BeautifulSoup object."""
    try:
        # A request timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def scrape_fighter_details(fighter_url):
    """Scrapes detailed statistics for a single fighter from their page."""
    print(f"  Scraping fighter details from: {fighter_url}")
    soup = get_soup(fighter_url)
    if not soup:
        return None

    details = {}
    
    # Career stats are usually in a list format on the fighter's page.
    # This finds all list items within the career statistics div and extracts the data.
    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
    if career_stats_div:
        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
        for item in stats_list:
            text = item.text.strip()
            if ":" in text:
                parts = text.split(":", 1)
                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
                value = parts[1].strip()
                details[key] = value
                
    return details

def process_fighter(fighter_data):
    """
    Worker function for the thread pool. Scrapes details for a single fighter,
    updates the dictionary, and applies a delay.
    """
    fighter_url = fighter_data['url']
    try:
        details = scrape_fighter_details(fighter_url)
        if details:
            fighter_data.update(details)
    except Exception as e:
        print(f"    Could not scrape details for {fighter_url}: {e}")
    
    time.sleep(REQUEST_DELAY)
    return fighter_data

def scrape_all_fighters(json_path):
    """Scrapes all fighters from a-z pages using parallel processing."""
    
    # Step 1: Sequentially scrape all fighter list pages. This is fast.
    initial_fighter_list = []
    alphabet = string.ascii_lowercase
    print("--- Step 1: Collecting basic fighter info from all list pages ---")
    for char in alphabet:
        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
        print(f"Scanning page: {page_url}")

        soup = get_soup(page_url)
        if not soup:
            continue

        table = soup.find('table', class_='b-statistics__table')
        if not table:
            print(f"Could not find fighters table on page {page_url}")
            continue

        fighter_rows = table.find('tbody').find_all('tr')[1:]
        if not fighter_rows:
            continue
            
        for row in fighter_rows:
            cols = row.find_all('td')
            if len(cols) < 11:
                continue

            fighter_link_tag = cols[0].find('a')
            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
                continue
            
            initial_fighter_list.append({
                'first_name': cols[0].text.strip(),
                'last_name': cols[1].text.strip(),
                'nickname': cols[2].text.strip(),
                'height': cols[3].text.strip(),
                'weight_lbs': cols[4].text.strip(),
                'reach_in': cols[5].text.strip(),
                'stance': cols[6].text.strip(),
                'wins': cols[7].text.strip(),
                'losses': cols[8].text.strip(),
                'draws': cols[9].text.strip(),
                'belt': bool(cols[10].find('img')),
                'url': fighter_link_tag['href']
            })

    print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---")
    fighters_with_details = []
    total_fighters = len(initial_fighter_list)

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results = executor.map(process_fighter, initial_fighter_list)
        
        for i, fighter_data in enumerate(results):
            fighters_with_details.append(fighter_data)
            print(f"Progress: {i + 1}/{total_fighters} fighters scraped.")

            # Checkpoint every 50 fighters so progress is not lost on a crash.
            if (i + 1) % 50 == 0:
                fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
                with open(json_path, 'w') as f:
                    json.dump(fighters_with_details, f, indent=4)
                
    # Final sort and save, so the fighters scraped since the last checkpoint
    # are persisted as well.
    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
    with open(json_path, 'w') as f:
        json.dump(fighters_with_details, f, indent=4)
    return fighters_with_details
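

if __name__ == "__main__":
    # Example entry point for running the scraper directly. This assumes the
    # module is executed as part of its package (the relative `..config`
    # import requires `python -m <package>.<module>`) and that OUTPUT_DIR is
    # the directory that should hold FIGHTERS_JSON_PATH.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    fighters = scrape_all_fighters(FIGHTERS_JSON_PATH)
    print(f"Scraped {len(fighters)} fighters; saved to {FIGHTERS_JSON_PATH}")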