File size: 1,198 Bytes
1349210
 
ee2e25a
 
 
 
 
 
 
 
 
 
1349210
 
ee2e25a
1349210
ee2e25a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1349210
ee2e25a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import logging
import re

import requests
from bs4 import BeautifulSoup

# Matches any angle-bracketed tag: "<", one or more non-">" characters, ">"
TAG_CLEANER = re.compile(r"<[^>]+>")

def clean_text(text):
    """Strip tag-like spans from *text* and squeeze its whitespace.

    Everything matched by ``TAG_CLEANER`` is deleted, each run of
    whitespace is replaced by a single space, and leading/trailing
    whitespace is trimmed from the result.
    """
    without_tags = TAG_CLEANER.sub('', text)
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()

def scrape_url(url: str) -> str:
    """Download *url* and return its main textual content.

    Extraction strategy, in order:

    1. The first of ``article``, ``main``, ``.article-body`` and
       ``.post-content`` that yields non-empty cleaned text. That text is
       returned in full.
    2. Otherwise, the text of every ``<p>`` element joined with single
       spaces, cleaned and capped at 5000 characters.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned text, or a message starting with ``"⚠️ Error:"`` when
        the page cannot be fetched or parsed. Failures are logged and
        reported through the return value instead of being raised.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    try:
        response = requests.get(url, timeout=8, headers=request_headers)
        response.raise_for_status()

        # Parse the raw bytes rather than response.text: when the server
        # sends no charset, requests assumes ISO-8859-1 for text/* bodies,
        # which garbles UTF-8 pages. Given bytes, BeautifulSoup can honour a
        # <meta charset> declaration; the HTTP charset is passed as a hint
        # only when the server actually declared one.
        content_type = response.headers.get('Content-Type', '')
        declared_charset = (
            response.encoding if 'charset' in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, 'lxml', from_encoding=declared_charset
        )

        # Try semantic containers first. A container that exists but holds
        # no text must not short-circuit the remaining fallbacks.
        # NOTE(review): text found here is returned uncapped, while the
        # paragraph fallback below is cut to 5000 characters - confirm the
        # asymmetry is intended.
        for selector in ('article', 'main', '.article-body', '.post-content'):
            if (element := soup.select_one(selector)) is None:
                continue
            text = clean_text(element.get_text())
            if text:
                return text

        # Fallback: aggregate every paragraph on the page.
        paragraphs = soup.find_all('p')
        content = " ".join(p.get_text().strip() for p in paragraphs)
        return clean_text(content)[:5000]

    except Exception as exc:
        # Deliberately broad: callers get an error string, never an
        # exception. The cause is logged so failures stay diagnosable.
        logging.getLogger(__name__).warning(
            "scrape_url failed for %s: %s", url, exc
        )
        return f"⚠️ Error: Could not retrieve content from {url}"