Spaces:
Runtime error
Runtime error
File size: 1,198 Bytes
1349210 ee2e25a 1349210 ee2e25a 1349210 ee2e25a 1349210 ee2e25a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import logging
import re

import requests
from bs4 import BeautifulSoup
# Matches anything shaped like an HTML/XML tag: "<", one or more non-">"
# characters, then ">". Compiled once at import time and reused per call.
TAG_CLEANER = re.compile(r"<[^>]+>")


def clean_text(text) -> str:
    """Strip tag-like markup from *text* and collapse its whitespace.

    Args:
        text: The string to normalise. ``None`` or an empty string yields
            ``""`` instead of raising.

    Returns:
        ``text`` with every ``<...>`` span removed (replaced by nothing, so
        ``"a<br>b"`` becomes ``"ab"``), every run of whitespace replaced by a
        single space, and leading/trailing whitespace trimmed.
    """
    if not text:
        return ""
    without_tags = TAG_CLEANER.sub('', text)
    return re.sub(r'\s+', ' ', without_tags).strip()
def scrape_url(url):
    """Fetch *url* and return its main textual content, with fallbacks.

    The page is requested with an 8-second timeout and parsed with the
    ``lxml`` parser. Text is taken from the first of ``article``, ``main``,
    ``.article-body`` or ``.post-content`` that yields non-empty cleaned
    text. If none does, the text of all ``<p>`` elements is joined, cleaned
    and capped at 5000 characters.

    Args:
        url: Address of the page to retrieve.

    Returns:
        The cleaned text on success. If any ``Exception`` is raised while
        fetching or parsing, it is logged and the string
        ``"<warning sign> Error: Could not retrieve content from <url>"``
        is returned instead of propagating the exception.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    try:
        response = requests.get(url, timeout=8, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Try semantic containers first. A container that exists but holds
        # no text must not short-circuit the remaining fallbacks.
        for selector in ('article', 'main', '.article-body', '.post-content'):
            element = soup.select_one(selector)
            if element is None:
                continue
            text = clean_text(element.get_text())
            if text:
                # NOTE(review): unlike the paragraph fallback below, this
                # path is not capped at 5000 characters - confirm whether
                # callers expect a bounded length here as well.
                return text

        # Fallback: aggregate every paragraph on the page.
        paragraphs = soup.find_all('p')
        content = " ".join(p.get_text().strip() for p in paragraphs)
        return clean_text(content)[:5000]
    except Exception:
        # Best-effort boundary: callers get a message rather than an
        # exception, but the cause is logged instead of silently dropped.
        logging.getLogger(__name__).warning(
            "Could not retrieve content from %s", url, exc_info=True
        )
        # U+26A0 U+FE0F is the warning-sign emoji; escapes keep the literal
        # immune to the encoding damage seen in the original message.
        return f"\u26a0\ufe0f Error: Could not retrieve content from {url}"