import logging
import re

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Clean HTML tags
TAG_CLEANER = re.compile(r"<[^>]+>")

# Runs of any whitespace (spaces, tabs, newlines) are collapsed to one space.
_WHITESPACE = re.compile(r"\s+")

# Selectors tried in order before falling back to <p> aggregation.
_CONTENT_SELECTORS = ('article', 'main', '.article-body', '.post-content')

# Seconds to wait for the HTTP request before giving up.
_REQUEST_TIMEOUT = 8

# Length cap applied to the paragraph-aggregation fallback only.
_FALLBACK_MAX_CHARS = 5000

_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
    'Accept-Language': 'en-US,en;q=0.9',
}


def clean_text(text: str) -> str:
    """Strip tag-like spans and normalize whitespace.

    Every ``<...>`` span matched by ``TAG_CLEANER`` is removed, then each
    run of whitespace is collapsed to a single space and the result is
    stripped of leading/trailing whitespace.

    Note that the tag pattern is purely textual: literal text such as
    ``a < b and c > d`` also contains a ``<...>`` span and loses it.

    Args:
        text: Raw text, possibly containing HTML tags.

    Returns:
        The cleaned, single-spaced text (``""`` for blank input).
    """
    without_tags = TAG_CLEANER.sub('', text)
    return _WHITESPACE.sub(' ', without_tags).strip()


def scrape_url(url: str) -> str:
    """Fetch *url* and return its main textual content.

    Extraction strategy, in order:

    1. The first of ``article``, ``main``, ``.article-body``,
       ``.post-content`` that yields non-empty text; returned in full.
    2. Otherwise the text of all ``<p>`` elements joined with spaces,
       truncated to 5000 characters.

    This function is best-effort and never raises: any failure (network
    error, HTTP error status, parser problem) is logged and reported to
    the caller as an error string.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or a string starting with ``"⚠️ Error:"`` when
        the content could not be retrieved.
    """
    try:
        response = requests.get(
            url, timeout=_REQUEST_TIMEOUT, headers=_REQUEST_HEADERS
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # Try semantic tags first
        for selector in _CONTENT_SELECTORS:
            element = soup.select_one(selector)
            if element is None:
                continue
            text = clean_text(element.get_text())
            # An empty container (e.g. a placeholder <main></main>) should
            # not end the search; keep looking so the fallbacks get a chance.
            if text:
                return text

        # Fallback to paragraph aggregation
        paragraphs = soup.find_all('p')
        content = " ".join(p.get_text().strip() for p in paragraphs)
        return clean_text(content)[:_FALLBACK_MAX_CHARS]

    except Exception as exc:
        # Deliberately broad: callers rely on getting a string back rather
        # than an exception. Log the cause so the failure is diagnosable.
        logger.warning("Could not retrieve content from %s: %s", url, exc)
        return f"⚠️ Error: Could not retrieve content from {url}"