import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional

import requests
import trafilatura

logger = logging.getLogger(__name__)


class BaseScraper(ABC):
    """Base class for all scrapers.

    Subclasses implement :meth:`parse_content`. :meth:`scrape` runs the
    fetch -> extract -> parse pipeline and wraps the outcome in a result
    dict that always carries ``"success"`` and ``"url"``.
    """

    def __init__(self, timeout: int = 30):
        """Store the request settings on the instance.

        Args:
            timeout: Timeout value kept on ``self.timeout``. Note that
                :meth:`fetch_url` does not forward it to trafilatura, so it
                only takes effect where a subclass uses it explicitly.
        """
        self.timeout = timeout
        # Browser-like request headers. No method of this class reads them;
        # they are kept on the instance for code that issues its own requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }

    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch content from a URL using trafilatura.

        Args:
            url: Address of the page to download.

        Returns:
            Whatever ``trafilatura.fetch_url`` returned when it is truthy,
            otherwise ``None``. Any exception is logged and also yields
            ``None``.
        """
        try:
            logger.debug("Fetching URL: %s", url)
            # Note: trafilatura.fetch_url doesn't accept a timeout parameter
            # directly, so self.timeout is not applied here.
            downloaded = trafilatura.fetch_url(url)
        except Exception as e:
            logger.error("Error fetching URL %s: %s", url, e)
            return None

        if not downloaded:
            logger.error("Failed to download content from %s", url)
            return None
        return downloaded

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract the main text content from HTML.

        Args:
            html_content: Raw HTML to process.

        Returns:
            The result of ``trafilatura.extract``, or ``None`` when the input
            is empty or extraction raises.
        """
        if not html_content:
            return None
        try:
            return trafilatura.extract(html_content)
        except Exception as e:
            logger.error("Error extracting text: %s", e)
            return None

    def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch, extract and parse a URL.

        Args:
            url: Address of the page to scrape.

        Returns:
            On success, the dict returned by :meth:`parse_content` with
            ``"success": True``, ``"url"`` and ``"text_content"`` set on it.
            On failure, ``{"success": False, "error": <message>, "url": url}``.
            This method does not raise; exceptions are logged and reported
            through the ``"error"`` field.
        """
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                return {"success": False, "error": "Failed to fetch URL", "url": url}

            text_content = self.extract_text(html_content)
            if not text_content:
                return {
                    "success": False,
                    "error": "Failed to extract text content",
                    "url": url,
                }

            result = self.parse_content(html_content, text_content, url)
            if result is None:
                # A subclass that forgets to return its dict would otherwise
                # surface as an opaque "NoneType ... item assignment" error.
                logger.error("parse_content returned no data for %s", url)
                return {
                    "success": False,
                    "error": "parse_content returned no data",
                    "url": url,
                }

            # These keys are set last, so they override same-named keys
            # produced by parse_content.
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content
            return result
        except Exception as e:
            # Top-level boundary: report the failure instead of raising.
            logger.error("Error in scraping %s: %s", url, e)
            return {"success": False, "error": str(e), "url": url}

    @abstractmethod
    def parse_content(
        self, html_content: str, text_content: str, url: str
    ) -> Dict[str, Any]:
        """Parse the content and extract structured data.

        Args:
            html_content: Raw HTML as returned by :meth:`fetch_url`.
            text_content: Main text as returned by :meth:`extract_text`.
            url: Address the content was fetched from.

        Returns:
            A mutable dict of extracted fields; :meth:`scrape` adds
            ``"success"``, ``"url"`` and ``"text_content"`` to it.
        """