import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional

import trafilatura

logger = logging.getLogger(__name__)

class BaseScraper(ABC):
    """Base class for all scrapers."""

    def __init__(self, timeout: int = 30):
        self.timeout = timeout
        # Browser-like headers, available to subclasses that fetch pages
        # themselves; trafilatura.fetch_url manages its own request headers.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }

    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch raw content from a URL using trafilatura."""
        try:
            logger.debug(f"Fetching URL: {url}")
            # trafilatura.fetch_url does not accept a timeout argument
            # directly; the download timeout is controlled via trafilatura's
            # settings config (e.g., DOWNLOAD_TIMEOUT).
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                logger.error(f"Failed to download content from {url}")
                return None
            return downloaded
        except Exception as e:
            logger.error(f"Error fetching URL {url}: {str(e)}")
            return None

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract the main text content from HTML."""
        try:
            if not html_content:
                return None
            return trafilatura.extract(html_content)
        except Exception as e:
            logger.error(f"Error extracting text: {str(e)}")
            return None

    def scrape(self, url: str) -> Dict[str, Any]:
        """Template method: fetch the URL, extract its text, and delegate
        structured parsing to the subclass's parse_content()."""
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                return {"success": False, "error": "Failed to fetch URL"}
            text_content = self.extract_text(html_content)
            if not text_content:
                return {"success": False, "error": "Failed to extract text content"}
            result = self.parse_content(html_content, text_content, url)
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content
            return result
        except Exception as e:
            logger.error(f"Error in scraping {url}: {str(e)}")
            return {"success": False, "error": str(e), "url": url}

    @abstractmethod
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse the content and extract structured data.

        Must be implemented by subclasses.
        """
        raise NotImplementedError
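

# --- Example usage (illustrative sketch, not part of the original module) ---
# A minimal concrete subclass showing how parse_content() plugs into the
# scrape() template method. ArticleScraper and the fields it returns are
# hypothetical; extract_metadata() is trafilatura's top-level metadata helper
# in recent releases.

class ArticleScraper(BaseScraper):
    """Illustrative scraper returning basic article metadata and text stats."""

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        result: Dict[str, Any] = {"word_count": len(text_content.split())}
        try:
            # extract_metadata returns a document object with title/author/date
            # attributes (None where unavailable).
            metadata = trafilatura.extract_metadata(html_content)
            if metadata:
                result["title"] = metadata.title
                result["author"] = metadata.author
                result["date"] = metadata.date
        except Exception as e:
            logger.warning(f"Metadata extraction failed for {url}: {str(e)}")
        return result


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    scraper = ArticleScraper()
    data = scraper.scrape("https://example.com")
    print(data.get("success"), data.get("title"), data.get("word_count"))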