Spaces:
Running
Running
File size: 2,686 Bytes
2c72e40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import logging
import requests
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
import trafilatura
logger = logging.getLogger(__name__)
class BaseScraper(ABC):
    """Abstract base class for all scrapers.

    Provides fetching (``fetch_url``) and main-text extraction
    (``extract_text``) built on trafilatura, plus a template-method
    ``scrape`` that delegates structured parsing to subclasses via the
    abstract ``parse_content`` hook. All failures are logged and reported
    through return values (``None`` / ``{"success": False, ...}``) rather
    than raised, so scraping stays best-effort.
    """

    def __init__(self, timeout: int = 30):
        """Initialize the scraper.

        Args:
            timeout: Intended request timeout in seconds.

        NOTE(review): ``timeout`` and ``headers`` are stored but never used
        by ``fetch_url`` below, because ``trafilatura.fetch_url`` does not
        accept them directly. Presumably they exist for subclasses that do
        their own HTTP requests (e.g. via the module-level ``requests``
        import) — confirm before removing.
        """
        self.timeout = timeout
        # Browser-like headers to reduce the chance of bot blocking.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }

    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch raw content from ``url`` using trafilatura.

        Returns:
            The downloaded document as a string, or ``None`` on any
            failure (download errors are logged, never raised).
        """
        # Lazy %-style args so formatting only happens if the record is emitted.
        logger.debug("Fetching URL: %s", url)
        try:
            # Note: trafilatura.fetch_url doesn't accept a timeout parameter
            # directly, so self.timeout is not applied here.
            downloaded = trafilatura.fetch_url(url)
        except Exception:
            # Broad catch is deliberate: scraping is best-effort and callers
            # handle None. logger.exception preserves the traceback.
            logger.exception("Error fetching URL %s", url)
            return None
        if not downloaded:
            logger.error("Failed to download content from %s", url)
            return None
        return downloaded

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract the main text content from raw HTML.

        Returns:
            The extracted text, or ``None`` if the input is empty or
            extraction fails (errors are logged, never raised).
        """
        if not html_content:
            return None
        try:
            return trafilatura.extract(html_content)
        except Exception:
            logger.exception("Error extracting text")
            return None

    def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch, extract and parse ``url`` (template method).

        Returns:
            On success, the dict produced by ``parse_content`` augmented
            with ``success=True``, ``url`` and ``text_content``. On failure,
            ``{"success": False, "error": <message>}`` (plus ``url`` when
            the failure was an unexpected exception).
        """
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                return {"success": False, "error": "Failed to fetch URL"}
            text_content = self.extract_text(html_content)
            if not text_content:
                return {"success": False, "error": "Failed to extract text content"}
            # Subclass hook: turn raw/extracted content into structured data.
            result = self.parse_content(html_content, text_content, url)
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content
            return result
        except Exception as e:
            # Last-resort boundary: never let a subclass bug escape scrape().
            logger.exception("Error in scraping %s", url)
            return {"success": False, "error": str(e), "url": url}

    @abstractmethod
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse the content and extract structured data.

        Implementations return a dict of structured fields; ``scrape``
        adds the ``success``/``url``/``text_content`` keys on top.
        """
        pass
|