MT564AITraining/scrapers/base_scraper.py
import logging
import requests
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional

import trafilatura

logger = logging.getLogger(__name__)


class BaseScraper(ABC):
    """Base class for all scrapers"""

    def __init__(self, timeout: int = 30):
        self.timeout = timeout
        # Default request settings, kept for subclasses that may fetch with
        # requests directly; trafilatura.fetch_url does not use them.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }
    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch content from URL using trafilatura"""
        try:
            logger.debug(f"Fetching URL: {url}")
            # Note: trafilatura.fetch_url doesn't accept a timeout parameter directly
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                logger.error(f"Failed to download content from {url}")
                return None
            return downloaded
        except Exception as e:
            logger.error(f"Error fetching URL {url}: {str(e)}")
            return None

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract main text content from HTML"""
        try:
            if not html_content:
                return None
            text = trafilatura.extract(html_content)
            return text
        except Exception as e:
            logger.error(f"Error extracting text: {str(e)}")
            return None
    def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch a URL, extract its text, and delegate parsing to parse_content"""
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                return {"success": False, "error": "Failed to fetch URL"}
            text_content = self.extract_text(html_content)
            if not text_content:
                return {"success": False, "error": "Failed to extract text content"}
            result = self.parse_content(html_content, text_content, url)
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content
            return result
        except Exception as e:
            logger.error(f"Error in scraping {url}: {str(e)}")
            return {"success": False, "error": str(e), "url": url}

    @abstractmethod
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse the content and extract structured data"""
        pass
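

# --- Illustrative sketch (not part of the original module) -------------------
# A minimal example of how a concrete scraper might subclass BaseScraper and
# implement parse_content. The class name and the fields it returns are
# assumptions made purely for demonstration.

class ExampleTextScraper(BaseScraper):
    """Example subclass returning a few simple fields derived from the text"""

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        # Derive lightweight structured data from the already-extracted text
        lines = text_content.splitlines()
        return {
            "first_line": lines[0] if lines else "",
            "word_count": len(text_content.split()),
        }


if __name__ == "__main__":
    # Quick manual check; the URL below is a placeholder.
    scraper = ExampleTextScraper()
    result = scraper.scrape("https://example.com")
    print(result.get("success"), result.get("word_count"))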