Spaces:

pareshmishra
/

MT564AITraining

Running

File size: 2,686 Bytes

2c72e40

import logging
import requests
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
import trafilatura

logger = logging.getLogger(__name__)

class BaseScraper(ABC):
    """Base class for all scrapers.

    Provides a fixed pipeline in :meth:`scrape` (download the page, extract
    its main text, then hand both to :meth:`parse_content`). Subclasses only
    implement :meth:`parse_content`.

    Attributes:
        timeout: Download timeout in seconds, applied to trafilatura's
            ``DOWNLOAD_TIMEOUT`` setting by :meth:`fetch_url`.
        headers: Browser-like HTTP headers stored on the instance. No method
            of this base class sends them; they are kept available for
            subclasses.
    """

    def __init__(self, timeout: int = 30):
        """Store the download timeout and the default HTTP headers.

        Args:
            timeout: Download timeout in seconds.
        """
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }

    @staticmethod
    def _failure(message: str, url: str) -> Dict[str, Any]:
        """Build the failure result; every failure carries the same keys."""
        return {"success": False, "error": message, "url": url}

    def _fetch_options(self) -> Dict[str, Any]:
        """Return keyword arguments for ``trafilatura.fetch_url``.

        trafilatura has no ``timeout`` argument; the timeout is read from the
        ``DOWNLOAD_TIMEOUT`` entry of the config object passed to it.
        """
        # getattr keeps subclasses that never called super().__init__()
        # working: they simply get trafilatura's default settings.
        timeout = getattr(self, "timeout", None)
        if timeout is None:
            return {}

        # Imported here so the file's top-level import block stays untouched.
        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG

        # Copy first: DEFAULT_CONFIG is shared by the whole process.
        config = deepcopy(DEFAULT_CONFIG)
        # ConfigParser values must be strings, and the option is an integer.
        config["DEFAULT"]["DOWNLOAD_TIMEOUT"] = str(int(timeout))
        return {"config": config}

    def fetch_url(self, url: str) -> Optional[str]:
        """Download a page with trafilatura, honouring ``self.timeout``.

        Args:
            url: Address of the page to download.

        Returns:
            The downloaded content, or ``None`` when nothing was downloaded
            or any exception was raised (the error is logged, not raised).
        """
        try:
            logger.debug("Fetching URL: %s", url)
            downloaded = trafilatura.fetch_url(url, **self._fetch_options())
            if not downloaded:
                logger.error("Failed to download content from %s", url)
                return None
            return downloaded
        except Exception as e:
            # Deliberately best-effort: callers treat None as "fetch failed".
            logger.error("Error fetching URL %s: %s", url, e)
            return None

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract the main text content from HTML.

        Args:
            html_content: Raw HTML of the page.

        Returns:
            Whatever ``trafilatura.extract`` returns for the HTML, or ``None``
            when the input is empty or extraction raised (the error is
            logged, not raised).
        """
        if not html_content:
            return None
        try:
            return trafilatura.extract(html_content)
        except Exception as e:
            logger.error("Error extracting text: %s", e)
            return None

    def scrape(self, url: str) -> Dict[str, Any]:
        """Run the full pipeline for one URL: fetch, extract, parse.

        This method is concrete; subclasses customise it by implementing
        :meth:`parse_content`.

        Args:
            url: Address of the page to scrape.

        Returns:
            On success, the dict returned by :meth:`parse_content` with
            ``"success": True``, ``"url"`` and ``"text_content"`` set on it
            (overwriting same-named keys). On failure, a dict with
            ``"success": False``, ``"error"`` and ``"url"``. Exceptions are
            logged and reported through the failure dict, never raised.
        """
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                return self._failure("Failed to fetch URL", url)

            text_content = self.extract_text(html_content)
            if not text_content:
                return self._failure("Failed to extract text content", url)

            result = self.parse_content(html_content, text_content, url)
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content
            return result
        except Exception as e:
            # Top-level boundary: a broken parser must not crash the caller.
            logger.error("Error in scraping %s: %s", url, e)
            return self._failure(str(e), url)

    @abstractmethod
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse the content and extract structured data.

        Args:
            html_content: Raw HTML returned by :meth:`fetch_url`.
            text_content: Main text returned by :meth:`extract_text`.
            url: Address the content was downloaded from.

        Returns:
            A mutable dict of structured data; :meth:`scrape` adds the
            ``"success"``, ``"url"`` and ``"text_content"`` keys to it.
        """
        pass