File size: 2,686 Bytes
2c72e40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import logging
import requests
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
import trafilatura

logger = logging.getLogger(__name__)

class BaseScraper(ABC):
    """Base class for all scrapers.

    Subclasses implement ``parse_content``; ``scrape`` drives the
    fetch -> extract -> parse pipeline and always returns a dict that
    carries a boolean ``"success"`` flag and the requested ``"url"``.
    """

    def __init__(self, timeout: int = 30):
        """Store the request settings on the instance.

        Args:
            timeout: Timeout value kept as ``self.timeout``. The default
                ``fetch_url`` implementation does not forward it to
                trafilatura; it is available for subclasses that perform
                their own requests.
        """
        self.timeout = timeout
        # Browser-like headers. They are stored here but not used by the
        # trafilatura-based fetch in this class.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }

    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch content from URL using trafilatura.

        Args:
            url: Address of the page to download.

        Returns:
            Whatever ``trafilatura.fetch_url`` returned when it is truthy,
            otherwise ``None`` (empty download or any raised exception).
        """
        try:
            logger.debug("Fetching URL: %s", url)
            # Note: trafilatura.fetch_url doesn't accept a timeout parameter
            # directly, so self.timeout is not applied to this call.
            downloaded = trafilatura.fetch_url(url)
        except Exception as e:
            # Best-effort fetch: report the failure and signal it with None.
            logger.error("Error fetching URL %s: %s", url, e)
            return None

        if not downloaded:
            logger.error("Failed to download content from %s", url)
            return None
        return downloaded

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract main text content from HTML.

        Args:
            html_content: Raw HTML as returned by ``fetch_url``.

        Returns:
            The result of ``trafilatura.extract``, or ``None`` when the
            input is empty or extraction raised.
        """
        if not html_content:
            return None
        try:
            return trafilatura.extract(html_content)
        except Exception as e:
            logger.error("Error extracting text: %s", e)
            return None

    def scrape(self, url: str) -> Dict[str, Any]:
        """Run the full fetch -> extract -> parse pipeline for one URL.

        Args:
            url: Address of the page to scrape.

        Returns:
            On success, the dict produced by ``parse_content`` with
            ``"success": True``, ``"url"`` and ``"text_content"`` set on it
            (overwriting same-named keys). On failure, a dict with
            ``"success": False``, an ``"error"`` message and ``"url"``.
            This method does not raise; exceptions are converted into a
            failure result.
        """
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                # Include the URL in every failure so callers can always
                # correlate a result with its request.
                return {"success": False, "error": "Failed to fetch URL", "url": url}

            text_content = self.extract_text(html_content)
            if not text_content:
                return {
                    "success": False,
                    "error": "Failed to extract text content",
                    "url": url,
                }

            result = self.parse_content(html_content, text_content, url)
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content

            return result
        except Exception as e:
            # Top-level boundary: keep the traceback in the log because the
            # failure may originate in a subclass's parse_content.
            logger.exception("Error in scraping %s: %s", url, e)
            return {"success": False, "error": str(e), "url": url}

    @abstractmethod
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse the content and extract structured data.

        Args:
            html_content: Raw HTML of the page.
            text_content: Main text extracted from ``html_content``.
            url: Address the content was fetched from.

        Returns:
            A mutable dict of structured data; ``scrape`` adds the
            ``"success"``, ``"url"`` and ``"text_content"`` keys to it.
        """
        pass