File size: 6,155 Bytes
2c72e40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import logging
import re
from typing import Dict, Any, List, Optional
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
from .base_scraper import BaseScraper

# Module-level logger, named after this module so log records can be filtered by it.
logger = logging.getLogger(__name__)

class HoroscopeScraper(BaseScraper):
    """Generic scraper for horoscope websites.

    Provides sign validation, URL construction and best-effort extraction of
    the prediction text, date and source domain from a page. The ``_format_url``
    and ``_extract_*`` hooks are deliberately generic and are meant to be
    overridden by site-specific subclasses.
    """

    # Valid zodiac signs, lowercase.
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer",
        "leo", "virgo", "libra", "scorpio",
        "sagittarius", "capricorn", "aquarius", "pisces"
    ]

    # Date formats tried, in order, when parsing on-page date text. Order
    # matters for ambiguous numeric dates: "03/04/2024" is read as
    # month/day (US style) because that format comes first.
    _DATE_FORMATS = ('%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y')

    def __init__(self, timeout: int = 30):
        """Initialise the scraper.

        Args:
            timeout: Forwarded unchanged to ``BaseScraper.__init__``.
        """
        super().__init__(timeout)
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs

        A failure for one sign is logged and does not stop the remaining
        signs from being scraped.

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data, containing only the
            results whose ``success`` entry is truthy
        """
        results = []

        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
                if horoscope_data and horoscope_data.get('success', False):
                    results.append(horoscope_data)
            except Exception as e:
                # Best-effort boundary: one bad sign must not abort the batch.
                logger.error("Error scraping %s horoscope: %s", sign, e)

        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign (matched case-insensitively)
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data. For an invalid sign this is
            ``{"success": False, "error": ...}`` and no request is made.
        """
        # Validate sign; a non-string (e.g. None) is reported as invalid
        # instead of raising AttributeError on ``.lower()``.
        if not isinstance(sign, str) or sign.lower() not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

        normalized_sign = sign.lower()

        # Format the URL for the specific sign
        url = self._format_url(base_url, sign, date_str)

        # Use the base scraper method to get the content
        # NOTE(review): assumes ``scrape`` (not defined in this class,
        # presumably inherited from BaseScraper) returns a mutable dict.
        result = self.scrape(url)

        # Add additional horoscope-specific metadata
        result["sign"] = normalized_sign
        result["scraped_date"] = date_str if date_str else date.today().isoformat()
        result["source_name"] = self.source_name

        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse horoscope content and extract structured data

        Args:
            html_content: Raw HTML of the page
            text_content: Plain-text rendering of the page
            url: URL the content was fetched from

        Returns:
            Dictionary with ``type``, ``prediction``, ``date`` and ``source``;
            on a parsing failure, ``type`` and ``error_parsing`` instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract metadata - this is a generic implementation
            # Specific horoscope sites will need custom implementations
            return {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Parsing is best-effort: report the failure in the result
            # rather than propagating it.
            logger.error("Error parsing horoscope content: %s", e)
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format URL for horoscope site. This is a generic implementation.
        Should be overridden in specific scrapers.

        The default appends the lowercased sign to ``base_url`` (with any
        trailing slashes removed); ``date_str`` is accepted but not used here.
        """
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract horoscope prediction text.
        Generic implementation - should be overridden in specific scrapers.

        Returns the text of the first non-empty common horoscope container;
        failing that, the first non-blank paragraph of ``text_content``;
        failing that, ``"No prediction available"``.
        """
        # Look for common horoscope content containers. Empty matches are
        # skipped so a blank placeholder does not hide a later real one.
        containers = soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p')
        for container in containers:
            container_text = container.get_text().strip()
            if container_text:
                return container_text

        # Fall back to the plain text. Blank paragraphs are skipped because
        # text starting with "\n\n" would otherwise yield an empty first one.
        if text_content:
            for paragraph in text_content.split('\n\n'):
                paragraph = paragraph.strip()
                if paragraph:
                    return paragraph

        return "No prediction available"

    def _parse_date_text(self, date_text: str) -> Optional[str]:
        """Return ``date_text`` as YYYY-MM-DD, or None if no known format matches."""
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(date_text, fmt).date().isoformat()
            except ValueError:
                continue
        return None

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract horoscope date.
        Generic implementation - should be overridden in specific scrapers.

        Sources are tried in order: a valid YYYY-MM-DD date in the URL, then
        common date elements on the page, then today's date.

        Returns:
            Date string in format YYYY-MM-DD
        """
        # Look for date in URL; accept only real calendar dates so that
        # something like "2024-99-99" is not returned as a date.
        for candidate in re.findall(r'\d{4}-\d{2}-\d{2}', url):
            try:
                datetime.strptime(candidate, '%Y-%m-%d')
            except ValueError:
                continue
            return candidate

        # Look for date in common elements
        for element in soup.select('.horoscope-date, .date, time'):
            candidates = []
            # <time datetime="..."> carries a machine-readable value; its
            # first 10 characters are the date part when it is a full date
            # or timestamp.
            datetime_attr = element.get('datetime')
            if isinstance(datetime_attr, str):
                candidates.append(datetime_attr.strip()[:10])
            candidates.append(element.get_text().strip())

            for candidate in candidates:
                parsed = self._parse_date_text(candidate)
                if parsed:
                    return parsed

        # Default to today's date if no date found
        return date.today().isoformat()

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL

        Returns:
            The network location (e.g. ``"example.com"``), or
            ``"Unknown Source"`` when the URL cannot be parsed or has no
            network location (for instance when the scheme is missing).
        """
        from urllib.parse import urlparse

        try:
            netloc = urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            return "Unknown Source"
        return netloc or "Unknown Source"