# Spaces: pareshmishra/MT564AITraining (Running)
# File size: 6,155 Bytes
# 2c72e40

import logging
import re
from typing import Dict, Any, List, Optional
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)

class HoroscopeScraper(BaseScraper):
    """Generic scraper for horoscope websites.

    Builds one URL per zodiac sign, delegates fetching to the ``scrape``
    method inherited from ``BaseScraper`` and annotates the returned dict
    with horoscope-specific metadata. ``_format_url``, ``_extract_prediction``
    and ``_extract_date`` are intentionally generic; site-specific scrapers
    are expected to override them.
    """

    # List of valid zodiac signs (all lowercase)
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer",
        "leo", "virgo", "libra", "scorpio",
        "sagittarius", "capricorn", "aquarius", "pisces"
    ]

    # strptime formats tried in order when parsing a date string. Ambiguous
    # numeric dates (e.g. 03/04/2024) resolve to the first format that
    # matches, i.e. month/day/year.
    _DATE_FORMATS = ('%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y')

    # Shape of an ISO calendar date (YYYY-MM-DD); matches are validated
    # separately because e.g. "2024-13-45" also fits this pattern.
    _ISO_DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2}')

    def __init__(self, timeout: int = 30):
        """
        Initialise the scraper.

        Args:
            timeout: Forwarded unchanged to ``BaseScraper.__init__``
                (presumably a request timeout in seconds - confirm against
                the base class).
        """
        super().__init__(timeout)
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs

        Signs whose scrape raises, or whose result is empty or lacks a truthy
        ``success`` entry, are left out of the returned list; exceptions are
        logged and never propagated.

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data for each sign that was
            scraped successfully
        """
        results = []

        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
                if horoscope_data and horoscope_data.get('success', False):
                    results.append(horoscope_data)
            except Exception as e:
                # Best effort: one failing sign must not abort the others.
                logger.error("Error scraping %s horoscope: %s", sign, e)

        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign; matched case-insensitively, surrounding
                whitespace is ignored
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data. For an unknown sign this is
            ``{"success": False, "error": ...}``; otherwise it is whatever
            ``self.scrape`` returned, extended with the ``sign``,
            ``scraped_date`` and ``source_name`` keys.
        """
        # Normalise once so validation, the URL and the stored metadata agree.
        normalized_sign = sign.strip().lower()

        # Validate sign
        if normalized_sign not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

        # Format the URL for the specific sign
        url = self._format_url(base_url, normalized_sign, date_str)

        # Use the base scraper method to get the content
        result = self.scrape(url)

        # Add additional horoscope-specific metadata
        result["sign"] = normalized_sign
        # The requested date when one was given, otherwise today's date.
        result["scraped_date"] = date_str if date_str else date.today().isoformat()
        result["source_name"] = self.source_name

        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """
        Parse horoscope content and extract structured data

        Args:
            html_content: Raw HTML of the page
            text_content: Plain-text rendering of the page, used as a
                fallback source for the prediction
            url: URL the content was fetched from

        Returns:
            Dictionary with ``type``, ``prediction``, ``date`` and ``source``
            keys, or ``{"type": "horoscope", "error_parsing": ...}`` when
            parsing raised.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract metadata - this is a generic implementation
            # Specific horoscope sites will need custom implementations
            return {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Parsing is best effort: report the failure in the result
            # instead of raising to the caller.
            logger.error("Error parsing horoscope content: %s", e)
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format URL for horoscope site. This is a generic implementation.
        Should be overridden in specific scrapers.

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign; lowercased before being appended
            date_str: Accepted for overriding scrapers; this generic
                implementation does not use it

        Returns:
            ``base_url`` without trailing slashes, followed by ``/<sign>``
        """
        # Default implementation just appends the sign to the base URL
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract horoscope prediction text.
        Generic implementation - should be overridden in specific scrapers.

        Args:
            soup: Parsed page
            text_content: Plain-text rendering of the page, used when no
                known container holds any text

        Returns:
            Text of the first non-empty matching container, else the first
            non-blank paragraph of ``text_content``, else
            ``"No prediction available"``
        """
        # Look for common horoscope content containers. An empty match (e.g.
        # a placeholder element) must not hide a later one that has text.
        for container in soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p'):
            container_text = container.get_text().strip()
            if container_text:
                return container_text

        # If no prediction was found, use the first non-blank paragraph of
        # the text content (paragraphs are separated by blank lines).
        for paragraph in (text_content or "").split('\n\n'):
            paragraph = paragraph.strip()
            if paragraph:
                return paragraph

        return "No prediction available"

    @classmethod
    def _parse_date_text(cls, date_text: str) -> Optional[str]:
        """
        Parse a date string using the formats in ``_DATE_FORMATS``.

        Args:
            date_text: Candidate date string, already stripped

        Returns:
            The date formatted as YYYY-MM-DD, or None when no format matches
        """
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return None

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract horoscope date.
        Generic implementation - should be overridden in specific scrapers.

        Args:
            soup: Parsed page
            url: URL the page was fetched from

        Returns:
            Date as YYYY-MM-DD: the first valid ISO date found in the URL,
            else the first parseable date among the page's date elements,
            else today's date
        """
        # Look for date in URL; skip date-shaped fragments that are not real
        # calendar dates (e.g. 2024-13-45).
        for url_match in self._ISO_DATE_RE.finditer(url):
            candidate = url_match.group(0)
            if self._parse_date_text(candidate) is not None:
                return candidate

        # Look for date in common elements
        for element in soup.select('.horoscope-date, .date, time'):
            candidates = []

            # Prefer the machine-readable value of <time datetime="...">: it
            # is unambiguous, unlike display text such as 03/04/2024. Only
            # its leading YYYY-MM-DD part is used, so full timestamps work.
            machine_value = element.get('datetime')
            if isinstance(machine_value, str):
                iso_match = self._ISO_DATE_RE.match(machine_value.strip())
                if iso_match:
                    candidates.append(iso_match.group(0))

            candidates.append(element.get_text().strip())

            for candidate in candidates:
                parsed = self._parse_date_text(candidate)
                if parsed is not None:
                    return parsed

        # Default to today's date if no date found
        return date.today().isoformat()

    def _extract_domain(self, url: str) -> str:
        """
        Extract domain from URL

        Args:
            url: URL to inspect

        Returns:
            The URL's network location (``netloc``), or ``"Unknown Source"``
            when the URL has none (e.g. it lacks a scheme) or cannot be
            parsed
        """
        from urllib.parse import urlparse

        try:
            netloc = urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            return "Unknown Source"

        # urlparse yields an empty netloc rather than raising for URLs
        # without a scheme, so the fallback has to cover that case as well.
        return netloc or "Unknown Source"