# Spaces: pareshmishra / MT564AITraining (Running)
# File size: 3,675 Bytes
# 2c72e40

import logging
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from datetime import datetime, date
from .horoscope_scraper import HoroscopeScraper

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class AstrologyComScraper(HoroscopeScraper):
    """Scraper for Astrology.com daily horoscopes.

    Supplies the Astrology.com URL layout and HTML selectors on top of
    ``HoroscopeScraper``. The underscore-prefixed methods are presumably
    hooks invoked by the base class's fetch flow -- confirm against
    ``horoscope_scraper``.
    """

    # Full English month names in calendar order (index + 1 == month number).
    # A fixed table keeps date parsing independent of the process locale.
    _MONTH_NAMES = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    # ISO date (YYYY-MM-DD) embedded anywhere in a URL.
    _URL_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')

    # "<month word> <day>, <year>" anywhere in free text; the month may be
    # abbreviated with a trailing dot and the comma is optional.
    _TEXT_DATE_RE = re.compile(
        r'([A-Za-z]+)\.?\s+(\d{1,2})(?:\s*,\s*|\s+)(\d{4})'
    )

    def __init__(self, timeout: int = 30):
        """Initialise the scraper.

        Args:
            timeout: Forwarded unchanged to ``HoroscopeScraper.__init__``.
        """
        super().__init__(timeout)
        self.source_name = "Astrology.com"
        self.base_url = "https://www.astrology.com/horoscope/daily"

    @classmethod
    def _month_number(cls, word: str) -> Optional[int]:
        """Return the 1-12 month number for an English month word, else None.

        Accepts full names and any prefix of at least three letters
        ("Sep", "Sept"), case-insensitively. Three letters is the shortest
        prefix that is unambiguous across all twelve names.
        """
        key = word.strip('.').lower()
        if len(key) < 3:
            return None
        for number, full_name in enumerate(cls._MONTH_NAMES, start=1):
            if full_name.startswith(key):
                return number
        return None

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """Build the Astrology.com page URL for a sign and optional date.

        Layouts:
            ``{base_url}/{sign}.html``               (no date: current page)
            ``{base_url}/{YYYY-MM-DD}/{sign}.html``  (specific date)

        Args:
            base_url: Root of the daily-horoscope section.
            sign: Zodiac sign name; lower-cased and stripped for the URL.
            date_str: Optional date in ``YYYY-MM-DD`` form.

        Returns:
            The dated URL when ``date_str`` is a valid calendar date,
            otherwise the undated URL (an invalid date is logged as an error).
        """
        root = base_url.rstrip('/')
        slug = sign.strip().lower()

        if date_str:
            try:
                # Validate and normalise rather than splicing arbitrary text
                # into the path; str() tolerates callers passing a date object.
                day = datetime.strptime(str(date_str).strip(), '%Y-%m-%d').date()
            except ValueError as e:
                logger.error("Error formatting date: %s", e)
            else:
                return f"{root}/{day.isoformat()}/{slug}.html"

        # No (usable) date: the undated URL serves the current horoscope.
        return f"{root}/{slug}.html"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """Extract the horoscope text from an Astrology.com page.

        Args:
            soup: Parsed page.
            text_content: Passed through to the base-class fallback.

        Returns:
            The non-empty paragraphs found under a ``div`` whose class
            contains ``daily-horoscope`` (or under ``.main-horoscope``),
            joined with single spaces. If none carry text, the result of
            ``HoroscopeScraper._extract_prediction``.
        """
        paragraphs = soup.select('div[class*="daily-horoscope"] p, .main-horoscope p')

        # Drop empty paragraphs so they neither add stray spaces nor mask the
        # fallback when the selectors match only blank markup.
        pieces = [text for text in (p.get_text().strip() for p in paragraphs) if text]
        if pieces:
            return ' '.join(pieces)

        # Fallback to generic extraction
        return super()._extract_prediction(soup, text_content)

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """Determine the horoscope's date as an ISO ``YYYY-MM-DD`` string.

        Sources are tried in order:
            1. a valid ``YYYY-MM-DD`` segment in ``url``;
            2. text such as "May 13, 2025" (month may be abbreviated) in any
               ``.date-selector h2`` or ``.horoscope-date`` element;
            3. today's local date.

        Args:
            soup: Parsed page.
            url: URL the page was fetched from.

        Returns:
            The ISO-formatted date.
        """
        url_match = self._URL_DATE_RE.search(url)
        if url_match:
            candidate = url_match.group(1)
            try:
                # Reject date-shaped but impossible values such as 2025-13-45.
                datetime.strptime(candidate, '%Y-%m-%d')
            except ValueError:
                pass  # not a real date; fall through to the page content
            else:
                return candidate

        for element in soup.select('.date-selector h2, .horoscope-date'):
            date_text = element.get_text().strip()
            for found in self._TEXT_DATE_RE.finditer(date_text):
                month_word, day, year = found.groups()
                month = self._month_number(month_word)
                if month is None:
                    # Unknown word before the digits (e.g. a weekday); never
                    # guess a month, just try the next candidate.
                    continue
                try:
                    return date(int(year), month, int(day)).isoformat()
                except ValueError:
                    # Impossible calendar date (e.g. February 30).
                    continue

        # Default to today's date if no date found
        return date.today().isoformat()