import logging
import re
from datetime import date, datetime
from typing import Any, Dict, Optional

from bs4 import BeautifulSoup

from .horoscope_scraper import HoroscopeScraper

logger = logging.getLogger(__name__)

# Date layout used both in dated URLs and in the values returned to callers.
_ISO_DATE_FORMAT = "%Y-%m-%d"

# English month names in calendar order; position + 1 is the month number.
_MONTH_NAMES = (
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
)

# An ISO-looking date embedded anywhere in a URL.
_URL_DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")

# "<month word>[.] <day>[,] <year>", e.g. "May 13, 2025" or "Sep. 3 2025".
_TEXT_DATE_RE = re.compile(r"([A-Za-z]+)\.?\s+(\d{1,2}),?\s+(\d{4})")


def _month_number(token: str) -> Optional[int]:
    """Return the 1-based month for a full or abbreviated English month name.

    Abbreviations need at least three letters and must be a prefix of the
    full name ("sep", "sept"). Returns None for anything else, so callers
    never silently get a wrong month.
    """
    name = token.lower()
    if len(name) < 3:
        return None
    for number, full_name in enumerate(_MONTH_NAMES, start=1):
        if full_name.startswith(name):
            return number
    return None


def _parse_date_text(text: str) -> Optional[date]:
    """Parse a human-readable date such as "May 13, 2025" out of *text*.

    Returns None when no valid calendar date can be found.
    """
    try:
        return datetime.strptime(text, '%B %d, %Y').date()
    except ValueError:
        # Not an exact "Month D, YYYY" string; scan for the pattern instead.
        pass

    for match in _TEXT_DATE_RE.finditer(text):
        month_token, day, year = match.groups()
        month = _month_number(month_token)
        if month is None:
            # The word before the digits was not a month name.
            continue
        try:
            return date(int(year), month, int(day))
        except ValueError:
            # Impossible calendar date (e.g. February 31); keep looking.
            continue
    return None


class AstrologyComScraper(HoroscopeScraper):
    """Scraper for Astrology.com horoscopes."""

    def __init__(self, timeout: int = 30):
        """Initialise the scraper.

        Args:
            timeout: Forwarded unchanged to ``HoroscopeScraper.__init__``.
        """
        super().__init__(timeout)
        self.source_name = "Astrology.com"
        self.base_url = "https://www.astrology.com/horoscope/daily"

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """Format the URL for astrology.com.

        The format is typically ``<base_url>/<sign>.html``, or for a specific
        date ``<base_url>/<YYYY-MM-DD>/<sign>.html``.

        Args:
            base_url: URL prefix without a trailing slash.
            sign: Zodiac sign; lower-cased before use.
            date_str: Optional date in ``YYYY-MM-DD`` form.

        Returns:
            The dated URL when ``date_str`` is a valid date, otherwise the
            undated URL for the sign.
        """
        sign_slug = sign.lower()
        if date_str:
            try:
                requested = datetime.strptime(str(date_str).strip(), _ISO_DATE_FORMAT)
            except ValueError as e:
                # A malformed date must not leak into the URL; log it and
                # fall back to the undated URL below.
                logger.error(f"Error formatting date: {str(e)}")
            else:
                # isoformat() re-emits the date zero-padded.
                return f"{base_url}/{requested.date().isoformat()}/{sign_slug}.html"

        # No date (or an invalid one): use the undated URL.
        return f"{base_url}/{sign_slug}.html"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """Extract the horoscope prediction from an astrology.com page.

        Args:
            soup: Parsed page.
            text_content: Passed through to the base-class extractor when
                the site-specific selectors yield no text.

        Returns:
            The non-empty paragraphs found under the horoscope containers,
            joined with single spaces, or the base-class result otherwise.
        """
        # The horoscope typically sits in a div whose class contains
        # 'daily-horoscope', or under '.main-horoscope'.
        paragraphs = soup.select('div[class*="daily-horoscope"] p, .main-horoscope p')
        texts = [text for text in (p.get_text().strip() for p in paragraphs) if text]
        if texts:
            return ' '.join(texts)

        # Nothing usable matched (no paragraphs, or only empty ones).
        return super()._extract_prediction(soup, text_content)

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """Extract the horoscope date as a ``YYYY-MM-DD`` string.

        Sources are tried in order: a valid ISO date embedded in ``url``,
        then the text of each ``.date-selector h2`` / ``.horoscope-date``
        element, and finally today's date.

        Args:
            soup: Parsed page.
            url: URL the page was fetched from.

        Returns:
            The date in ``YYYY-MM-DD`` form.
        """
        url_match = _URL_DATE_RE.search(url)
        if url_match:
            candidate = url_match.group(1)
            try:
                datetime.strptime(candidate, _ISO_DATE_FORMAT)
            except ValueError:
                # Digits shaped like a date but not a real one (e.g. month 99).
                logger.debug("Ignoring invalid date in URL: %s", candidate)
            else:
                return candidate

        # Date text on the page is typically like "May 13, 2025".
        for node in soup.select('.date-selector h2, .horoscope-date'):
            parsed = _parse_date_text(node.get_text().strip())
            if parsed is not None:
                return parsed.isoformat()

        # Default to today's date if no date was found.
        return date.today().isoformat()