# MT564AITraining/scrapers/astrology_com_scraper.py
# pareshmishra
# Add full project source files for MT564 AI
# 2c72e40
import logging
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from datetime import datetime, date
from .horoscope_scraper import HoroscopeScraper
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class AstrologyComScraper(HoroscopeScraper):
    """Scraper for Astrology.com daily horoscopes.

    Specialises ``HoroscopeScraper`` with the URL layout and the CSS
    selectors used to locate the prediction text and the horoscope date.
    NOTE(review): the ``_format_url`` / ``_extract_*`` methods are presumably
    hooks invoked by the base class; confirm against ``HoroscopeScraper``.
    """

    # Lower-cased month names and common abbreviations mapped to month numbers.
    # Used when strict ``%B %d, %Y`` parsing of the page's date text fails.
    _MONTH_NUMBERS = {
        'january': 1, 'jan': 1,
        'february': 2, 'feb': 2,
        'march': 3, 'mar': 3,
        'april': 4, 'apr': 4,
        'may': 5,
        'june': 6, 'jun': 6,
        'july': 7, 'jul': 7,
        'august': 8, 'aug': 8,
        'september': 9, 'sep': 9, 'sept': 9,
        'october': 10, 'oct': 10,
        'november': 11, 'nov': 11,
        'december': 12, 'dec': 12,
    }

    def __init__(self, timeout: int = 30):
        """Initialise the scraper.

        Args:
            timeout: Value forwarded unchanged to ``HoroscopeScraper.__init__``.
        """
        super().__init__(timeout)
        self.source_name = "Astrology.com"
        self.base_url = "https://www.astrology.com/horoscope/daily"

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """Build the page URL for a sign, optionally for a specific date.

        The resulting layouts are::

            <base_url>/<sign>.html
            <base_url>/<date_str>/<sign>.html

        Args:
            base_url: URL prefix the path segments are appended to.
            sign: Zodiac sign name; it is lower-cased for the URL.
            date_str: Optional date segment, inserted verbatim (the site is
                expected to use ``YYYY-MM-DD``). Falsy values select the
                undated URL.

        Returns:
            The formatted URL.
        """
        page = f"{sign.lower()}.html"
        if date_str:
            return f"{base_url}/{date_str}/{page}"
        # No date given: the undated URL serves the current horoscope.
        return f"{base_url}/{page}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """Extract the horoscope prediction text from a parsed page.

        Paragraphs inside a ``div`` whose class contains ``daily-horoscope``,
        or inside ``.main-horoscope``, are joined with single spaces. When
        none of them carries any text, the base-class extraction is used.

        Args:
            soup: Parsed page.
            text_content: Passed through to the base-class fallback.

        Returns:
            The prediction text.
        """
        paragraphs = soup.select('div[class*="daily-horoscope"] p, .main-horoscope p')
        # Skip empty paragraphs so they neither add stray spaces nor make an
        # all-empty match look like a successful extraction.
        pieces = [text for text in (p.get_text().strip() for p in paragraphs) if text]
        if pieces:
            return ' '.join(pieces)
        # Fallback to generic extraction
        return super()._extract_prediction(soup, text_content)

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """Determine the horoscope date as a ``YYYY-MM-DD`` string.

        Sources are tried in order: a ``YYYY-MM-DD`` token in ``url``, the
        text of the first ``.date-selector h2`` / ``.horoscope-date`` element,
        and finally today's date.

        Args:
            soup: Parsed page.
            url: URL the page was fetched from.

        Returns:
            The date in ISO ``YYYY-MM-DD`` form.
        """
        url_match = re.search(r'(\d{4}-\d{2}-\d{2})', url)
        if url_match:
            return url_match.group(1)

        date_nodes = soup.select('.date-selector h2, .horoscope-date')
        if date_nodes:
            date_text = date_nodes[0].get_text().strip()
            parsed = self._parse_date_text(date_text)
            if parsed is not None:
                return parsed.isoformat()
            logger.warning("Could not parse horoscope date from %r", date_text)

        # Default to today's date if no date found
        return date.today().isoformat()

    @classmethod
    def _parse_date_text(cls, date_text: str) -> Optional[date]:
        """Parse text such as "May 13, 2025" into a date.

        Strict ``%B %d, %Y`` parsing is tried first. Failing that, the text is
        scanned for ``<month> <day>[,] <year>`` patterns where the month may
        be a full name or a common abbreviation.

        Args:
            date_text: Text to parse.

        Returns:
            The parsed date, or ``None`` when no valid date is found. An
            unrecognised month name is never substituted with a default month.
        """
        try:
            return datetime.strptime(date_text, '%B %d, %Y').date()
        except ValueError:
            pass  # Not in the strict format; try the looser pattern below.

        for match in re.finditer(r'([A-Za-z]+)\.?\s+(\d{1,2}),?\s+(\d{4})', date_text):
            month_name, day, year = match.groups()
            month_number = cls._MONTH_NUMBERS.get(month_name.lower())
            if month_number is None:
                continue
            try:
                return date(int(year), month_number, int(day))
            except ValueError:
                # Impossible calendar date (e.g. "February 31"); keep scanning.
                continue
        return None