Spaces:
Running
Running
File size: 4,029 Bytes
2c72e40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import logging
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from datetime import datetime, date
from .horoscope_scraper import HoroscopeScraper
logger = logging.getLogger(__name__)
class HoroscopeComScraper(HoroscopeScraper):
    """Scraper for Horoscope.com daily horoscopes.

    Specialises ``HoroscopeScraper`` with the URL layout and HTML selectors
    used by horoscope.com: the sign is addressed by a numeric id and an
    optional date is passed through the ``laDate`` query parameter.
    """

    # Numeric ids horoscope.com uses for each zodiac sign in the ``sign``
    # query parameter.
    _SIGN_IDS = {
        "aries": 1, "taurus": 2, "gemini": 3, "cancer": 4,
        "leo": 5, "virgo": 6, "libra": 7, "scorpio": 8,
        "sagittarius": 9, "capricorn": 10, "aquarius": 11, "pisces": 12,
    }

    # English month names and their common abbreviations -> month number.
    _MONTH_NUMBERS = {
        "january": 1, "jan": 1,
        "february": 2, "feb": 2,
        "march": 3, "mar": 3,
        "april": 4, "apr": 4,
        "may": 5,
        "june": 6, "jun": 6,
        "july": 7, "jul": 7,
        "august": 8, "aug": 8,
        "september": 9, "sep": 9, "sept": 9,
        "october": 10, "oct": 10,
        "november": 11, "nov": 11,
        "december": 12, "dec": 12,
    }

    # ``laDate=YYYYMMDD`` as written by ``_format_url``.
    _URL_DATE_RE = re.compile(r'laDate=(\d{8})')

    # "<word> <day>, <year>" candidates inside a heading, e.g. "May 13, 2025"
    # or "Sep. 3 2025". The word is validated against ``_MONTH_NUMBERS``.
    _HEADING_DATE_RE = re.compile(r'([A-Za-z]+)\.?\s+(\d{1,2}),?\s+(\d{4})')

    def __init__(self, timeout: int = 30) -> None:
        """Initialise the scraper.

        Args:
            timeout: Forwarded unchanged to ``HoroscopeScraper.__init__``
                (presumably the request timeout in seconds -- confirm against
                the base class).
        """
        super().__init__(timeout)
        self.source_name = "Horoscope.com"
        self.base_url = "https://www.horoscope.com/us/horoscopes/general/horoscope-general-daily-today.aspx"

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """Build the horoscope.com URL for a sign and optional date.

        Args:
            base_url: Accepted for signature compatibility; the URL is always
                built from ``self.base_url``.
            sign: Zodiac sign name, matched case-insensitively. An
                unrecognised sign falls back to Aries (id 1) and is logged.
            date_str: Optional date in ``YYYY-MM-DD`` form. If it cannot be
                parsed, the error is logged and the undated URL is returned.

        Returns:
            ``self.base_url`` with a ``sign`` query parameter and, when a
            valid date was given, a ``laDate=YYYYMMDD`` parameter.
        """
        sign_id = self._SIGN_IDS.get(sign.strip().lower())
        if sign_id is None:
            # Keep the historical fallback, but make the substitution visible.
            logger.warning("Unknown zodiac sign %r; defaulting to Aries", sign)
            sign_id = 1

        url = f"{self.base_url}?sign={sign_id}"
        if date_str:
            try:
                parsed = datetime.strptime(date_str, '%Y-%m-%d')
            except (ValueError, TypeError) as exc:
                # Bad or non-string date: fall back to the undated URL.
                logger.error("Error formatting date: %s", exc)
            else:
                url += f"&laDate={parsed.strftime('%Y%m%d')}"
        return url

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """Extract the horoscope prediction text from a horoscope.com page.

        Tries, in order: the paragraphs under ``.main-horoscope``, the
        ``#textline`` element, and finally the base-class extraction. A
        selector that matches but yields only whitespace is treated as a miss
        so the next strategy still gets a chance.

        Args:
            soup: Parsed page.
            text_content: Passed through to the base-class fallback.

        Returns:
            The prediction text.
        """
        # Primary location: every non-empty paragraph, joined by single spaces.
        paragraphs = (p.get_text().strip() for p in soup.select('.main-horoscope p'))
        prediction = ' '.join(text for text in paragraphs if text)
        if prediction:
            return prediction

        # Alternative selector.
        for node in soup.select('#textline'):
            text = node.get_text().strip()
            if text:
                return text

        # Fallback to generic extraction.
        return super()._extract_prediction(soup, text_content)

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """Determine the date a horoscope applies to.

        Tries, in order: the ``laDate=YYYYMMDD`` parameter in ``url``, a
        "Month D, YYYY" date inside an ``h1``/``h2`` under ``.main-horoscope``,
        and finally today's date.

        Args:
            soup: Parsed page.
            url: URL the page was fetched from.

        Returns:
            The date as an ISO ``YYYY-MM-DD`` string.
        """
        url_match = self._URL_DATE_RE.search(url)
        if url_match:
            try:
                return datetime.strptime(url_match.group(1), '%Y%m%d').strftime('%Y-%m-%d')
            except ValueError:
                # Eight digits that are not a real calendar date; try the page.
                logger.debug("Ignoring invalid laDate value %r", url_match.group(1))

        # Heading text typically looks like
        # "Taurus Daily Horoscope for May 13, 2025".
        for heading in soup.select('.main-horoscope h1, .main-horoscope h2'):
            for candidate in self._HEADING_DATE_RE.finditer(heading.get_text()):
                month_name, day, year = candidate.groups()
                month = self._MONTH_NUMBERS.get(month_name.lower())
                if month is None:
                    # Not a month name: skip it rather than guess a month.
                    continue
                try:
                    return date(int(year), month, int(day)).isoformat()
                except ValueError:
                    # Impossible date such as "May 32"; keep looking.
                    continue

        # Default to today's date if no date found.
        return date.today().isoformat()