Spaces:

pareshmishra
/

MT564AITraining

Running

MT564AITraining / scrapers /astrology_com_scraper.py

pareshmishra

Add full project source files for MT564 AI

2c72e40 27 days ago

3.68 kB

	import logging
	import re
	from typing import Dict, Any, Optional
	from bs4 import BeautifulSoup
	from datetime import datetime, date
	from .horoscope_scraper import HoroscopeScraper

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	class AstrologyComScraper(HoroscopeScraper):
	    """Scraper for daily horoscopes published on Astrology.com.

	    Provides Astrology.com-specific URL formatting, prediction extraction
	    and date extraction on top of ``HoroscopeScraper``.
	    """

	    # English month names in calendar order; position + 1 is the month number.
	    # An explicit table keeps date parsing independent of the process locale.
	    _MONTH_NAMES = (
	        'january', 'february', 'march', 'april',
	        'may', 'june', 'july', 'august',
	        'september', 'october', 'november', 'december',
	    )

	    # ISO date (YYYY-MM-DD) embedded anywhere in a URL.
	    _URL_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')

	    # "<month> <day>, <year>" with an optional dot after the month and an
	    # optional comma after the day, e.g. "May 13, 2025" or "Sept. 3 2025".
	    _TEXT_DATE_RE = re.compile(r'([A-Za-z]+)\.?\s+(\d{1,2}),?\s+(\d{4})')

	    def __init__(self, timeout: int = 30):
	        """Initialise the scraper.

	        Args:
	            timeout: Passed unchanged to ``HoroscopeScraper.__init__``
	                (presumably the request timeout in seconds -- confirm
	                against the base class).
	        """
	        super().__init__(timeout)
	        self.source_name = "Astrology.com"
	        self.base_url = "https://www.astrology.com/horoscope/daily"

	    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
	        """Build the horoscope page URL for a zodiac sign.

	        URL layouts produced:
	            ``{base_url}/{sign}.html``              -- no date given
	            ``{base_url}/{YYYY-MM-DD}/{sign}.html`` -- specific date

	        Args:
	            base_url: Root of the daily-horoscope section; a trailing slash
	                is tolerated.
	            sign: Zodiac sign name; surrounding whitespace is removed and
	                the name is lower-cased.
	            date_str: Optional date in ``YYYY-MM-DD`` form.

	        Returns:
	            The dated URL when ``date_str`` is a valid calendar date,
	            otherwise the undated URL. An invalid ``date_str`` is logged
	            as an error and never inserted into the URL.
	        """
	        root = base_url.rstrip('/')
	        page = f"{sign.strip().lower()}.html"

	        if date_str:
	            try:
	                requested = datetime.strptime(date_str.strip(), '%Y-%m-%d').date()
	            except (AttributeError, TypeError, ValueError) as e:
	                # Malformed or non-string date: fall through to the undated URL.
	                logger.error("Error formatting date: %s", e)
	            else:
	                # isoformat() always zero-pads, so "2025-5-3" becomes "2025-05-03".
	                return f"{root}/{requested.isoformat()}/{page}"

	        # No usable date: request the undated page.
	        return f"{root}/{page}"

	    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
	        """Extract the horoscope text from an Astrology.com page.

	        Args:
	            soup: Parsed page.
	            text_content: Forwarded unchanged to the base-class extractor
	                when the site-specific selectors yield no text.

	        Returns:
	            The non-empty paragraphs found under a ``div`` whose class
	            contains ``daily-horoscope`` or under ``.main-horoscope``,
	            joined by single spaces; otherwise whatever
	            ``HoroscopeScraper._extract_prediction`` returns.
	        """
	        paragraphs = soup.select('div[class*="daily-horoscope"] p, .main-horoscope p')
	        pieces = (p.get_text().strip() for p in paragraphs)
	        prediction = ' '.join(piece for piece in pieces if piece)

	        # Matching but empty paragraphs must not suppress the generic fallback.
	        if prediction:
	            return prediction

	        return super()._extract_prediction(soup, text_content)

	    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
	        """Determine the date a horoscope page refers to.

	        Sources are tried in order: an ISO date embedded in ``url``, then
	        the text of each ``.date-selector h2`` / ``.horoscope-date``
	        element, then today's date.

	        Args:
	            soup: Parsed page.
	            url: URL the page was fetched from.

	        Returns:
	            The date as a ``YYYY-MM-DD`` string.
	        """
	        url_match = self._URL_DATE_RE.search(url)
	        if url_match:
	            return url_match.group(1)

	        for node in soup.select('.date-selector h2, .horoscope-date'):
	            found = self._parse_date_text(node.get_text().strip())
	            if found is not None:
	                return found.isoformat()

	        # Default to today's date if no date found
	        return date.today().isoformat()

	    @classmethod
	    def _parse_date_text(cls, date_text: str) -> Optional[date]:
	        """Return the first valid "<month> <day>, <year>" date in ``date_text``.

	        Candidates with an unrecognised month name or an impossible
	        calendar date are skipped rather than guessed at. Returns ``None``
	        when no candidate is valid.
	        """
	        for candidate in cls._TEXT_DATE_RE.finditer(date_text):
	            month_token, day, year = candidate.groups()
	            month = cls._month_number(month_token)
	            if month is None:
	                # e.g. a weekday name followed by digits; not a month.
	                continue
	            try:
	                return date(int(year), month, int(day))
	            except ValueError:
	                # Out-of-range day or year (e.g. "February 30, 2025").
	                logger.debug("Ignoring impossible date %r", candidate.group(0))
	        return None

	    @classmethod
	    def _month_number(cls, token: str) -> Optional[int]:
	        """Map an English month name or abbreviation (3+ letters) to 1-12.

	        Returns ``None`` for anything that is not a prefix of a month name.
	        """
	        lowered = token.lower()
	        # Fewer than three letters would be ambiguous ("ma", "ju").
	        if len(lowered) < 3:
	            return None
	        for number, name in enumerate(cls._MONTH_NAMES, start=1):
	            if name.startswith(lowered):
	                return number
	        return None