Spaces:

pareshmishra
/

MT564AITraining

Running

MT564AITraining / scrapers /horoscope_scraper.py

pareshmishra

Add full project source files for MT564 AI

2c72e40 27 days ago

6.16 kB

	import logging
	import re
	from typing import Dict, Any, List, Optional
	from datetime import datetime, date
	from bs4 import BeautifulSoup
	import requests
	from .base_scraper import BaseScraper

	# Module-level logger named after this module; the scraper class below
	# reports scraping and parsing failures through it.
	logger = logging.getLogger(__name__)

	class HoroscopeScraper(BaseScraper):
	    """Generic scraper for horoscope websites.

	    The class builds one URL per zodiac sign, delegates the fetch to
	    ``self.scrape`` (inherited; presumably from ``BaseScraper`` -- confirm)
	    and tags the result with horoscope-specific metadata.

	    The ``_format_url``, ``_extract_prediction`` and ``_extract_date``
	    methods are deliberately generic and are meant to be overridden by
	    site-specific scrapers.
	    """

	    # Valid zodiac signs, all lowercase.
	    ZODIAC_SIGNS = [
	        "aries", "taurus", "gemini", "cancer",
	        "leo", "virgo", "libra", "scorpio",
	        "sagittarius", "capricorn", "aquarius", "pisces",
	    ]

	    # CSS selectors for elements that commonly hold the prediction text.
	    _PREDICTION_SELECTORS = '.horoscope-content, .prediction, .horoscope-prediction, article p'

	    # CSS selectors for elements that commonly hold the horoscope date.
	    _DATE_SELECTORS = '.horoscope-date, .date, time'

	    # strptime formats tried, in order, against date text found in the page.
	    _DATE_FORMATS = ('%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y')

	    # YYYY-MM-DD anywhere in a URL.
	    _URL_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')

	    # YYYY-MM-DD at the start of a timestamp such as "2024-05-01T08:00:00Z".
	    _ISO_DATE_PREFIX_RE = re.compile(r'(\d{4}-\d{2}-\d{2})(?=[T\s])')

	    def __init__(self, timeout: int = 30):
	        """Initialise the scraper.

	        Args:
	            timeout: Forwarded unchanged to the parent class initialiser.
	        """
	        super().__init__(timeout)
	        self.source_name = "Generic Horoscope Site"

	    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
	        """Scrape horoscopes for all zodiac signs.

	        Signs whose scrape raises, or whose result does not carry a truthy
	        ``'success'`` value, are skipped; failures are logged and never
	        propagated.

	        Args:
	            base_url: Base URL for the horoscope site.
	            date_str: Optional date string in format YYYY-MM-DD.

	        Returns:
	            List of dictionaries with horoscope data, one per successfully
	            scraped sign, in ``ZODIAC_SIGNS`` order.
	        """
	        results = []

	        for sign in self.ZODIAC_SIGNS:
	            # Best-effort: one failing sign must not abort the others.
	            try:
	                horoscope_data = self.scrape_sign(base_url, sign, date_str)
	                if horoscope_data and horoscope_data.get('success', False):
	                    results.append(horoscope_data)
	            except Exception as e:
	                logger.error("Error scraping %s horoscope: %s", sign, e)

	        return results

	    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
	        """Scrape the horoscope for a specific zodiac sign.

	        Args:
	            base_url: Base URL for the horoscope site.
	            sign: Zodiac sign; matched case-insensitively against
	                ``ZODIAC_SIGNS``.
	            date_str: Optional date string in format YYYY-MM-DD.

	        Returns:
	            ``{"success": False, "error": ...}`` when the sign is not a valid
	            zodiac sign (including non-string input). Otherwise the value
	            returned by ``self.scrape(url)`` with ``"sign"``,
	            ``"scraped_date"`` and ``"source_name"`` added.
	        """
	        # Normalise once; a non-string sign is reported as invalid rather
	        # than raising AttributeError on ``.lower()``.
	        normalized_sign = sign.lower() if isinstance(sign, str) else None
	        if normalized_sign not in self.ZODIAC_SIGNS:
	            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

	        # Format the URL for the specific sign
	        url = self._format_url(base_url, normalized_sign, date_str)

	        # Use the inherited scrape method to fetch the content.
	        # NOTE(review): assumes scrape() returns a mutable dict -- confirm
	        # against the parent class.
	        result = self.scrape(url)

	        # Add additional horoscope-specific metadata
	        result["sign"] = normalized_sign
	        result["scraped_date"] = date_str if date_str else date.today().isoformat()
	        result["source_name"] = self.source_name

	        return result

	    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
	        """Parse horoscope content and extract structured data.

	        This is a generic implementation; specific horoscope sites will
	        need custom implementations of the ``_extract_*`` helpers.

	        Args:
	            html_content: Raw HTML of the page.
	            text_content: Plain-text rendering of the page, used as a
	                fallback source for the prediction.
	            url: URL the content was fetched from.

	        Returns:
	            A dict with keys ``"type"``, ``"prediction"``, ``"date"`` and
	            ``"source"``. If anything raises while parsing, the error is
	            logged and ``{"type": "horoscope", "error_parsing": <message>}``
	            is returned instead.
	        """
	        try:
	            soup = BeautifulSoup(html_content, 'html.parser')

	            return {
	                "type": "horoscope",
	                "prediction": self._extract_prediction(soup, text_content),
	                "date": self._extract_date(soup, url),
	                "source": self._extract_domain(url),
	            }
	        except Exception as e:
	            # Parsing boundary: report the failure in-band instead of raising.
	            logger.error("Error parsing horoscope content: %s", e)
	            return {"type": "horoscope", "error_parsing": str(e)}

	    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
	        """Format the URL for a sign's horoscope page.

	        Generic implementation: appends the lowercased sign to the base URL.
	        ``date_str`` is accepted but not used here. Should be overridden in
	        specific scrapers.
	        """
	        return f"{base_url.rstrip('/')}/{sign.lower()}"

	    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
	        """Extract the horoscope prediction text.

	        Generic implementation - should be overridden in specific scrapers.

	        Args:
	            soup: Parsed HTML of the page.
	            text_content: Plain-text rendering of the page.

	        Returns:
	            The stripped text of the first non-empty element matching the
	            common prediction selectors; failing that, the first non-blank
	            paragraph of ``text_content`` (paragraphs are separated by blank
	            lines); failing that, ``"No prediction available"``.
	        """
	        # Look for common horoscope content containers. Skip empty matches
	        # so a blank leading node does not hide a later, populated one.
	        for container in soup.select(self._PREDICTION_SELECTORS):
	            container_text = container.get_text().strip()
	            if container_text:
	                return container_text

	        # Fall back to the first paragraph that actually has content; a
	        # leading blank paragraph must not be returned as the prediction.
	        if text_content:
	            for paragraph in text_content.split('\n\n'):
	                paragraph_text = paragraph.strip()
	                if paragraph_text:
	                    return paragraph_text

	        return "No prediction available"

	    def _parse_date_text(self, text: Any) -> Optional[str]:
	        """Return ``text`` as a YYYY-MM-DD string, or None if it is not a date.

	        Tries each format in ``_DATE_FORMATS`` against the stripped text,
	        then falls back to a leading YYYY-MM-DD in a longer timestamp.
	        """
	        if not isinstance(text, str):
	            return None
	        candidate = text.strip()
	        if not candidate:
	            return None

	        for fmt in self._DATE_FORMATS:
	            try:
	                return datetime.strptime(candidate, fmt).strftime('%Y-%m-%d')
	            except ValueError:
	                continue

	        # Timestamps such as "2024-05-01T08:00:00Z": keep only the date part.
	        iso_prefix = self._ISO_DATE_PREFIX_RE.match(candidate)
	        if iso_prefix:
	            try:
	                datetime.strptime(iso_prefix.group(1), '%Y-%m-%d')
	            except ValueError:
	                return None
	            return iso_prefix.group(1)

	        return None

	    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
	        """Extract the horoscope date.

	        Generic implementation - should be overridden in specific scrapers.

	        Args:
	            soup: Parsed HTML of the page.
	            url: URL the content was fetched from.

	        Returns:
	            A YYYY-MM-DD string taken, in order of preference, from the
	            first valid calendar date embedded in the URL, the first date
	            element whose text or ``datetime`` attribute parses, or
	            today's date when nothing usable is found.
	        """
	        # Look for a date in the URL. The pattern alone would accept
	        # impossible dates such as 2024-99-99, so validate each candidate.
	        for url_match in self._URL_DATE_RE.finditer(url):
	            url_date = url_match.group(1)
	            try:
	                datetime.strptime(url_date, '%Y-%m-%d')
	            except ValueError:
	                continue
	            return url_date

	        # Look for a date in common elements. The visible text is tried
	        # first, then the machine-readable ``datetime`` attribute.
	        for element in soup.select(self._DATE_SELECTORS):
	            for raw_value in (element.get_text(), element.get('datetime')):
	                parsed = self._parse_date_text(raw_value)
	                if parsed:
	                    return parsed

	        # Default to today's date if no date found
	        return date.today().isoformat()

	    def _extract_domain(self, url: str) -> str:
	        """Extract the domain (network location) from a URL.

	        Returns:
	            The ``netloc`` component of ``url``, or ``"Unknown Source"`` when
	            the URL cannot be parsed or has no network location (for
	            example, a URL without a scheme).
	        """
	        from urllib.parse import urlparse

	        try:
	            netloc = urlparse(url).netloc
	        except (ValueError, TypeError, AttributeError):
	            return "Unknown Source"
	        # An empty netloc carries no source information; use the fallback.
	        return netloc or "Unknown Source"