# MT564AITraining/scrapers/horoscope_scraper.py
# pareshmishra
# Add full project source files for MT564 AI
# 2c72e40
import logging
import re
from typing import Dict, Any, List, Optional
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
from .base_scraper import BaseScraper
# Module-level logger named after this module; the scraper class below uses it
# to report scraping and parsing errors.
logger = logging.getLogger(__name__)
class HoroscopeScraper(BaseScraper):
    """Generic scraper for horoscope websites.

    The extraction helpers (`_format_url`, `_extract_prediction`,
    `_extract_date`) are deliberately generic and are meant to be overridden
    by site-specific subclasses.

    NOTE(review): fetching is delegated to `BaseScraper.scrape`, which is not
    visible in this file; it is assumed to return a mutable dict carrying a
    `success` flag -- confirm against `base_scraper.py`.
    """

    # Valid zodiac signs, all lowercase; also the iteration order used by
    # `scrape_all_signs`.
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer",
        "leo", "virgo", "libra", "scorpio",
        "sagittarius", "capricorn", "aquarius", "pisces",
    ]

    # ISO-style date (YYYY-MM-DD) embedded in a URL or attribute value.
    _ISO_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')

    # Formats tried, in order, when parsing a date string. '%m/%d/%Y' is
    # tried before '%d/%m/%Y', so ambiguous values such as 03/04/2024 are
    # read month-first.
    _DATE_FORMATS = ('%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y')

    def __init__(self, timeout: int = 30):
        """
        Initialise the scraper.

        Args:
            timeout: Value forwarded unchanged to `BaseScraper.__init__`.
        """
        super().__init__(timeout)
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs

        A failure for one sign is logged and skipped so the remaining signs
        are still attempted.

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data for each sign whose
            result has a truthy 'success' entry
        """
        results = []
        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
            except Exception as e:
                # Best-effort boundary: one bad sign must not abort the batch.
                logger.error("Error scraping %s horoscope: %s", sign, e)
                continue
            if horoscope_data and horoscope_data.get('success', False):
                results.append(horoscope_data)
        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign (case-insensitive; surrounding whitespace is ignored)
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data. For an unknown (or non-string)
            sign this is {"success": False, "error": ...} and no request is
            made. Otherwise it is the dict returned by `self.scrape`, extended
            with 'sign', 'scraped_date' and 'source_name'.
        """
        # Validate sign; a non-string is reported like any other invalid sign
        # instead of raising AttributeError on `.lower()`.
        normalized = sign.strip().lower() if isinstance(sign, str) else None
        if normalized not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

        # Format the URL for the specific sign
        url = self._format_url(base_url, normalized, date_str)

        # Use the base scraper method to get the content
        result = self.scrape(url)

        # Add additional horoscope-specific metadata
        result["sign"] = normalized
        result["scraped_date"] = date_str or date.today().isoformat()
        result["source_name"] = self.source_name
        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """
        Parse horoscope content and extract structured data

        Args:
            html_content: Raw HTML of the page
            text_content: Plain-text rendering of the page, used as a fallback
                source for the prediction
            url: URL the content came from

        Returns:
            Dictionary with 'type', 'prediction', 'date' and 'source'. If
            anything raises during parsing, a dictionary with 'type' and
            'error_parsing' (the exception message) is returned instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Extract metadata - this is a generic implementation
            # Specific horoscope sites will need custom implementations
            return {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Parsing is best-effort: report the problem in the result rather
            # than propagating it to the caller.
            logger.error("Error parsing horoscope content: %s", e)
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format URL for horoscope site. This is a generic implementation.
        Should be overridden in specific scrapers.

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign; lowercased before use
            date_str: Accepted for subclasses; ignored by this implementation

        Returns:
            `base_url` without trailing slashes, followed by '/<sign>'
        """
        # Default implementation just appends the sign to the base URL
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract horoscope prediction text.
        Generic implementation - should be overridden in specific scrapers.

        Args:
            soup: Parsed page
            text_content: Plain-text rendering of the page (may be empty)

        Returns:
            Stripped text of the first non-empty element matching the common
            prediction selectors; otherwise the first non-empty paragraph of
            `text_content`; otherwise "No prediction available".
        """
        # Look for common horoscope content containers, skipping empty ones so
        # an empty wrapper does not hide a later element that has the text.
        for container in soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p'):
            container_text = container.get_text().strip()
            if container_text:
                return container_text

        # If no prediction was found, use the first non-blank paragraph of the
        # text content (leading blank paragraphs are skipped).
        if text_content:
            for paragraph in text_content.split('\n\n'):
                paragraph_text = paragraph.strip()
                if paragraph_text:
                    return paragraph_text

        return "No prediction available"

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract horoscope date.
        Generic implementation - should be overridden in specific scrapers.

        Args:
            soup: Parsed page
            url: URL the page was fetched from

        Returns:
            Date as YYYY-MM-DD, taken from (in order) the first valid
            YYYY-MM-DD in the URL, the first date element on the page that
            yields a parseable date, or today's date.
        """
        # Look for date in URL; digit groups that are not a real calendar
        # date (e.g. 2024-13-45) are skipped.
        for url_match in self._ISO_DATE_RE.finditer(url):
            iso_date = self._parse_date_text(url_match.group(1))
            if iso_date:
                return iso_date

        # Look for date in common elements
        for element in soup.select('.horoscope-date, .date, time'):
            # A machine-readable `datetime` attribute, when present, is more
            # reliable than the visible text.
            attr_value = element.get('datetime')
            if isinstance(attr_value, str):
                attr_match = self._ISO_DATE_RE.search(attr_value)
                if attr_match:
                    iso_date = self._parse_date_text(attr_match.group(1))
                    if iso_date:
                        return iso_date

            iso_date = self._parse_date_text(element.get_text())
            if iso_date:
                return iso_date

        # Default to today's date if no date found
        return date.today().isoformat()

    @classmethod
    def _parse_date_text(cls, date_text: str) -> Optional[str]:
        """
        Parse a date string against the known formats.

        Args:
            date_text: Candidate date string; surrounding whitespace is ignored

        Returns:
            The date as YYYY-MM-DD, or None if no format in `_DATE_FORMATS`
            matches.
        """
        cleaned = date_text.strip()
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(cleaned, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return None

    def _extract_domain(self, url: str) -> str:
        """
        Extract domain from URL

        Args:
            url: URL to inspect

        Returns:
            The network-location part of the URL, or "Unknown Source" when
            the URL cannot be parsed or has no network location (for example
            a URL without a scheme).
        """
        from urllib.parse import urlparse

        try:
            netloc = urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            return "Unknown Source"
        return netloc or "Unknown Source"