import logging
import re
from typing import Dict, Any, List, Optional
from datetime import datetime, date

from bs4 import BeautifulSoup
import requests

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)


class HoroscopeScraper(BaseScraper):
    """Scraper for horoscope websites"""

    # List of valid zodiac signs
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer", "leo", "virgo",
        "libra", "scorpio", "sagittarius", "capricorn", "aquarius", "pisces"
    ]

    def __init__(self, timeout: int = 30):
        super().__init__(timeout)
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data for each sign
        """
        results = []

        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
                if horoscope_data and horoscope_data.get('success', False):
                    results.append(horoscope_data)
            except Exception as e:
                logger.error(f"Error scraping {sign} horoscope: {str(e)}")

        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign (lowercase)
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data
        """
        # Validate sign
        if sign.lower() not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}

        # Format the URL for the specific sign
        url = self._format_url(base_url, sign, date_str)

        # Use the base scraper method to get the content
        result = self.scrape(url)

        # Add additional horoscope-specific metadata
        result["sign"] = sign.lower()
        result["scraped_date"] = date_str if date_str else date.today().isoformat()
        result["source_name"] = self.source_name

        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse horoscope content and extract structured data"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract metadata - this is a generic implementation
            # Specific horoscope sites will need custom implementations
            result = {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }

            return result
        except Exception as e:
            logger.error(f"Error parsing horoscope content: {str(e)}")
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format URL for horoscope site.
        This is a generic implementation. Should be overridden in specific scrapers.
        """
        # Default implementation just appends the sign to the base URL
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract horoscope prediction text.
        Generic implementation - should be overridden in specific scrapers.
        """
        # Default implementation just returns the first paragraph or the text content
        prediction = ""

        # Look for common horoscope content containers
        containers = soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p')
        if containers:
            prediction = containers[0].get_text().strip()

        # If no prediction was found, use the first few paragraphs from text content
        if not prediction and text_content:
            paragraphs = text_content.split('\n\n')
            prediction = paragraphs[0] if paragraphs else text_content[:500]

        return prediction or "No prediction available"

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract horoscope date.
        Generic implementation - should be overridden in specific scrapers.
        """
        # Look for date in URL
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', url)
        if date_match:
            return date_match.group(1)

        # Look for date in common elements
        date_elements = soup.select('.horoscope-date, .date, time')
        if date_elements:
            date_text = date_elements[0].get_text().strip()

            # Try to parse various date formats
            try:
                # Try common formats
                for fmt in ['%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y']:
                    try:
                        parsed_date = datetime.strptime(date_text, fmt)
                        return parsed_date.strftime('%Y-%m-%d')
                    except ValueError:
                        continue
            except Exception:
                pass

        # Default to today's date if no date found
        return date.today().isoformat()

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            from urllib.parse import urlparse
            parsed_url = urlparse(url)
            return parsed_url.netloc
        except Exception:
            return "Unknown Source"