# -*- coding: utf-8 -*-
import logging | |
import re | |
from typing import Dict, Any, List, Optional | |
from datetime import datetime, date | |
from bs4 import BeautifulSoup | |
import requests | |
from .base_scraper import BaseScraper | |
logger = logging.getLogger(__name__) | |
class HoroscopeScraper(BaseScraper):
    """Scraper for horoscope websites.

    Generic implementation: site-specific scrapers are expected to subclass
    this and override ``_format_url``, ``_extract_prediction`` and
    ``_extract_date`` to match their site's URL scheme and markup.
    """

    # Valid zodiac signs (lowercase), in the order used by scrape_all_signs.
    ZODIAC_SIGNS = [
        "aries", "taurus", "gemini", "cancer",
        "leo", "virgo", "libra", "scorpio",
        "sagittarius", "capricorn", "aquarius", "pisces",
    ]

    # Matches an ISO YYYY-MM-DD date embedded in a URL; compiled once.
    _URL_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')

    def __init__(self, timeout: int = 30):
        """
        Args:
            timeout: Request timeout in seconds, forwarded to BaseScraper.
        """
        super().__init__(timeout)
        self.source_name = "Generic Horoscope Site"

    def scrape_all_signs(self, base_url: str, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Scrape horoscopes for all zodiac signs.

        Signs that raise or whose result is not marked successful are logged
        and skipped, so one broken sign does not abort the whole run.

        Args:
            base_url: Base URL for the horoscope site
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            List of dictionaries with horoscope data for each successfully
            scraped sign.
        """
        results = []
        for sign in self.ZODIAC_SIGNS:
            try:
                horoscope_data = self.scrape_sign(base_url, sign, date_str)
                if horoscope_data and horoscope_data.get('success', False):
                    results.append(horoscope_data)
            except Exception as e:
                # Best-effort batch: log and continue with the next sign.
                logger.error(f"Error scraping {sign} horoscope: {str(e)}")
        return results

    def scrape_sign(self, base_url: str, sign: str, date_str: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape horoscope for a specific zodiac sign.

        Args:
            base_url: Base URL for the horoscope site
            sign: Zodiac sign (case-insensitive)
            date_str: Optional date string in format YYYY-MM-DD

        Returns:
            Dictionary with horoscope data; on an invalid sign the dict has
            ``success: False`` and an ``error`` message instead.
        """
        # Normalize once; all downstream uses expect lowercase.
        sign = sign.lower()
        if sign not in self.ZODIAC_SIGNS:
            return {"success": False, "error": f"Invalid zodiac sign: {sign}"}
        # Build the sign-specific URL and delegate fetching to BaseScraper.
        url = self._format_url(base_url, sign, date_str)
        result = self.scrape(url)
        # Attach horoscope-specific metadata to whatever the base returned.
        result["sign"] = sign
        result["scraped_date"] = date_str if date_str else date.today().isoformat()
        result["source_name"] = self.source_name
        return result

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse horoscope content and extract structured data.

        Generic implementation; site-specific subclasses should provide their
        own parsing. Returns a dict with ``error_parsing`` set on failure.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "horoscope",
                "prediction": self._extract_prediction(soup, text_content),
                "date": self._extract_date(soup, url),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            logger.error(f"Error parsing horoscope content: {str(e)}")
            return {"type": "horoscope", "error_parsing": str(e)}

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """
        Format URL for horoscope site. This is a generic implementation.
        Should be overridden in specific scrapers.

        Default behaviour: append the lowercase sign to the base URL
        (``date_str`` is accepted for subclass compatibility but unused here).
        """
        return f"{base_url.rstrip('/')}/{sign.lower()}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """
        Extract horoscope prediction text.
        Generic implementation - should be overridden in specific scrapers.

        Returns the first matching container's text, else the first non-empty
        paragraph of ``text_content``, else a fixed placeholder.
        """
        prediction = ""
        # Look for common horoscope content containers.
        containers = soup.select('.horoscope-content, .prediction, .horoscope-prediction, article p')
        if containers:
            prediction = containers[0].get_text().strip()
        # Fall back to the plain-text content; skip leading blank paragraphs
        # so a leading "\n\n" does not yield an empty prediction.
        if not prediction and text_content:
            paragraphs = [p.strip() for p in text_content.split('\n\n') if p.strip()]
            prediction = paragraphs[0] if paragraphs else text_content[:500]
        return prediction or "No prediction available"

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract horoscope date.
        Generic implementation - should be overridden in specific scrapers.

        Returns an ISO YYYY-MM-DD string, defaulting to today's date.
        """
        # Prefer an ISO date embedded in the URL itself.
        date_match = self._URL_DATE_RE.search(url)
        if date_match:
            return date_match.group(1)
        # Otherwise look for a date in common page elements and try several
        # known formats; only ValueError (a failed parse) is expected here,
        # so no blanket except that could hide real bugs.
        date_elements = soup.select('.horoscope-date, .date, time')
        if date_elements:
            date_text = date_elements[0].get_text().strip()
            for fmt in ['%Y-%m-%d', '%B %d, %Y', '%d %B %Y', '%m/%d/%Y', '%d/%m/%Y']:
                try:
                    return datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
                except ValueError:
                    continue
        # Default to today's date if no date was found.
        return date.today().isoformat()

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL, falling back to "Unknown Source"."""
        try:
            from urllib.parse import urlparse
            # urlparse returns netloc == '' for relative/schemeless URLs;
            # map that to the same fallback as an outright failure.
            return urlparse(url).netloc or "Unknown Source"
        except Exception:
            return "Unknown Source"