import logging
import re
from typing import Any, Dict, List
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)


class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Extracts structured metadata (title, date, author, categories, tags,
    summary, source domain) from a blog post's HTML and plain text.
    """

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page body.
            url: URL the page was fetched from.

        Returns:
            A dict with keys type/title/publish_date/author/categories/tags/
            summary/source, or ``{"type": "blog", "error_parsing": ...}`` when
            parsing fails.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Boundary handler: parse failures are reported, never propagated.
            # logger.exception records the traceback; lazy %s args avoid
            # formatting when the log level is disabled.
            logger.exception("Error parsing blog content: %s", e)
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title, trying several strategies in order."""
        title = None

        # Method 1: an <h1> inside a post/entry/article container is the most
        # reliable signal of the actual post title.
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                title = h1.get_text().strip()

        # Method 2: fall back to the first <h1> anywhere on the page.
        if not title:
            h1 = soup.find('h1')
            if h1:
                title = h1.get_text().strip()

        # Method 3: fall back to the Open Graph title meta tag.
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()

        return title or "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date from meta tags or common date classes."""
        # Method 1: the standard article:published_time meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            return date_meta['content']

        # Method 2: first element carrying a common blog date class.
        for class_name in ('date', 'post-date', 'entry-date', 'published', 'post-meta'):
            date_element = soup.find(class_=re.compile(class_name, re.I))
            if date_element:
                # Stop at the first matching element even if its text is empty,
                # matching the fallback below.
                return date_element.get_text().strip() or "Unknown Date"

        return "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author from meta tags or common byline classes."""
        # Method 1: the article:author meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            return author_meta['content']

        # Method 2: first element carrying a common author/byline class.
        for class_name in ('author', 'byline', 'entry-author', 'post-author'):
            author_element = soup.find(class_=re.compile(class_name, re.I))
            if author_element:
                return author_element.get_text().strip() or "Unknown Author"

        return "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract category names, preserving order and dropping duplicates."""
        categories: List[str] = []

        # Method 1: anchor tags styled as category links.
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)

        # Method 2: fall back to the article:section meta tag.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                categories.append(category_meta['content'].strip())

        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract tag names from tag links, preserving order, no duplicates."""
        tags: List[str] = []
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Return the first non-empty paragraph, truncated to 300 characters.

        Fix: the original took ``split('\\n\\n')[0]`` unconditionally, so text
        beginning with a blank line produced an empty summary; we now skip
        empty paragraphs and fall back to the placeholder.
        """
        if not text_content:
            return "No summary available"

        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > 300:
                    summary = summary[:297] + "..."
                return summary

        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from a URL.

        Fix: ``urlparse`` yields an empty netloc for relative or malformed
        URLs; return the "Unknown Source" placeholder in that case instead of
        an empty string.
        """
        try:
            return urlparse(url).netloc or "Unknown Source"
        except Exception:
            return "Unknown Source"