# MT564AITraining/scrapers/blog_scraper.py
# Provenance: commit 2c72e40 ("Add full project source files for MT564 AI") by pareshmishra
import logging
import re
from typing import Any, Dict, List
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper
logger = logging.getLogger(__name__)
class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Turns a rendered blog page into a structured dict of common metadata
    (title, publish date, author, categories, tags, summary, source domain).
    Every extractor is best-effort: it tries several common blog markup
    conventions and falls back to an "Unknown ..." placeholder.
    """

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page (used for the summary).
            url: URL the page was fetched from (used for the source domain).

        Returns:
            Dict of extracted fields, or a minimal ``{"type", "error_parsing"}``
            dict if extraction raises.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Boundary handler: one malformed page must not abort a scrape run.
            logger.error(f"Error parsing blog content: {str(e)}")
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title, trying progressively broader strategies.

        An empty/whitespace-only candidate is treated as "not found" so the
        next strategy still gets a chance (the final ``or`` handles that).
        """
        title = None
        # Method 1: <h1> inside a post/entry/article container.
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                title = h1.get_text().strip()
        # Method 2: first <h1> anywhere on the page.
        if not title:
            h1 = soup.find('h1')
            if h1:
                title = h1.get_text().strip()
        # Method 3: Open Graph title meta tag.
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()
        return title or "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as whatever string the page provides.

        NOTE(review): the value is returned verbatim (ISO timestamp from the
        meta tag, or free-form text from a date element) — not normalized.
        """
        date = None
        # Method 1: standard Open Graph / article meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content']
        # Method 2: common blog theme class names, case-insensitive.
        if not date:
            date_classes = ['date', 'post-date', 'entry-date', 'published', 'post-meta']
            for class_name in date_classes:
                date_element = soup.find(class_=re.compile(class_name, re.I))
                if date_element:
                    date = date_element.get_text().strip()
                    break
        return date or "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author name from meta tags or common byline classes."""
        author = None
        # Method 1: article:author meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content']
        # Method 2: common blog theme byline class names, case-insensitive.
        if not author:
            author_classes = ['author', 'byline', 'entry-author', 'post-author']
            for class_name in author_classes:
                author_element = soup.find(class_=re.compile(class_name, re.I))
                if author_element:
                    author = author_element.get_text().strip()
                    break
        return author or "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract post categories as a de-duplicated list (may be empty)."""
        categories: List[str] = []
        # Method 1: links whose class mentions "category".
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)
        # Method 2: article:section meta tag as a fallback.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                categories.append(category_meta['content'].strip())
        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract post tags as a de-duplicated list (may be empty)."""
        tags: List[str] = []
        # Links whose class mentions "tag" (matches "tag", "post-tag", ...).
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Build a summary from the first non-empty paragraph (max 300 chars).

        Fix: the original took ``split('\\n\\n')[0]`` unconditionally, so text
        starting with blank lines produced an empty summary; now blank
        paragraphs are skipped.
        """
        if not text_content:
            return "No summary available"
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > 300:
                    summary = summary[:297] + "..."
                return summary
        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from *url*.

        Returns the fallback both when parsing raises and when the URL has no
        netloc (e.g. a relative path), instead of silently returning "".
        """
        try:
            return urlparse(url).netloc or "Unknown Source"
        except Exception:
            return "Unknown Source"