# MT564AITraining/scrapers/blog_scraper.py
# Provenance: commit 2c72e40 ("Add full project source files for MT564 AI") by pareshmishra
import logging
import re
from typing import Any, Dict, List
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper
logger = logging.getLogger(__name__)
class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Turns a rendered blog page into a structured dict of common metadata
    (title, publish date, author, categories, tags, summary, source domain).
    Every extractor is best-effort: it tries several common blog markup
    conventions and falls back to an "Unknown ..." placeholder.
    """

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page (used for the summary).
            url: URL the page was fetched from (used for the source domain).

        Returns:
            Dict of extracted fields, or a minimal ``{"type", "error_parsing"}``
            dict if extraction raises.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Boundary handler: one malformed page must not abort a scrape run.
            logger.error(f"Error parsing blog content: {str(e)}")
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title, trying progressively broader strategies.

        An empty/whitespace-only candidate is treated as "not found" so the
        next strategy still gets a chance (the final ``or`` handles that).
        """
        title = None
        # Method 1: <h1> inside a post/entry/article container.
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                title = h1.get_text().strip()
        # Method 2: first <h1> anywhere on the page.
        if not title:
            h1 = soup.find('h1')
            if h1:
                title = h1.get_text().strip()
        # Method 3: Open Graph title meta tag.
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()
        return title or "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as whatever string the page provides.

        NOTE(review): the value is returned verbatim (ISO timestamp from the
        meta tag, or free-form text from a date element) — not normalized.
        """
        date = None
        # Method 1: standard Open Graph / article meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content']
        # Method 2: common blog theme class names, case-insensitive.
        if not date:
            date_classes = ['date', 'post-date', 'entry-date', 'published', 'post-meta']
            for class_name in date_classes:
                date_element = soup.find(class_=re.compile(class_name, re.I))
                if date_element:
                    date = date_element.get_text().strip()
                    break
        return date or "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author name from meta tags or common byline classes."""
        author = None
        # Method 1: article:author meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content']
        # Method 2: common blog theme byline class names, case-insensitive.
        if not author:
            author_classes = ['author', 'byline', 'entry-author', 'post-author']
            for class_name in author_classes:
                author_element = soup.find(class_=re.compile(class_name, re.I))
                if author_element:
                    author = author_element.get_text().strip()
                    break
        return author or "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract post categories as a de-duplicated list (may be empty)."""
        categories: List[str] = []
        # Method 1: links whose class mentions "category".
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)
        # Method 2: article:section meta tag as a fallback.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                categories.append(category_meta['content'].strip())
        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract post tags as a de-duplicated list (may be empty)."""
        tags: List[str] = []
        # Links whose class mentions "tag" (matches "tag", "post-tag", ...).
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Build a summary from the first non-empty paragraph (max 300 chars).

        Fix: the original took ``split('\\n\\n')[0]`` unconditionally, so text
        starting with blank lines produced an empty summary; now blank
        paragraphs are skipped.
        """
        if not text_content:
            return "No summary available"
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > 300:
                    summary = summary[:297] + "..."
                return summary
        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from *url*.

        Returns the fallback both when parsing raises and when the URL has no
        netloc (e.g. a relative path), instead of silently returning "".
        """
        try:
            return urlparse(url).netloc or "Unknown Source"
        except Exception:
            return "Unknown Source"