import logging
import re
from typing import Dict, Any
from bs4 import BeautifulSoup
from .base_scraper import BaseScraper
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class NewsScraper(BaseScraper):
    """Scraper for news websites.

    Extracts a title, publication date, author, short summary and source
    domain from an article page. Each extractor tries several strategies in
    order and falls back to a placeholder string when none yields a value.
    """

    # Regex fragments matched case-insensitively against CSS class names,
    # tried in the order listed.
    _DATE_CLASS_PATTERNS = ('date', 'article-date', 'publish-date', 'timestamp')
    _AUTHOR_CLASS_PATTERNS = ('author', 'byline', 'writer')

    # Summary limits: number of leading sentences kept and the hard cap on
    # the summary length (including the trailing "..." when truncated).
    _SUMMARY_SENTENCES = 3
    _SUMMARY_MAX_CHARS = 500

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse news content and extract structured data.

        Args:
            html_content: Raw HTML of the page, parsed with ``html.parser``.
            text_content: Plain text of the article, used for the summary.
            url: Page URL, used to derive the source domain.

        Returns:
            A dict with the keys ``type``, ``title``, ``publish_date``,
            ``author``, ``summary`` and ``source``. If anything raises while
            parsing, a dict with ``type`` and ``error_parsing`` (the
            exception message) is returned instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "news",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Deliberate best-effort boundary: report the failure in the
            # result instead of propagating it. logger.exception keeps the
            # traceback alongside the message.
            logger.exception("Error parsing news content: %s", e)
            return {"type": "news", "error_parsing": str(e)}

    def _first_text_by_class(self, soup: BeautifulSoup, class_patterns) -> str:
        """Return the stripped text of the first element matching a class pattern.

        Patterns are tried in order; for each one, the first element whose
        CSS class matches it (case-insensitive regex search) is inspected.
        Elements with empty text are skipped so a later pattern can still
        supply a value. Returns ``""`` when nothing usable is found.
        """
        for pattern in class_patterns:
            element = soup.find(class_=re.compile(pattern, re.I))
            if element:
                text = element.get_text().strip()
                if text:
                    return text
        return ""

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the article title, or "Unknown Title" if none is found."""
        # Method 1: Look for <h1> tags (first one in the document)
        h1_tag = soup.find('h1')
        if h1_tag:
            title = h1_tag.get_text().strip()
            if title:
                return title

        # Method 2: Look for article titles in meta tags
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title['content'].strip()
            if title:
                return title

        # Method 3: Use the document title
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text().strip()
            if title:
                return title

        return "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date string, or "Unknown Date" if none is found.

        The value is returned as found in the page; it is not parsed or
        normalized.
        """
        # Method 1: Look for common date meta tags
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content'].strip()
            if date:
                return date

        # Method 2: Look for time tags, preferring the datetime attribute
        # over the element's visible text
        time_tag = soup.find('time')
        if time_tag:
            date = (time_tag.get('datetime') or '').strip() or time_tag.get_text().strip()
            if date:
                return date

        # Method 3: Look for date in common class names
        return self._first_text_by_class(soup, self._DATE_CLASS_PATTERNS) or "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information, or "Unknown Author" if none is found."""
        # Method 1: Look for author meta tags
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content'].strip()
            if author:
                return author

        # Method 2: Look for author in common class names
        author = self._first_text_by_class(soup, self._AUTHOR_CLASS_PATTERNS)
        if author:
            return author

        # Method 3: Look for rel="author" link
        author_link = soup.find('a', rel='author')
        if author_link:
            author = author_link.get_text().strip()
            if author:
                return author

        return "Unknown Author"

    def _extract_summary(self, text_content: str) -> str:
        """Build a summary from the first sentences of the article text.

        Sentences are delimited naively by ``.``. At most the first three
        are kept, and the result is capped at 500 characters (ending in
        "..." when truncated). Returns "No summary available" for empty or
        whitespace-only text.
        """
        text = text_content.strip() if text_content else ""
        if not text:
            return "No summary available"

        # Split only as far as needed; any remainder lands in the last item.
        parts = text.split('.', self._SUMMARY_SENTENCES)
        summary = '.'.join(parts[:self._SUMMARY_SENTENCES])
        if len(parts) > self._SUMMARY_SENTENCES:
            # The last kept sentence was terminated by a period in the
            # source text; put it back.
            summary += '.'

        if len(summary) > self._SUMMARY_MAX_CHARS:
            summary = summary[:self._SUMMARY_MAX_CHARS - 3] + "..."
        return summary

    def _extract_domain(self, url: str) -> str:
        """Extract the network location (domain) from a URL.

        Returns "Unknown Source" when the URL cannot be parsed or has no
        network location (for example, when the scheme is missing).
        """
        from urllib.parse import urlparse

        try:
            netloc = urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL; TypeError/AttributeError: non-string input.
            return "Unknown Source"
        return netloc or "Unknown Source"