import logging
import re
from typing import Dict, Any
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)
class NewsScraper(BaseScraper):
    """Scraper for news websites."""

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse news content and extract structured data."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract structured metadata from the parsed document
            result = {
                "type": "news",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
            return result
        except Exception as e:
            logger.error(f"Error parsing news content: {e}")
            return {"type": "news", "error_parsing": str(e)}
    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the article title."""
        title = None

        # Method 1: Look for <h1> tags
        h1_tags = soup.find_all('h1')
        if h1_tags:
            title = h1_tags[0].get_text().strip()

        # Method 2: Look for the Open Graph title meta tag
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()

        # Method 3: Fall back to the document <title>
        if not title:
            title_tag = soup.find('title')
            if title_tag:
                title = title_tag.get_text().strip()

        return title or "Unknown Title"
    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date."""
        date = None

        # Method 1: Look for the article:published_time meta tag
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content']

        # Method 2: Look for <time> tags
        if not date:
            time_tag = soup.find('time')
            if time_tag and time_tag.get('datetime'):
                date = time_tag['datetime']
            elif time_tag:
                date = time_tag.get_text().strip()

        # Method 3: Look for dates in common class names
        if not date:
            date_classes = ['date', 'article-date', 'publish-date', 'timestamp']
            for class_name in date_classes:
                date_element = soup.find(class_=re.compile(class_name, re.I))
                if date_element:
                    date = date_element.get_text().strip()
                    break

        return date or "Unknown Date"
    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information."""
        author = None

        # Method 1: Look for the article:author meta tag
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content']

        # Method 2: Look for authors in common class names
        if not author:
            author_classes = ['author', 'byline', 'writer']
            for class_name in author_classes:
                author_element = soup.find(class_=re.compile(class_name, re.I))
                if author_element:
                    author = author_element.get_text().strip()
                    break

        # Method 3: Look for a rel="author" link
        if not author:
            author_link = soup.find('a', rel='author')
            if author_link:
                author = author_link.get_text().strip()

        return author or "Unknown Author"
    def _extract_summary(self, text_content: str) -> str:
        """Extract or create a summary from the article text."""
        if not text_content:
            return "No summary available"

        # Take the first few sentences (up to 500 characters)
        sentences = text_content.split('.')
        summary = '.'.join(sentences[:3])
        if len(summary) > 500:
            summary = summary[:497] + "..."
        return summary
    def _extract_domain(self, url: str) -> str:
        """Extract the domain from a URL."""
        try:
            parsed_url = urlparse(url)
            return parsed_url.netloc
        except Exception:
            return "Unknown Source"