import logging
import re
from typing import Dict, Any
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)


class NewsScraper(BaseScraper):
    """Scraper for news websites"""

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse news content and extract structured data"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Try to extract metadata
            result = {
                "type": "news",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
            return result
        except Exception as e:
            logger.error(f"Error parsing news content: {e}")
            return {"type": "news", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract title from article"""
        # Try different methods to find the title
        title = None
        # Method 1: Look for <h1> tags
        h1_tags = soup.find_all('h1')
        if h1_tags:
            title = h1_tags[0].get_text().strip()
        # Method 2: Look for article titles in meta tags
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()
        # Method 3: Use the document title
        if not title:
            title_tag = soup.find('title')
            if title_tag:
                title = title_tag.get_text().strip()
        return title or "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract publication date"""
        # Try various methods to find the date
        date = None
        # Method 1: Look for common date meta tags
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content']
        # Method 2: Look for <time> tags
        if not date:
            time_tag = soup.find('time')
            if time_tag and time_tag.get('datetime'):
                date = time_tag['datetime']
            elif time_tag:
                date = time_tag.get_text().strip()
        # Method 3: Look for date in common class names
        if not date:
            date_classes = ['date', 'article-date', 'publish-date', 'timestamp']
            for class_name in date_classes:
                date_element = soup.find(class_=re.compile(class_name, re.I))
                if date_element:
                    date = date_element.get_text().strip()
                    break
        return date or "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information"""
        # Try various methods to find the author
        author = None
        # Method 1: Look for author meta tags
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content']
        # Method 2: Look for author in common class names
        if not author:
            author_classes = ['author', 'byline', 'writer']
            for class_name in author_classes:
                author_element = soup.find(class_=re.compile(class_name, re.I))
                if author_element:
                    author = author_element.get_text().strip()
                    break
        # Method 3: Look for a rel="author" link
        if not author:
            author_link = soup.find('a', rel='author')
            if author_link:
                author = author_link.get_text().strip()
        return author or "Unknown Author"

    def _extract_summary(self, text_content: str) -> str:
        """Extract or create a summary from the article text"""
        if not text_content:
            return "No summary available"
        # Take the first few sentences (up to 500 chars)
        sentences = text_content.split('.')
        summary = '.'.join(sentences[:3])
        if len(summary) > 500:
            summary = summary[:497] + "..."
        return summary

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            parsed_url = urlparse(url)
            return parsed_url.netloc
        except Exception:
            return "Unknown Source"