File size: 4,871 Bytes
2c72e40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import logging
import re
from typing import Dict, Any
from bs4 import BeautifulSoup
from .base_scraper import BaseScraper

# Module-level logger, named after this module; used to report parsing failures.
logger = logging.getLogger(__name__)

class NewsScraper(BaseScraper):
    """Scraper for news websites.

    Turns an article page into a flat dictionary of metadata (title,
    publish date, author, summary, source domain). Every field is extracted
    through a chain of fallbacks and degrades to a placeholder string such
    as ``"Unknown Title"`` when nothing usable is found.
    """

    # CSS class fragments, tried in order. Matching is a case-insensitive
    # substring search, so the generic "date" would also hit "article-date"
    # and "publish-date"; the specific fragments therefore have to come first
    # or they could never be preferred over e.g. class="updated".
    _DATE_CLASS_PATTERNS = tuple(
        re.compile(name, re.I)
        for name in ('publish-date', 'article-date', 'date', 'timestamp')
    )
    _AUTHOR_CLASS_PATTERNS = tuple(
        re.compile(name, re.I) for name in ('author', 'byline', 'writer')
    )

    # Summary limits: number of '.'-delimited sentences and maximum length.
    _SUMMARY_SENTENCES = 3
    _SUMMARY_MAX_CHARS = 500

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse news content and extract structured data.

        Args:
            html_content: Raw HTML of the article page.
            text_content: Plain text of the article, used for the summary.
            url: Address the page was fetched from, used for the source.

        Returns:
            A dict with the keys ``type``, ``title``, ``publish_date``,
            ``author``, ``summary`` and ``source``. If parsing fails, a dict
            with ``type`` and ``error_parsing`` (the error message) instead.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "news",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Best effort: a failure is reported in the result instead of
            # being propagated. logger.exception keeps the traceback.
            logger.exception("Error parsing news content: %s", e)
            return {"type": "news", "error_parsing": str(e)}

    @staticmethod
    def _meta_content(soup: BeautifulSoup, property_name: str):
        """Return the stripped ``content`` of ``<meta property=...>``, or None."""
        tag = soup.find('meta', property=property_name)
        if tag is None:
            return None
        content = tag.get('content')
        return content.strip() if content else None

    @staticmethod
    def _first_text_by_class(soup: BeautifulSoup, patterns):
        """Return the first non-empty text of an element matching a class pattern.

        Patterns are tried in order; an element whose text is empty does not
        stop the search. Returns None when nothing matches.
        """
        for pattern in patterns:
            element = soup.find(class_=pattern)
            if element is not None:
                text = element.get_text().strip()
                if text:
                    return text
        return None

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract title from article.

        Tries, in order: the first ``<h1>``, the ``og:title`` meta tag, the
        document ``<title>``. Returns ``"Unknown Title"`` if all are empty.
        """
        heading = soup.find('h1')
        title = heading.get_text().strip() if heading is not None else None

        if not title:
            title = self._meta_content(soup, 'og:title')

        if not title:
            title_tag = soup.find('title')
            if title_tag is not None:
                title = title_tag.get_text().strip()

        return title or "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract publication date.

        Tries, in order: the ``article:published_time`` meta tag, the first
        ``<time>`` element (its ``datetime`` attribute, else its text), then
        elements whose CSS class looks date-related. The value is returned
        as found in the page (no date parsing); ``"Unknown Date"`` if none.
        """
        date = self._meta_content(soup, 'article:published_time')

        if not date:
            time_tag = soup.find('time')
            if time_tag is not None:
                date = ((time_tag.get('datetime') or '').strip()
                        or time_tag.get_text().strip())

        if not date:
            date = self._first_text_by_class(soup, self._DATE_CLASS_PATTERNS)

        return date or "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information.

        Tries, in order: the ``article:author`` meta tag, elements whose CSS
        class looks author-related, then an ``<a rel="author">`` link.
        Returns ``"Unknown Author"`` if none yields text.
        """
        author = self._meta_content(soup, 'article:author')

        if not author:
            author = self._first_text_by_class(soup, self._AUTHOR_CLASS_PATTERNS)

        if not author:
            author_link = soup.find('a', rel='author')
            if author_link is not None:
                author = author_link.get_text().strip()

        return author or "Unknown Author"

    def _extract_summary(self, text_content: str) -> str:
        """Extract or create a summary from the article text.

        The summary is the first three '.'-delimited sentences, capped at
        500 characters (truncated with a trailing ``"..."``). Splitting on
        '.' is a heuristic: decimals and abbreviations count as sentence ends.

        Returns ``"No summary available"`` for empty or whitespace-only text.
        """
        text = text_content.strip() if text_content else ""
        if not text:
            return "No summary available"

        # maxsplit avoids splitting the whole article just to keep 3 pieces.
        pieces = text.split('.', self._SUMMARY_SENTENCES)
        summary = '.'.join(pieces[:self._SUMMARY_SENTENCES])
        if len(pieces) > self._SUMMARY_SENTENCES:
            # The last kept sentence really ended with a period that the
            # split consumed; put it back.
            summary += '.'

        if len(summary) > self._SUMMARY_MAX_CHARS:
            summary = summary[:self._SUMMARY_MAX_CHARS - 3] + "..."

        return summary

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL.

        Returns the network location part of ``url`` (e.g. ``"example.com"``
        for ``"https://example.com/a"``), or ``"Unknown Source"`` when the
        URL cannot be parsed or has no network location.
        """
        from urllib.parse import urlparse

        try:
            netloc = urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL; the others: non-string input.
            return "Unknown Source"
        # urlparse yields an empty netloc (rather than raising) for inputs
        # such as "" or "example.com/path", so check for that explicitly.
        return netloc or "Unknown Source"