import logging
import re
from typing import Dict, Any
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)


class NewsScraper(BaseScraper):
    """Scraper for news websites"""

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse news content and extract structured data"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Try to extract metadata
            result = {
                "type": "news",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
            return result
        except Exception as e:
            logger.error(f"Error parsing news content: {e}")
            return {"type": "news", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract title from article"""
        # Try different methods to find the title
        title = None
        # Method 1: Look for <h1> tags
        h1_tags = soup.find_all('h1')
        if h1_tags:
            title = h1_tags[0].get_text().strip()
        # Method 2: Look for article titles in meta tags
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()
        # Method 3: Use the document title
        if not title:
            title_tag = soup.find('title')
            if title_tag:
                title = title_tag.get_text().strip()
        return title or "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract publication date"""
        # Try various methods to find the date
        date = None
        # Method 1: Look for common date meta tags
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content']
        # Method 2: Look for <time> tags
        if not date:
            time_tag = soup.find('time')
            if time_tag and time_tag.get('datetime'):
                date = time_tag['datetime']
            elif time_tag:
                date = time_tag.get_text().strip()
        # Method 3: Look for date in common class names
        if not date:
            date_classes = ['date', 'article-date', 'publish-date', 'timestamp']
            for class_name in date_classes:
                date_element = soup.find(class_=re.compile(class_name, re.I))
                if date_element:
                    date = date_element.get_text().strip()
                    break
        return date or "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information"""
        # Try various methods to find the author
        author = None
        # Method 1: Look for author meta tags
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content']
        # Method 2: Look for author in common class names
        if not author:
            author_classes = ['author', 'byline', 'writer']
            for class_name in author_classes:
                author_element = soup.find(class_=re.compile(class_name, re.I))
                if author_element:
                    author = author_element.get_text().strip()
                    break
        # Method 3: Look for a rel="author" link
        if not author:
            author_link = soup.find('a', rel='author')
            if author_link:
                author = author_link.get_text().strip()
        return author or "Unknown Author"

    def _extract_summary(self, text_content: str) -> str:
        """Extract or create a summary from the article text"""
        if not text_content:
            return "No summary available"
        # Take the first few sentences (up to 500 chars)
        sentences = text_content.split('.')
        summary = '.'.join(sentences[:3])
        if len(summary) > 500:
            summary = summary[:497] + "..."
        return summary

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            parsed_url = urlparse(url)
            return parsed_url.netloc
        except Exception:
            return "Unknown Source"