"""Web-page article scraper.

Fetches a URL and extracts the main article text, trying Trafilatura
first and falling back to Newspaper3k.  Extracted text is cleaned and
screened by junk-content heuristics before being returned.
"""

import logging
import re
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article

# Configure logging once at import time (behavior kept from the original module).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Browser-like headers so sites that block the default python-requests
# User-Agent still serve the page.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}

# Phrases that typically belong to navigation / footer boilerplate rather
# than article bodies.  Hoisted to module level so the tuple is built once
# instead of on every is_low_quality() call.
_JUNK_MARKERS = (
    "subscribe to our newsletter", "cookie policy", "terms and conditions",
    "privacy statement", "all rights reserved", "contact us", "about us",
    "careers", "sitemap", "advertisement", "sponsored content", "read more",
    "view all", "back to top", "connect with us", "follow us on", "email us",
    "download our app", "footer", "comments policy", "disclaimer",
    "affiliate links", "related posts", "latest updates", "breaking news",
    "trending topics", "more news", "featured stories", "sign up", "login",
    "register", "join us", "newsletter signup", "skip to content",
    "navigation", "main menu", "sidebar", "archive", "categories", "tags",
    "go to top", "licence", "unlimited access", "support us",
    "exclusive content", "follow @", "copyright", "imprint", "impressum",
    "legal notice",
)

# Compiled once: collapse runs of horizontal whitespace within a line,
# and collapse 3+ consecutive newlines down to a single paragraph break.
_HSPACE_RE = re.compile(r"[ \t\f\v\r]+")
_MANY_NEWLINES_RE = re.compile(r"\n{3,}")


def clean_text(text: str) -> str:
    """Strip HTML tags and normalize whitespace while preserving line breaks.

    Paragraph (``<p>``) boundaries become blank lines.  BUGFIX vs. the
    original: the old code collapsed *all* whitespace (including the
    newlines it had just inserted) with ``\\s+`` -> `` ``, which made both
    the paragraph markers and ``is_low_quality``'s line-based heuristic
    dead code.  Newlines are now kept so line structure survives.

    Args:
        text: Raw (possibly HTML) text; may be empty.

    Returns:
        Cleaned plain text with paragraph breaks, or ``""`` for empty input.
    """
    if not text:
        return ""
    soup = BeautifulSoup(text, "html.parser")
    # Turn paragraph ends into blank lines so structure survives get_text().
    for p in soup.find_all("p"):
        p.append("\n\n")
    raw = soup.get_text(separator=" ")
    # Normalize whitespace per line, but KEEP newlines:
    # is_low_quality()'s short-line heuristic depends on them.
    lines = (_HSPACE_RE.sub(" ", line).strip() for line in raw.split("\n"))
    cleaned = "\n".join(lines)
    return _MANY_NEWLINES_RE.sub("\n\n", cleaned).strip()


def is_low_quality(text: str) -> bool:
    """Heuristically detect navigation garbage, footers, or thin content.

    Three checks, in order of cost:
      1. minimum word count (< 150 words -> low quality);
      2. density of known boilerplate phrases (>= 4 markers -> low quality);
      3. proportion of very short lines, i.e. menu-like text
         (> 40% of 15+ lines under 7 words -> low quality).

    Args:
        text: Plain text to evaluate (ideally output of clean_text()).

    Returns:
        True if the text looks like boilerplate or a thin dump.
    """
    if not text:
        logging.debug("Text is empty, considered low quality.")
        return True

    word_count = len(text.split())
    if word_count < 150:  # minimum word count for real article content
        logging.debug(
            "Text has only %d words, considered low quality (min 150).",
            word_count,
        )
        return True

    lower_text = text.lower()
    junk_score = sum(1 for marker in _JUNK_MARKERS if marker in lower_text)
    if junk_score >= 4:
        logging.debug(
            "Detected %d junk markers, considered low quality.", junk_score
        )
        return True

    lines = text.split("\n")
    if len(lines) > 15:
        # Many short lines usually means a scraped menu/link list.
        short_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
        ratio = short_count / len(lines)
        if ratio > 0.4:
            logging.debug(
                "Detected %d/%d (%.1f%%) short lines, potential low quality.",
                short_count, len(lines), ratio * 100,
            )
            return True

    return False


def _decode_body(response: "requests.Response") -> str:
    """Best-effort decode of a response body to text.

    ``apparent_encoding`` can be None (no guess) or an unknown codec name,
    which raise TypeError / LookupError rather than UnicodeDecodeError —
    all three fall back to lenient UTF-8.
    """
    try:
        return response.content.decode(response.apparent_encoding)
    except (UnicodeDecodeError, TypeError, LookupError):
        return response.content.decode("utf-8", errors="ignore")


def _validated(text: str, engine: str, url: str) -> Optional[str]:
    """Clean *text* and return it only if it passes the quality gate."""
    cleaned = clean_text(text)
    if not is_low_quality(cleaned):
        logging.info("Successfully extracted content using %s for: %s", engine, url)
        return cleaned
    logging.warning(
        "LOW_QUALITY_CONTENT (%s): %s - Reason: Content identified as low quality.",
        engine, url,
    )
    return None


def _try_trafilatura(url: str, timeout: int) -> Optional[str]:
    """Fetch *url* and extract its main content with Trafilatura.

    Returns the cleaned text, or None on any failure / low-quality result.
    """
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()
        html = _decode_body(response)
        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            include_images=False,
        )
        if extracted:
            return _validated(extracted, "Trafilatura", url)
        logging.info(
            "Trafilatura returned no main content for: %s. Trying fallback.", url
        )
    except requests.exceptions.RequestException as req_err:
        logging.error("Trafilatura (Requests) failed for %s: %s", url, req_err)
    except Exception as e:  # boundary: any extraction failure triggers fallback
        logging.error(
            "Trafilatura (Extraction/Processing) failed for %s: %s",
            url, e, exc_info=False,
        )
    return None


def _try_newspaper(url: str) -> Optional[str]:
    """Extract main content with Newspaper3k (fallback engine).

    Returns the cleaned text, or None on any failure / low-quality result.
    """
    try:
        # NOTE(review): `headers=` is forwarded to newspaper's Configuration;
        # confirm the installed newspaper3k version accepts it.
        article = Article(url, headers=HEADERS, keep_article_html=False)
        article.download()
        article.parse()
        if article.text:
            return _validated(article.text, "Newspaper3k", url)
        logging.info("Newspaper3k returned no main content for: %s.", url)
    except requests.exceptions.RequestException as req_err:
        logging.error("Newspaper3k (Requests) failed for %s: %s", url, req_err)
    except Exception as e:  # boundary: log and give up on this engine
        logging.error(
            "Newspaper3k (Parsing/Processing) failed for %s: %s",
            url, e, exc_info=False,
        )
    return None


def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
    """Scrape the main article text from *url*.

    Tries Trafilatura first; if it fails or yields low-quality content,
    falls back to Newspaper3k.

    Args:
        url: The page to scrape.
        timeout: Per-request timeout in seconds (Trafilatura path).

    Returns:
        Cleaned article text, or None if neither engine produced
        quality content.
    """
    logging.info("Attempting to scrape: %s", url)

    text = _try_trafilatura(url, timeout)
    if text is not None:
        return text

    text = _try_newspaper(url)
    if text is not None:
        return text

    logging.error("Failed to extract quality content from: %s using both methods.", url)
    return None