import logging
import re
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}


def clean_text(text: str) -> str:
    """
    Clean extracted text: strip any leftover HTML tags and normalize
    whitespace while preserving paragraph breaks, so that downstream
    line-based checks (see is_low_quality) still work.
    """
    if not text:
        return ""

    soup = BeautifulSoup(text, "html.parser")

    # Mark paragraph boundaries before flattening the markup.
    for p in soup.find_all('p'):
        p.append('\n\n')

    cleaned = soup.get_text(separator=" ")

    # Collapse runs of spaces and tabs but keep newlines intact;
    # a bare r'\s+' would also swallow the paragraph breaks added above.
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)
    cleaned = re.sub(r' *\n *', '\n', cleaned)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    return cleaned.strip()

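
# A quick illustration (not part of the original module) of what clean_text
# produces for simple paragraph markup, given the implementation above:
#
#     clean_text("<p>First para.</p><p>Second para.</p>")
#     -> "First para.\n\nSecond para."

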
def is_low_quality(text: str) -> bool:
    """
    Detect navigation garbage, footers, or low-word-count dumps,
    using junk-marker, word-count, and short-line heuristics.
    """
    if not text:
        logging.debug("Text is empty, considered low quality.")
        return True

    words = text.split()
    if len(words) < 150:
        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
        return True

    junk_markers = [
        "subscribe to our newsletter", "cookie policy", "terms and conditions",
        "privacy statement", "all rights reserved", "contact us", "about us",
        "careers", "sitemap", "advertisement", "sponsored content",
        "read more", "view all", "back to top", "connect with us",
        "follow us on", "email us", "download our app", "footer",
        "comments policy", "disclaimer", "affiliate links", "related posts",
        "latest updates", "breaking news", "trending topics", "more news",
        "featured stories", "sign up", "login", "register", "join us",
        "newsletter signup", "skip to content", "navigation", "main menu",
        "sidebar", "archive", "categories", "tags", "go to top", "licence",
        "unlimited access", "support us", "exclusive content", "follow @",
        "copyright", "imprint", "impressum", "legal notice"
    ]

    lower_text = text.lower()
    low_quality_score = sum(1 for marker in junk_markers if marker in lower_text)

    if low_quality_score >= 4:
        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
        return True

    # A page dominated by very short lines is usually a menu or link list.
    lines = text.split('\n')
    if len(lines) > 15:
        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
        if short_lines_count / len(lines) > 0.4:
            logging.debug(
                f"Detected {short_lines_count}/{len(lines)} "
                f"({short_lines_count / len(lines):.1%}) short lines, considered low quality."
            )
            return True

    return False


def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
    """
    Fetch a URL and extract the main article text, trying Trafilatura
    first and falling back to Newspaper3k. Returns None if neither
    method yields acceptable content.
    """
    logging.info(f"Attempting to scrape: {url}")

    # Primary method: fetch with requests, extract with Trafilatura.
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()

        try:
            html = response.content.decode(response.apparent_encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            # apparent_encoding can be wrong, unknown, or None; fall back to UTF-8.
            html = response.content.decode('utf-8', errors='ignore')

        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)

        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
                return text
            logging.warning(f"LOW_QUALITY_CONTENT (Trafilatura): {url}")
        else:
            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")

    except requests.exceptions.RequestException as req_err:
        logging.error(f"Trafilatura (requests) failed for {url}: {req_err}")
    except Exception as e:
        logging.error(f"Trafilatura (extraction/processing) failed for {url}: {e}", exc_info=False)

    # Fallback method: Newspaper3k. Note that Article.download() swallows
    # network errors internally; they surface as an ArticleException when
    # parse() is called, so that is what we catch here.
    try:
        article = Article(url, headers=HEADERS, request_timeout=timeout, keep_article_html=False)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                return text
            logging.warning(f"LOW_QUALITY_CONTENT (Newspaper3k): {url}")
        else:
            logging.info(f"Newspaper3k returned no main content for: {url}.")
    except ArticleException as art_err:
        logging.error(f"Newspaper3k (download/parse) failed for {url}: {art_err}")
    except Exception as e:
        logging.error(f"Newspaper3k (processing) failed for {url}: {e}", exc_info=False)

    logging.error(f"Failed to extract quality content from: {url} using both methods.")
    return None
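

# Minimal usage sketch (an addition, not part of the original module);
# the URL below is a hypothetical placeholder, substitute any article page.
if __name__ == "__main__":
    test_url = "https://example.com/some-article"  # hypothetical URL
    content = scrape_url(test_url)
    if content:
        print(content[:500])
    else:
        print("No quality content extracted.")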