import requests
import trafilatura
from newspaper import Article, ArticleException
from typing import Optional
from bs4 import BeautifulSoup
import logging
import re
# Configure logging at the beginning of your script or module
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}
def clean_text(text: str) -> str:
    """
    Cleans extracted text by stripping HTML tags, collapsing runs of
    spaces, and keeping paragraph breaks as blank lines so downstream
    line-based heuristics still see some structure.
    """
    if not text:
        return ""
    soup = BeautifulSoup(text, "html.parser")
    # Insert the break *after* each paragraph: text appended inside the
    # <p> would be lost when the node contents are stripped and joined.
    for p in soup.find_all('p'):
        p.insert_after('\n\n')
    cleaned = soup.get_text(separator=" ")
    # Collapse spaces/tabs within lines, then squeeze runs of blank lines,
    # preserving newlines for the short-line check in is_low_quality().
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)
    cleaned = re.sub(r'\s*\n\s*\n\s*', '\n\n', cleaned)
    return cleaned.strip()
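# Illustrative behavior with a hypothetical input: paragraph tags become
# blank-line breaks, and other whitespace runs collapse to single spaces.
#   clean_text("<p>First  para</p><p>Second</p>")  ->  "First para\n\nSecond"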
def is_low_quality(text: str) -> bool:
    """
    Detect navigation garbage, footers, or low-word-count dumps.
    Uses an expanded list of junk markers plus word- and line-count checks.
    """
    if not text:
        logging.debug("Text is empty, considered low quality.")
        return True
    words = text.split()
    if len(words) < 150:  # minimum word count for substantive content
        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
        return True
    # Common junk phrases/markers found in navigation, footers, and page chrome
    junk_markers = [
        "subscribe to our newsletter", "cookie policy", "terms and conditions",
        "privacy statement", "all rights reserved", "contact us", "about us",
        "careers", "sitemap", "advertisement", "sponsored content",
        "read more", "view all", "back to top", "connect with us",
        "follow us on", "email us", "download our app", "footer",
        "comments policy", "disclaimer", "affiliate links", "related posts",
        "latest updates", "breaking news", "trending topics", "more news",
        "featured stories", "sign up", "login", "register", "join us",
        "newsletter signup", "skip to content", "navigation", "main menu",
        "sidebar", "archive", "categories", "tags", "go to top", "licence",
        "unlimited access", "support us", "exclusive content", "follow @",
        "copyright", "imprint", "impressum", "legal notice"
    ]
    low_quality_score = 0
    lower_text = text.lower()
    for marker in junk_markers:
        if marker in lower_text:
            low_quality_score += 1
    if low_quality_score >= 4:
        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
        return True
    # Many short lines usually mean menus or link lists rather than prose
    lines = text.split('\n')
    if len(lines) > 15:
        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
        if short_lines_count / len(lines) > 0.4:
            logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count / len(lines):.1%}) short lines, potential low quality.")
            return True
    return False
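# Illustrative checks (hypothetical strings): a short navigation dump trips
# the word-count floor, while a long plain body passes every heuristic.
#   is_low_quality("Home | About | Contact")  ->  True   (< 150 words)
#   is_low_quality("word " * 200)             ->  False  (no junk markers)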
def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
    logging.info(f"Attempting to scrape: {url}")
    # Try Trafilatura first
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()
        try:
            # apparent_encoding can be None or an unrecognized codec name,
            # so fall back to lenient UTF-8 on any decoding failure
            html = response.content.decode(response.apparent_encoding)
        except (UnicodeDecodeError, TypeError, LookupError):
            html = response.content.decode('utf-8', errors='ignore')
        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
                return text
            else:
                logging.warning(f"LOW_QUALITY_CONTENT (Trafilatura): {url}")
        else:
            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")
    except requests.exceptions.RequestException as req_err:
        logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
    except Exception as e:
        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)
    # Fallback to newspaper3k; its download failures surface as ArticleException
    try:
        article = Article(url, headers=HEADERS, keep_article_html=False, request_timeout=timeout)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                return text
            else:
                logging.warning(f"LOW_QUALITY_CONTENT (Newspaper3k): {url}")
        else:
            logging.info(f"Newspaper3k returned no main content for: {url}.")
    except ArticleException as art_err:
        logging.error(f"Newspaper3k (Download/Parse) failed for {url}: {art_err}")
    except Exception as e:
        logging.error(f"Newspaper3k (Processing) failed for {url}: {e}", exc_info=False)
    logging.error(f"Failed to extract quality content from: {url} using both methods.")
    return None
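if __name__ == "__main__":
    # Minimal smoke test; the URL below is a placeholder, not something the
    # scraper depends on -- substitute any article page you want to check.
    sample_url = "https://example.com/some-article"
    content = scrape_url(sample_url)
    if content:
        print(content[:500])  # preview the first 500 characters
    else:
        print(f"No quality content extracted from {sample_url}")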