import logging
import re
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}


def clean_text(text: str) -> str:
    """
    Clean extracted text: strip any leftover HTML tags and normalize
    whitespace while preserving paragraph breaks, so that downstream
    line-based checks (see is_low_quality) still work.
    """
    if not text:
        return ""

    soup = BeautifulSoup(text, "html.parser")

    # Mark paragraph boundaries before flattening the markup.
    for p in soup.find_all('p'):
        p.append('\n\n')

    cleaned = soup.get_text(separator=" ")

    # Collapse runs of spaces and tabs but keep newlines intact;
    # a bare r'\s+' would also swallow the paragraph breaks added above.
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)
    cleaned = re.sub(r' *\n *', '\n', cleaned)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    return cleaned.strip()

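
# A quick illustration (not part of the original module) of what clean_text
# produces for simple paragraph markup, given the implementation above:
#
#     clean_text("<p>First para.</p><p>Second para.</p>")
#     -> "First para.\n\nSecond para."

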
def is_low_quality(text: str) -> bool:
    """
    Detect navigation garbage, footers, or low-word-count dumps,
    using junk-marker, word-count, and short-line heuristics.
    """
    if not text:
        logging.debug("Text is empty, considered low quality.")
        return True

    words = text.split()
    if len(words) < 150:
        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
        return True

    junk_markers = [
        "subscribe to our newsletter", "cookie policy", "terms and conditions",
        "privacy statement", "all rights reserved", "contact us", "about us",
        "careers", "sitemap", "advertisement", "sponsored content",
        "read more", "view all", "back to top", "connect with us",
        "follow us on", "email us", "download our app", "footer",
        "comments policy", "disclaimer", "affiliate links", "related posts",
        "latest updates", "breaking news", "trending topics", "more news",
        "featured stories", "sign up", "login", "register", "join us",
        "newsletter signup", "skip to content", "navigation", "main menu",
        "sidebar", "archive", "categories", "tags", "go to top", "licence",
        "unlimited access", "support us", "exclusive content", "follow @",
        "copyright", "imprint", "impressum", "legal notice"
    ]

    lower_text = text.lower()
    low_quality_score = sum(1 for marker in junk_markers if marker in lower_text)

    if low_quality_score >= 4:
        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
        return True

    # A page dominated by very short lines is usually a menu or link list.
    lines = text.split('\n')
    if len(lines) > 15:
        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
        if short_lines_count / len(lines) > 0.4:
            logging.debug(
                f"Detected {short_lines_count}/{len(lines)} "
                f"({short_lines_count / len(lines):.1%}) short lines, considered low quality."
            )
            return True

    return False


def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
    """
    Fetch a URL and extract the main article text, trying Trafilatura
    first and falling back to Newspaper3k. Returns None if neither
    method yields acceptable content.
    """
    logging.info(f"Attempting to scrape: {url}")

    # Primary method: fetch with requests, extract with Trafilatura.
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()

        try:
            html = response.content.decode(response.apparent_encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            # apparent_encoding can be wrong, unknown, or None; fall back to UTF-8.
            html = response.content.decode('utf-8', errors='ignore')

        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)

        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
                return text
            logging.warning(f"LOW_QUALITY_CONTENT (Trafilatura): {url}")
        else:
            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")

    except requests.exceptions.RequestException as req_err:
        logging.error(f"Trafilatura (requests) failed for {url}: {req_err}")
    except Exception as e:
        logging.error(f"Trafilatura (extraction/processing) failed for {url}: {e}", exc_info=False)

    # Fallback method: Newspaper3k. Note that Article.download() swallows
    # network errors internally; they surface as an ArticleException when
    # parse() is called, so that is what we catch here.
    try:
        article = Article(url, headers=HEADERS, request_timeout=timeout, keep_article_html=False)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                return text
            logging.warning(f"LOW_QUALITY_CONTENT (Newspaper3k): {url}")
        else:
            logging.info(f"Newspaper3k returned no main content for: {url}.")
    except ArticleException as art_err:
        logging.error(f"Newspaper3k (download/parse) failed for {url}: {art_err}")
    except Exception as e:
        logging.error(f"Newspaper3k (processing) failed for {url}: {e}", exc_info=False)

    logging.error(f"Failed to extract quality content from: {url} using both methods.")
    return None
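

# Minimal usage sketch (an addition, not part of the original module);
# the URL below is a hypothetical placeholder, substitute any article page.
if __name__ == "__main__":
    test_url = "https://example.com/some-article"  # hypothetical URL
    content = scrape_url(test_url)
    if content:
        print(content[:500])
    else:
        print("No quality content extracted.")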