MT564AITraining/scrapers/base_scraper.py
import logging
import requests
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional

import trafilatura

logger = logging.getLogger(__name__)


class BaseScraper(ABC):
    """Base class for all scrapers"""

    def __init__(self, timeout: int = 30):
        self.timeout = timeout
        # Default request settings, kept for subclasses that may fetch with
        # requests directly; trafilatura.fetch_url does not use them.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }
    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch content from URL using trafilatura"""
        try:
            logger.debug(f"Fetching URL: {url}")
            # Note: trafilatura.fetch_url doesn't accept a timeout parameter directly
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                logger.error(f"Failed to download content from {url}")
                return None
            return downloaded
        except Exception as e:
            logger.error(f"Error fetching URL {url}: {str(e)}")
            return None

    def extract_text(self, html_content: str) -> Optional[str]:
        """Extract main text content from HTML"""
        try:
            if not html_content:
                return None
            text = trafilatura.extract(html_content)
            return text
        except Exception as e:
            logger.error(f"Error extracting text: {str(e)}")
            return None
    def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch a URL, extract its text, and delegate parsing to parse_content"""
        try:
            html_content = self.fetch_url(url)
            if not html_content:
                return {"success": False, "error": "Failed to fetch URL"}
            text_content = self.extract_text(html_content)
            if not text_content:
                return {"success": False, "error": "Failed to extract text content"}
            result = self.parse_content(html_content, text_content, url)
            result["success"] = True
            result["url"] = url
            result["text_content"] = text_content
            return result
        except Exception as e:
            logger.error(f"Error in scraping {url}: {str(e)}")
            return {"success": False, "error": str(e), "url": url}

    @abstractmethod
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse the content and extract structured data"""
        pass
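

# --- Illustrative sketch (not part of the original module) -------------------
# A minimal example of how a concrete scraper might subclass BaseScraper and
# implement parse_content. The class name and the fields it returns are
# assumptions made purely for demonstration.

class ExampleTextScraper(BaseScraper):
    """Example subclass returning a few simple fields derived from the text"""

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        # Derive lightweight structured data from the already-extracted text
        lines = text_content.splitlines()
        return {
            "first_line": lines[0] if lines else "",
            "word_count": len(text_content.split()),
        }


if __name__ == "__main__":
    # Quick manual check; the URL below is a placeholder.
    scraper = ExampleTextScraper()
    result = scraper.scrape("https://example.com")
    print(result.get("success"), result.get("word_count"))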