import re
import random
import time
import urllib.parse
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

# Suppress the InsecureRequestWarning raised by requests when verify=False is used
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
class WebSearch:
    def __init__(self, num_results=4, max_chars_per_page=6000, max_images=10):
        self.num_results = num_results
        self.max_chars_per_page = max_chars_per_page
        self.reference = []
        self.results = []
        self.max_images = max_images
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
        }
        # Trusted content domains whose links are prioritized in results
        self.content_domains = [
            "wikipedia.org", "webmd.com", "mayoclinic.org", "healthline.com", "nih.gov",
            "clevelandclinic.org", "nhs.uk", "cdc.gov", "medlineplus.gov", "hopkinsmedicine.org"
        ]
        # Ad/tracking domains and path fragments to filter out
        self.blocked_domains = [
            "ad.doubleclick.net", "googleadservices.com", "bing.com/aclick", "duckduckgo.com/y.js",
            "amazon.com/s", "ads.google.com", "analytics", "tracker", "pixel", "adservice"
        ]
    def is_valid_url(self, url):
        """Check if URL is valid and not an ad/tracking URL"""
        if not url or len(url) < 10:
            return False
        try:
            parsed = urlparse(url)
            # Check if URL has a valid scheme and netloc
            if not all([parsed.scheme, parsed.netloc]):
                return False
            # Filter out ad/tracking URLs
            domain = parsed.netloc.lower()
            path = parsed.path.lower()
            query = parsed.query.lower()
            # Block URLs containing ad-related indicators
            for blocked in self.blocked_domains:
                if blocked in domain or blocked in path:
                    return False
            # Block URLs with ad-related query parameters
            if any(param in query for param in ["ad", "click", "track", "clkid", "msclkid"]):
                return False
            # Extra check for redirect URLs
            if "redirect" in path or "goto" in path or "go.php" in path:
                return False
            # Reject extremely long URLs (often tracking)
            if len(url) > 500:
                return False
            return True
        except Exception:
            return False
    def clean_url(self, url):
        """Clean the URL by removing tracking parameters"""
        try:
            parsed = urlparse(url)
            # List of known tracking parameters to remove
            tracking_params = [
                'utm_', 'ref_', 'ref=', 'refid', 'fbclid', 'gclid', 'msclkid', 'dclid',
                'zanpid', 'icid', 'igshid', 'mc_eid', '_hsenc', 'mkt_tok', 'yclid'
            ]
            # Parse query parameters
            query_params = parse_qs(parsed.query)
            # Remove tracking parameters
            filtered_params = {
                k: v for k, v in query_params.items()
                if not any(tracker in k.lower() for tracker in tracking_params)
            }
            # Rebuild query string
            clean_query = urllib.parse.urlencode(filtered_params, doseq=True) if filtered_params else ""
            # Reconstruct URL
            clean_url = urllib.parse.urlunparse((
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                clean_query,
                ""  # Remove fragment
            ))
            return clean_url
        except Exception:
            # If any error occurs, return the original URL
            return url
    def extract_real_url_from_redirect(self, url):
        """Extract the actual URL from a redirect URL"""
        try:
            parsed = urlparse(url)
            # Handle DuckDuckGo redirects
            if "duckduckgo.com" in parsed.netloc and "u3=" in parsed.query:
                params = parse_qs(parsed.query)
                if "u3" in params and params["u3"]:
                    redirect_url = params["u3"][0]
                    # Handle nested redirects (like Bing redirects inside DuckDuckGo)
                    if "bing.com/aclick" in redirect_url:
                        bing_parsed = urlparse(redirect_url)
                        bing_params = parse_qs(bing_parsed.query)
                        if "u" in bing_params and bing_params["u"]:
                            decoded_url = urllib.parse.unquote(bing_params["u"][0])
                            return self.clean_url(decoded_url)
                    return self.clean_url(redirect_url)
            # Handle Bing redirects
            if "bing.com/aclick" in url:
                params = parse_qs(parsed.query)
                if "u" in params and params["u"]:
                    return self.clean_url(urllib.parse.unquote(params["u"][0]))
            return url
        except Exception:
            return url
    def extract_text_from_webpage(self, html_content):
        """Strip non-content tags and return the page text as a single normalized string"""
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove non-content elements
        for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg",
                         "aside", "iframe", "noscript", "img", "figure", "button"]):
            tag.extract()
        # Extract text and normalize spacing
        text = ' '.join(soup.stripped_strings)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    def search(self, query):
        results = []
        encoded_query = urllib.parse.quote(query)
        url = f'https://html.duckduckgo.com/html/?q={encoded_query}'
        try:
            with requests.Session() as session:
                session.headers.update(self.headers)
                response = session.get(url, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')
                # Collect more results than needed to allow for filtering
                search_results = soup.find_all('div', class_='result')[:self.num_results * 2]
                links = []
                # Extract and process links
                for result in search_results:
                    link_tag = result.find('a', class_='result__a')
                    if not link_tag or not link_tag.get('href'):
                        continue
                    original_link = link_tag['href']
                    # Resolve redirect wrappers to get the actual target URL
                    clean_link = self.extract_real_url_from_redirect(original_link)
                    # Validate the URL
                    if self.is_valid_url(clean_link):
                        links.append(clean_link)
                # Prioritize content domains
                prioritized_links = []
                other_links = []
                for link in links:
                    if any(domain in link for domain in self.content_domains):
                        prioritized_links.append(link)
                    else:
                        other_links.append(link)
                # Combine prioritized links first, then others
                final_links = prioritized_links + other_links
                # Keep at most one link per domain, up to num_results
                unique_links = []
                seen_domains = set()
                for link in final_links:
                    domain = urlparse(link).netloc
                    if domain not in seen_domains and len(unique_links) < self.num_results:
                        unique_links.append(link)
                        seen_domains.add(domain)
                def fetch_page(link):
                    try:
                        # Random delay to avoid being blocked
                        time.sleep(random.uniform(0.5, 1.5))
                        # Fetch the page (TLS verification disabled to tolerate misconfigured certs)
                        page_response = session.get(link, timeout=10, verify=False)
                        # Only process HTML content
                        if 'text/html' not in page_response.headers.get('Content-Type', ''):
                            return None
                        page_soup = BeautifulSoup(page_response.text, 'lxml')
                        # Remove non-content elements
                        for tag in page_soup(['script', 'style', 'header', 'footer',
                                              'nav', 'form', 'iframe', 'noscript']):
                            tag.decompose()
                        # Extract text and normalize whitespace
                        text = ' '.join(page_soup.stripped_strings)
                        text = re.sub(r'\s+', ' ', text).strip()
                        title = page_soup.title.string if page_soup.title and page_soup.title.string else "Untitled Page"
                        return {
                            'link': link,
                            'title': title,
                            'text': text[:self.max_chars_per_page]
                        }
                    except Exception as e:
                        print(f"Error fetching {link}: {str(e)}")
                        return None

                # Nothing to fetch; also avoids ThreadPoolExecutor(max_workers=0), which raises
                if not unique_links:
                    return []
                with ThreadPoolExecutor(max_workers=min(len(unique_links), 4)) as executor:
                    future_to_url = {executor.submit(fetch_page, link): link for link in unique_links}
                    for future in as_completed(future_to_url):
                        result = future.result()
                        if result:
                            results.append(result)
            return results
        except Exception as e:
            print(f"Search error: {str(e)}")
            return []
    def search_images(self, query):
        images = []
        encoded_query = urllib.parse.quote(query)
        headers = {**self.headers, 'Upgrade-Insecure-Requests': '1'}
        # Try multiple sources for better coverage
        image_sources = [
            f"https://www.google.com/search?q={encoded_query}&tbm=isch&hl=en",
            f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1",
            f"https://duckduckgo.com/?q={encoded_query}&iar=images&iax=images&ia=images"
        ]
        for source_url in image_sources:
            try:
                time.sleep(random.uniform(0.5, 1.0))  # Polite delay
                response = requests.get(source_url, headers=headers, verify=False, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')
                # Extract image URLs from img tags
                for img in soup.find_all('img'):
                    src = img.get('src', '')
                    if src and src.startswith('http') and self.is_image_url(src):
                        cleaned_url = self.clean_url(src)
                        if self.is_valid_image(cleaned_url):
                            images.append(cleaned_url)
                # Extract image URLs from inline scripts (useful for Google Images)
                for script in soup.find_all('script'):
                    if script.string:
                        urls = re.findall(r'https?://[^\s<>"\']+?(?:\.(?:jpg|jpeg|png|gif|bmp|webp))', script.string)
                        for url in urls:
                            cleaned_url = self.clean_url(url)
                            if self.is_valid_image(cleaned_url):
                                images.append(cleaned_url)
            except Exception as e:
                print(f"Error searching images at {source_url}: {str(e)}")
                continue
        # Remove duplicates while preserving order
        seen = set()
        unique_images = [x for x in images if not (x in seen or seen.add(x))]
        # Filter out suspicious or likely-decorative image URLs
        filtered_images = [img for img in unique_images if self.is_valid_image(img)]
        return filtered_images[:self.max_images]
    def is_image_url(self, url):
        """Check if URL points to an image file"""
        image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
        return any(url.lower().endswith(ext) for ext in image_extensions)
    def is_valid_image(self, url):
        """Additional validation for image URLs"""
        try:
            # Reject filenames that suggest icons, tracking pixels, thumbnails, or logos
            if re.search(r'(?:icon|pixel|tracker|thumb|logo|button)\d*\.(?:jpg|png|gif)', url.lower()):
                return False
            # Avoid suspicious domains for images
            parsed = urlparse(url)
            if any(bad in parsed.netloc.lower() for bad in ["tracker", "pixel", "counter", "ad."]):
                return False
            # Avoid very short URLs (likely not valid images)
            if len(url) < 30:
                return False
            return True
        except Exception:
            return False
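

# Minimal usage sketch (not part of the original module): shows how the class might be
# driven when the file is run directly. The query string and parameter values below are
# illustrative assumptions, not taken from the original code.
if __name__ == "__main__":
    searcher = WebSearch(num_results=3, max_images=5)
    # Text search: each result is a dict with 'link', 'title', and truncated 'text'
    for page in searcher.search("common cold symptoms"):
        print(page['link'], '-', page['title'])
    # Image search: returns a list of candidate image URLs
    for image_url in searcher.search_images("common cold symptoms"):
        print(image_url)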