Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / tools /web_search.py

navodit17

agent with search, file read, youtube

396f5a0 2 months ago

raw

history blame contribute delete

1.94 kB

	from smolagents import tool

	@tool
	def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
	    """
	    Visits a website and fetches the content of a given URL / webpage.
	    If markdown conversion is enabled, script and style elements are removed and the page is returned as Markdown, otherwise the raw unfiltered HTML is returned.
	    For URLs containing "wikipedia.org" only the main content element of the page is converted when it is present.

	    Args:
	        url (str): The URL to fetch.
	        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

	    Returns:
	        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML of the response.
	    """
	    # Imports stay inside the function body, as in the original tool definition.
	    import requests
	    from bs4 import BeautifulSoup
	    from markdownify import markdownify as md

	    # Browser-like User-Agent sent with the request.
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    }
	    # NOTE(review): the HTTP status code is not checked, so the body of an
	    # error page (404, 500, ...) is returned like any other page.
	    response = requests.get(url, timeout=30, headers=headers)

	    # Raw mode: hand back the response body untouched.
	    if not convert_to_markdown:
	        return response.text

	    soup = BeautifulSoup(response.text, "html.parser")

	    # Remove <script>/<style> elements up front so their contents cannot end
	    # up in the Markdown output.
	    for tag in soup(["script", "style"]):
	        tag.extract()

	    # By default convert the whole cleaned document.
	    html_source = soup

	    # For Wikipedia keep only the main content element when it can be found.
	    if "wikipedia.org" in url:
	        main_content = soup.find("main", {"id": "content"})
	        if main_content:
	            html_source = main_content

	    # Always convert from the cleaned soup (never the raw response text), so
	    # the script/style removal above applies on every path, including the
	    # Wikipedia fallback when no main content element exists.
	    return md(str(html_source), strip=['script', 'style'], heading_style="ATX").strip()