# Author: navodit17
# Commit message: agent with search, file read, youtube
# Commit: 396f5a0
from smolagents import tool
@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visits a website and fetches the content of a given URL / webpage.
    If markdown conversion is enabled, script and style tags are removed and the page is returned as Markdown; otherwise the raw, unfiltered HTML is returned.
    HTTP error statuses are not raised: whatever body the server sends back is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. Otherwise return the raw HTML.

    Returns:
        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML.
    """
    # Function-local imports, as in the original block.
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)

    # response.text is always a str (possibly empty), so no None check is needed.
    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")
    # Drop script and style elements entirely: markdownify's `strip` option
    # only removes the tags themselves and would keep their inner text.
    for tag in soup(["script", "style"]):
        tag.extract()

    root = soup
    if "wikipedia.org" in url:
        # For Wikipedia only keep the main content; when the <main id="content">
        # element is missing, fall back to the whole (already cleaned) document
        # rather than the raw HTML so script/style text does not leak through.
        main_content = soup.find("main", {"id": "content"})
        if main_content is not None:
            root = main_content

    return md(str(root), strip=['script', 'style'], heading_style="ATX").strip()