Spaces:
Sleeping
Sleeping
from smolagents import tool | |
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """Fetch a web page and return its content as Markdown or raw HTML.

    When Markdown conversion is enabled, ``<script>`` and ``<style>``
    elements are removed from the parsed document before it is converted.
    For pages hosted on wikipedia.org (or any of its subdomains) only the
    ``<main id="content">`` element is converted when it is present;
    otherwise the whole cleaned document is converted.

    Args:
        url: The URL to fetch.
        convert_to_markdown: If True, return the page converted to Markdown
            (ATX-style headings). If False, return the raw, unfiltered HTML.

    Returns:
        The page content as Markdown, or the raw response text when
        ``convert_to_markdown`` is False.

    Raises:
        Any exception raised by ``requests.get`` (for example on timeout or
        connection failure) propagates to the caller. HTTP error statuses
        are not raised here; the body of the error page is returned as-is.
    """
    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script and style elements together with their text content, so
    # that JavaScript/CSS source never ends up in the Markdown output.
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Default to converting the whole cleaned document.
    root = soup

    # For Wikipedia only keep the main content. Compare the parsed hostname
    # rather than searching the full URL, so that e.g. a query string that
    # merely mentions "wikipedia.org" does not trigger this branch.
    hostname = (urlparse(url).hostname or "").lower()
    if hostname == "wikipedia.org" or hostname.endswith(".wikipedia.org"):
        main_content = soup.find("main", {"id": "content"})
        if main_content is not None:
            root = main_content
        # If the main element is missing, fall back to the cleaned document
        # (not the raw response text, which would still contain scripts).

    return md(str(root), strip=['script', 'style'], heading_style="ATX").strip()