from smolagents import tool


@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visits a website and fetches the content of a given URL / webpage.
    If markdown conversion is enabled, script and style elements are removed and the remaining content is returned as Markdown; otherwise the raw, unfiltered HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

    Returns:
        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML.
    """
    # Imports are kept inside the function body, as in the original tool.
    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # NOTE: the HTTP status code is not checked; whatever body the server
    # sends back (including error pages) is returned to the caller.
    response = requests.get(url, timeout=30, headers=headers)
    html = response.text

    if not convert_to_markdown:
        return html

    soup = BeautifulSoup(html, "html.parser")

    # Drop <script> and <style> elements together with their contents.
    for unwanted in soup(["script", "style"]):
        unwanted.extract()

    # Default: convert the whole cleaned document.
    source = soup

    # For Wikipedia, keep only the main article container when it exists.
    # The check is made on the parsed hostname so that a URL merely containing
    # "wikipedia.org" in its path or query string is not treated as Wikipedia.
    host = (urlparse(url).hostname or "").lower()
    if host == "wikipedia.org" or host.endswith(".wikipedia.org"):
        main_content = soup.find("main", {"id": "content"})
        if main_content:
            source = main_content
        # Otherwise fall back to the cleaned soup (not the raw response text),
        # so script/style content stays removed in the fallback as well.

    return md(str(source), strip=['script', 'style'], heading_style="ATX").strip()