Spaces:
Runtime error
Runtime error
from typing import Any, Optional, Dict | |
from smolagents.tools import Tool | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
from urllib.parse import urljoin, urlparse | |
class WebScrapingTool(Tool):
    """Fetch a web page and extract its text, its links, or selected elements.

    ``forward`` never raises for scraping problems: both results and errors
    are returned as JSON-encoded strings.
    """

    name = "web_scraping"
    description = "Scrape content from web pages including text, links, and specific HTML elements"
    inputs = {
        'url': {
            'type': 'string',
            'description': 'The URL of the webpage to scrape',
            'nullable': True
        },
        'action': {
            'type': 'string',
            'description': 'The scraping action to perform: "text" (get all text), "links" (get all links), "element" (get specific elements)',
            'default': 'text',
            'nullable': True
        },
        'selector': {
            'type': 'string',
            'description': 'CSS selector for specific elements (used with "element" action)',
            'nullable': True
        },
        'attributes': {
            'type': 'array',
            'description': 'List of attributes to extract from elements',
            'items': {'type': 'string'},
            'nullable': True
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Seconds to wait for the server. requests applies no timeout by
        # default, so without this an unresponsive host would block forever.
        self.timeout = 10

    def _get_soup(self, url: str) -> BeautifulSoup:
        """Download ``url`` and parse the response body with ``html.parser``.

        Raises:
            Exception: If the request fails, times out, or returns an HTTP
                error status. The original error is chained as the cause.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            raise Exception(f"Error fetching URL: {str(e)}") from e

    def _extract_text(self, soup: BeautifulSoup) -> str:
        """Return the page text with script/style content and blank lines removed.

        Note: ``soup`` is modified in place (script and style tags are removed).
        """
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        # Text nodes may still contain embedded newlines; trim each line and
        # drop the ones that end up empty.
        stripped_lines = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in stripped_lines if line)

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> list:
        """Return a ``{'text', 'url'}`` dict for every ``<a>`` tag with an href.

        Relative hrefs are resolved against ``base_url``.
        """
        return [
            {
                'text': anchor.get_text(strip=True),
                'url': urljoin(base_url, anchor['href'])
            }
            for anchor in soup.find_all('a', href=True)
        ]

    def _extract_elements(self, soup: BeautifulSoup, selector: str, attributes: Optional[list] = None) -> list:
        """Return data for every element matching the CSS ``selector``.

        Without ``attributes`` each entry is the element's stripped text.
        With ``attributes`` each entry is a dict holding ``'text'`` plus one
        key per requested attribute ('' when the element lacks it).
        """
        matches = soup.select(selector)
        if not attributes:
            return [element.get_text(strip=True) for element in matches]
        return [
            {
                'text': element.get_text(strip=True),
                **{attr: element.get(attr, '') for attr in attributes}
            }
            for element in matches
        ]

    def forward(self, url: Optional[str] = None, action: str = 'text', selector: Optional[str] = None, attributes: Optional[list] = None) -> str:
        """
        Execute the web scraping operation.

        Args:
            url: The URL to scrape. Required for all operations.
            action: The type of scraping to perform ('text', 'links', or 'element'). Defaults to 'text'.
            selector: CSS selector for finding specific elements. Required for 'element' action.
            attributes: List of attributes to extract from elements. Optional.

        Returns:
            str: JSON string containing the scraping results, or an 'error'
            entry when the request cannot be completed.
        """
        if not url:
            return json.dumps({
                'error': 'URL is required',
                'action': action
            }, indent=2)
        try:
            # Reject unusable requests before touching the network so that a
            # bad action or missing selector does not cost a page download.
            if action not in ('text', 'links', 'element') or (action == 'element' and not selector):
                raise ValueError("Invalid action or missing selector for 'element' action. Available actions: 'text', 'links', 'element'.")
            soup = self._get_soup(url)
            if action == 'text':
                result = self._extract_text(soup)
            elif action == 'links':
                result = self._extract_links(soup, url)
            else:
                result = self._extract_elements(soup, selector, attributes)
            return json.dumps({
                'url': url,
                'action': action,
                'result': result
            }, indent=2)
        except Exception as e:
            # Any failure is reported to the caller as JSON instead of raised.
            return json.dumps({
                'error': str(e),
                'url': url,
                'action': action
            }, indent=2)