HF_GAIA_AGENT

Runtime error

HF_GAIA_AGENT / tools /web_scraping.py

Euclides H.

Tools fixes and agent PROMPT improve

5212a79 3 months ago

4.98 kB

	from typing import Any, Optional, Dict
	from smolagents.tools import Tool
	import requests
	from bs4 import BeautifulSoup
	import json
	from urllib.parse import urljoin, urlparse

	class WebScrapingTool(Tool):
	    """Fetch a web page and return its text, links, or selected elements as JSON.

	    The tool downloads the page with ``requests``, parses it with
	    BeautifulSoup's ``html.parser`` and serialises the outcome (or an error
	    description) to an indented JSON string, so ``forward`` never raises for
	    ordinary failures such as a bad URL or an unknown action.
	    """

	    name = "web_scraping"
	    description = "Scrape content from web pages including text, links, and specific HTML elements"
	    inputs = {
	        'url': {
	            'type': 'string',
	            'description': 'The URL of the webpage to scrape',
	            'nullable': True
	        },
	        'action': {
	            'type': 'string',
	            'description': 'The scraping action to perform: "text" (get all text), "links" (get all links), "element" (get specific elements)',
	            'default': 'text',
	            'nullable': True
	        },
	        'selector': {
	            'type': 'string',
	            'description': 'CSS selector for specific elements (used with "element" action)',
	            'nullable': True
	        },
	        'attributes': {
	            'type': 'array',
	            'description': 'List of attributes to extract from elements',
	            'items': {'type': 'string'},
	            'nullable': True
	        }
	    }
	    output_type = "string"

	    # Seconds to wait for the server before giving up. Without a timeout,
	    # requests.get() can block indefinitely on an unresponsive host.
	    request_timeout = 10

	    def __init__(self):
	        """Initialise the tool and the HTTP headers sent with every request."""
	        super().__init__()
	        # A browser-like User-Agent is sent with each request.
	        self.headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	        }

	    def _get_soup(self, url: str) -> BeautifulSoup:
	        """Download ``url`` and return the parsed document.

	        Args:
	            url: Address of the page to fetch.

	        Returns:
	            A BeautifulSoup tree built from the response body.

	        Raises:
	            Exception: If the request fails, times out, returns an HTTP error
	                status, or the body cannot be parsed. The original error is
	                attached as ``__cause__``.
	        """
	        try:
	            response = requests.get(
	                url,
	                headers=self.headers,
	                timeout=self.request_timeout,
	            )
	            response.raise_for_status()
	            return BeautifulSoup(response.content, 'html.parser')
	        except Exception as e:
	            # Chain the cause so the underlying failure stays visible in tracebacks.
	            raise Exception(f"Error fetching URL: {str(e)}") from e

	    def _extract_text(self, soup: BeautifulSoup) -> str:
	        """Return the page text with ``<script>``/``<style>`` content removed.

	        Note that the matching tags are removed from ``soup`` in place.
	        """
	        # Remove script and style elements so their source is not reported as text.
	        for tag in soup(["script", "style"]):
	            tag.decompose()

	        text = soup.get_text(separator=' ', strip=True)
	        # Drop blank lines and surrounding whitespace left inside text nodes.
	        stripped_lines = (line.strip() for line in text.splitlines())
	        return "\n".join(line for line in stripped_lines if line)

	    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> list:
	        """Return every ``<a href=...>`` as a ``{'text', 'url'}`` dict.

	        Args:
	            soup: Parsed document to search.
	            base_url: URL used to resolve relative ``href`` values.

	        Returns:
	            One dict per anchor, in document order, with the anchor text and
	            its absolute URL.
	        """
	        return [
	            {
	                'text': anchor.get_text(strip=True),
	                'url': urljoin(base_url, anchor['href']),
	            }
	            for anchor in soup.find_all('a', href=True)
	        ]

	    def _extract_elements(self, soup: BeautifulSoup, selector: str, attributes: Optional[list] = None) -> list:
	        """Return the elements matching a CSS selector.

	        Args:
	            soup: Parsed document to search.
	            selector: CSS selector passed to ``soup.select``.
	            attributes: Attribute names to read from each match. When empty
	                or ``None`` only the element text is returned.

	        Returns:
	            A list of text strings when no attributes are requested, else a
	            list of dicts holding ``'text'`` plus one entry per requested
	            attribute (``''`` when the element lacks it).
	        """
	        elements = []
	        for element in soup.select(selector):
	            element_text = element.get_text(strip=True)
	            if not attributes:
	                elements.append(element_text)
	                continue
	            elem_data = {'text': element_text}
	            for attr in attributes:
	                elem_data[attr] = element.get(attr, '')
	            elements.append(elem_data)
	        return elements

	    def forward(self, url: Optional[str] = None, action: str = 'text', selector: Optional[str] = None, attributes: Optional[list] = None) -> str:
	        """
	        Execute the web scraping operation.

	        Args:
	            url: The URL to scrape. Required for all operations.
	            action: The type of scraping to perform ('text', 'links', or 'element').
	                Defaults to 'text'; an explicit None is treated as 'text'.
	            selector: CSS selector for finding specific elements. Required for 'element' action.
	            attributes: List of attributes to extract from elements. Optional.

	        Returns:
	            str: JSON string containing the scraping results, or an 'error'
	            entry describing why the request could not be completed.
	        """
	        if not url:
	            return json.dumps({
	                'error': 'URL is required',
	                'action': action
	            }, indent=2)

	        # The input schema marks `action` as nullable, so an explicit None
	        # falls back to the documented default instead of being rejected.
	        if action is None:
	            action = 'text'

	        try:
	            # Validate before fetching so a malformed request never costs a
	            # network round trip.
	            if action not in ('text', 'links', 'element') or (action == 'element' and not selector):
	                raise ValueError("Invalid action or missing selector for 'element' action. Available actions: 'text', 'links', 'element'.")

	            soup = self._get_soup(url)

	            if action == 'text':
	                result = self._extract_text(soup)
	            elif action == 'links':
	                result = self._extract_links(soup, url)
	            else:
	                result = self._extract_elements(soup, selector, attributes)

	            return json.dumps({
	                'url': url,
	                'action': action,
	                'result': result
	            }, indent=2)

	        except Exception as e:
	            # Failures are reported as JSON rather than raised to the caller.
	            return json.dumps({
	                'error': str(e),
	                'url': url,
	                'action': action
	            }, indent=2)