File size: 4,981 Bytes
b617c72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5212a79
b617c72
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from typing import Any, Optional, Dict
from smolagents.tools import Tool
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin, urlparse

class WebScrapingTool(Tool):
    """smolagents Tool that scrapes a webpage and returns results as a JSON string.

    Supports three actions: full-text extraction, link extraction (with
    absolute URLs), and CSS-selector-based element extraction with optional
    attribute harvesting. All results — including errors — are returned as
    pretty-printed JSON so the calling agent always gets parseable output.
    """

    name = "web_scraping"
    description = "Scrape content from web pages including text, links, and specific HTML elements"
    inputs = {
        'url': {
            'type': 'string',
            'description': 'The URL of the webpage to scrape',
            'nullable': True
        },
        'action': {
            'type': 'string',
            'description': 'The scraping action to perform: "text" (get all text), "links" (get all links), "element" (get specific elements)',
            'default': 'text',
            'nullable': True
        },
        'selector': {
            'type': 'string',
            'description': 'CSS selector for specific elements (used with "element" action)',
            'nullable': True
        },
        'attributes': {
            'type': 'array',
            'description': 'List of attributes to extract from elements',
            'items': {'type': 'string'},
            'nullable': True
        }
    }
    output_type = "string"

    # Network timeout in seconds for the HTTP request. Without a timeout,
    # requests.get() can block forever on an unresponsive server.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        super().__init__()
        # Browser-like User-Agent: many sites reject the default
        # python-requests UA with 403/429 responses.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def _get_soup(self, url: str) -> BeautifulSoup:
        """Fetch *url* and return a parsed BeautifulSoup document.

        Raises:
            Exception: wrapping any network/HTTP failure, with the original
                exception chained for debuggability.
        """
        try:
            # timeout prevents an unresponsive host from hanging the agent.
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise Exception(f"Error fetching URL: {str(e)}") from e

    def _extract_text(self, soup: BeautifulSoup) -> str:
        """Return the page's visible text, one non-empty line per line."""
        # Script/style contents are code, not page text — drop them first.
        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text(separator=' ', strip=True)
        # Collapse whitespace: strip each line and drop blank ones.
        lines = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in lines if line)

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> list:
        """Return all anchors as dicts of {'text', 'url'}.

        Relative hrefs are resolved against *base_url* so every returned
        URL is absolute.
        """
        links = []
        for link in soup.find_all('a', href=True):
            links.append({
                'text': link.get_text(strip=True),
                'url': urljoin(base_url, link['href']),
            })
        return links

    def _extract_elements(self, soup: BeautifulSoup, selector: str, attributes: Optional[list] = None) -> list:
        """Return elements matching the CSS *selector*.

        With no *attributes*, each match is its stripped text. Otherwise each
        match is a dict with 'text' plus one key per requested attribute
        (empty string when the attribute is absent on that element).
        """
        elements = []
        for element in soup.select(selector):
            if not attributes:
                elements.append(element.get_text(strip=True))
            else:
                elem_data = {'text': element.get_text(strip=True)}
                for attr in attributes:
                    elem_data[attr] = element.get(attr, '')
                elements.append(elem_data)
        return elements

    def forward(self, url: Optional[str] = None, action: str = 'text', selector: Optional[str] = None, attributes: Optional[list] = None) -> str:
        """
        Execute the web scraping operation.

        Args:
            url: The URL to scrape. Required for all operations.
            action: The type of scraping to perform ('text', 'links', or 'element'). Defaults to 'text'.
            selector: CSS selector for finding specific elements. Required for 'element' action.
            attributes: List of attributes to extract from elements. Optional.

        Returns:
            str: JSON string containing the scraping results; on failure a
                JSON string with an 'error' key (never raises to the caller).
        """
        if not url:
            return json.dumps({
                'error': 'URL is required',
                'action': action
            }, indent=2)

        try:
            soup = self._get_soup(url)

            if action == 'text':
                result = self._extract_text(soup)
            elif action == 'links':
                result = self._extract_links(soup, url)
            elif action == 'element' and selector:
                result = self._extract_elements(soup, selector, attributes)
            else:
                # Unknown action, or 'element' without the required selector.
                raise ValueError("Invalid action or missing selector for 'element' action. Available actions: 'text', 'links', 'element'.")

            return json.dumps({
                'url': url,
                'action': action,
                'result': result
            }, indent=2)

        except Exception as e:
            # Tool contract: errors are reported as JSON, not propagated.
            return json.dumps({
                'error': str(e),
                'url': url,
                'action': action
            }, indent=2)