File size: 3,680 Bytes
a91e05e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ca10ef
a91e05e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ca10ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a91e05e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ff75c3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import requests
from bs4 import BeautifulSoup

class Tool:
    """Base class for tools.

    A tool describes itself through four metadata attributes -- ``name``,
    ``description``, ``inputs`` and ``output_type`` -- which subclasses
    declare at class level, and does its work in :meth:`forward`.
    """

    # Metadata defaults; subclasses override these in their class body.
    name = None
    description = None
    inputs = {}
    output_type = None

    def __init__(self):
        # Mirror the class-level metadata onto the instance. Assigning
        # None/{} here unconditionally would shadow whatever a subclass
        # declared in its class body as soon as it calls super().__init__().
        cls = type(self)
        self.name = cls.name
        self.description = cls.description
        # Shallow copy so that adding/removing keys on one instance does not
        # leak into the class attribute shared by every other instance.
        self.inputs = dict(cls.inputs or {})
        self.output_type = cls.output_type

    def forward(self, *args, **kwargs):
        """Run the tool.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

class SearchInformationTool(Tool):
    """Tool that forwards a web search query to a browser object.

    The browser passed to the constructor must provide
    ``search_web(query, start_year, end_year)``.
    """

    name = "web_search"
    description = "Perform a web search query and return the search results."
    inputs = {
        "query": {"type": "string", "description": "The web search query to perform."},
        "filter_year": {
            "type": "string",
            "description": "[Optional parameter]: filter the search results to only include pages from a specific year.",
            "nullable": True,
        },
    }
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, query: str, filter_year: int | None = None) -> str:
        """Search for *query* and return whatever the browser's search returns.

        ``filter_year`` is handed to the browser as both the start and the
        end year, i.e. a single-year range.
        """
        year = filter_year
        return self.browser.search_web(query, year, year)

class VisitTool(Tool):
    """Tool that loads a web page through a browser object and returns its text.

    The browser must provide ``visit_page(url)`` and ``_state()``, the latter
    returning a ``(header, content)`` pair for the page just visited.
    """

    name = "visit_page"
    description = "Visit a webpage at a given URL and return its text."
    inputs = {"url": {"type": "string", "description": "The relative or absolute URL of the webpage to visit."}}
    output_type = "string"

    def __init__(self, browser=None):
        """Create the tool.

        Args:
            browser: Browser-like object to use. When omitted (``None``), a
                fresh ``Browser`` is created so the tool works with its
                default argument instead of failing later in ``forward``
                with an ``AttributeError`` on ``None``.
        """
        super().__init__()
        self.browser = browser if browser is not None else Browser()

    def forward(self, url: str) -> str:
        """Visit *url* and return the page header and content.

        Returns:
            The stripped header, a ``=======================`` separator line,
            then the page content.
        """
        self.browser.visit_page(url)
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content

class Browser:
    """Minimal HTTP "browser": scrape Google results and fetch page text.

    ``current_page`` holds a dict with ``url``, ``header`` and ``content``
    keys for the last page passed to :meth:`visit_page`, or ``None`` before
    the first visit.
    """

    SEARCH_URL = "https://www.google.com/search"
    # Browser-like User-Agent sent with every request made by this class.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Seconds before a request is abandoned; without a timeout requests.get
    # can block indefinitely on an unresponsive server.
    TIMEOUT = 10

    def __init__(self):
        self.current_page = None

    @staticmethod
    def _search_params(query, start_year, end_year):
        """Build the query-string parameters for a search request.

        The query is returned unescaped; passing it via ``params=`` lets
        requests percent-encode it, so characters such as ``&`` or ``#`` no
        longer corrupt the URL. When at least one year is given, a Google
        custom date range (``tbs=cdr:1,...``) spanning 1 January of the first
        year to 31 December of the last year is added; a missing bound
        defaults to the other one.
        """
        params = {"q": query}
        if start_year or end_year:
            first = start_year or end_year
            last = end_year or start_year
            params["tbs"] = f"cdr:1,cd_min:01/01/{first},cd_max:12/31/{last}"
        return params

    def search_web(self, query, start_year, end_year):
        """Run a Google search and return the results as text.

        Args:
            query: Search terms.
            start_year: First year of the date filter, or ``None``.
            end_year: Last year of the date filter, or ``None``.

        Returns:
            One ``Result N: <text>`` entry per result container found, each
            followed by a ``Found Wikipedia link: <href>`` line when the
            result's first link points to wikipedia.org, all joined by
            newlines (empty string when no container is found). If the HTTP
            request fails, the string ``An error occurred: <exception>``.
        """
        try:
            response = requests.get(
                self.SEARCH_URL,
                params=self._search_params(query, start_year, end_year),
                headers=self.HEADERS,
                timeout=self.TIMEOUT,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return f"An error occurred: {e}"

        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE(review): 'tF2Cxc' is presumably the CSS class Google puts on
        # each organic result container; it is not a stable API -- confirm.
        results = soup.find_all('div', class_='tF2Cxc')
        search_results = []
        for idx, result in enumerate(results, start=1):
            result_text = result.get_text()
            search_results.append(f"Result {idx}: {result_text}\n")
            link = result.find('a', href=True)
            if link and 'wikipedia.org' in link['href']:
                search_results.append(f"Found Wikipedia link: {link['href']}")
        return "\n".join(search_results)

    def visit_page(self, url: str):
        """Fetch *url* and store its visible text in ``current_page``.

        ``<script>`` and ``<style>`` elements are removed before the text is
        extracted. On a request failure the error is printed and
        ``current_page`` is set to an entry with header ``"Error"`` whose
        content describes the failure; nothing is raised.
        """
        try:
            # Same headers as search_web, so both request paths identify
            # themselves the same way.
            response = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
            self.current_page = {
                "url": url,
                "header": "Error",
                "content": f"Failed to retrieve the page: {e}"
            }
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator='\n', strip=True)
        self.current_page = {
            "url": url,
            "header": f"Header for {url}",
            "content": text
        }

    def _state(self):
        """Return ``(header, content)`` of the current page, or ``("", "")`` if none."""
        if self.current_page:
            return self.current_page["header"], self.current_page["content"]
        return "", ""