"""
Browser Search Tool for GAIA

This module provides a dedicated Browser Search Tool implementation with proper
Supabase memory integration. It allows for searching websites directly using
browser_action capabilities and storing the results in the agent's memory.

The tool provides methods for:
- Searching specific websites (Wikipedia, YouTube, etc.)
- Determining the best site based on query content
- Storing search results and browser interactions in memory
- Error handling and detailed logging

All operations properly integrate with Supabase working memory, ensuring results
are persistently stored and retrievable for agent continuity.
"""

import json
import logging
import re
import time
import traceback
import uuid
from typing import Dict, Any, List, Optional, Union
from urllib.parse import quote_plus

from src.gaia.agent.config import get_tool_config
from src.gaia.memory.supabase_memory import WorkingMemory

# Module-level logger used by BrowserSearchTool and create_browser_search.
logger = logging.getLogger("gaia_agent.tools.browser")


class BrowserSearchTool:
    """Tool for searching any website using browser_action to view content directly.

    ``search`` builds a single result pointing at a site's search page, meant
    to be opened with browser_action. ``unified_search`` first tries the
    helper tools held in ``fallback_tools`` / ``perplexity_tool`` and falls
    back to a Google search link.

    When a working memory is attached, each search writes start, result and
    error records to it. Storage is best effort: failures are logged and never
    propagate to the caller.
    """

    # Search-page URL templates keyed by lower-case source name. "{query}" is
    # replaced with the URL-encoded search term.
    DEFAULT_SEARCH_TEMPLATES = {
        "wikipedia": "https://en.wikipedia.org/wiki/Special:Search?search={query}",
        "arxiv": "https://arxiv.org/search/?query={query}&searchtype=all",
        "nytimes": "https://www.nytimes.com/search?query={query}",
        "google": "https://www.google.com/search?q={query}",
        "youtube": "https://www.youtube.com/results?search_query={query}",
        "github": "https://github.com/search?q={query}",
        "twitter": "https://twitter.com/search?q={query}",
        "reddit": "https://www.reddit.com/search/?q={query}",
        "scholar": "https://scholar.google.com/scholar?q={query}",
        "pubmed": "https://pubmed.ncbi.nlm.nih.gov/?term={query}",
        "universetoday": "https://www.universetoday.com/?s={query}",
        "malko": "https://www.malkocompetition.com/winners?q={query}",
    }

    # browser_action hints per source; sources missing here get a generic hint.
    _SOURCE_INSTRUCTIONS = {
        "wikipedia": "Use browser_action to open the Wikipedia search page and read the article.",
        "arxiv": "Use browser_action to open the arXiv search page and download or read papers.",
        "google": "Use browser_action to open Google search results and explore relevant links.",
        "youtube": "Use browser_action to open YouTube search results and watch videos.",
        "github": "Use browser_action to open GitHub search results and explore repositories.",
        "twitter": "Use browser_action to open Twitter search results and read tweets.",
        "reddit": "Use browser_action to open Reddit search results and read discussions.",
        "scholar": "Use browser_action to open Google Scholar search results and read academic papers.",
        "pubmed": "Use browser_action to open PubMed search results and read medical research.",
        "nytimes": "Use browser_action to open New York Times search results and read news articles.",
    }

    # Ordered (source, keywords) rules: the first rule with any keyword found
    # as a substring of the lower-cased query wins.
    _KEYWORD_SOURCES = (
        ("wikipedia", ("wikipedia", "wiki")),
        ("youtube", ("youtube", "video")),
        ("arxiv", ("arxiv", "paper", "research")),
        ("google", ("google",)),
        ("scholar", ("scholar", "academic")),
        ("pubmed", ("pubmed", "medical")),
        ("github", ("github", "code", "repository")),
        ("twitter", ("twitter", "tweet")),
        ("reddit", ("reddit",)),
        ("nytimes", ("news", "nytimes")),
    )

    # Captures the video id from a watch/short URL. Case-insensitive because
    # _is_youtube_video_question matches the URL on a lower-cased query.
    _YOUTUBE_ID_RE = re.compile(
        r'(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]+)',
        re.IGNORECASE,
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None,
                 working_memory: Optional["WorkingMemory"] = None):
        """
        Initialize the unified browser search tool with memory integration.

        Args:
            config: Optional configuration dictionary. When falsy, the
                "browser_search" section of get_tool_config() is used.
            working_memory: Optional WorkingMemory instance for result storage
        """
        self.config = config or get_tool_config().get("browser_search", {})

        self.working_memory = working_memory
        # Random per-instance id; used in memory keys when no test_id is given.
        self.session_id = str(uuid.uuid4())

        # Helper tools consulted by unified_search. This class never
        # populates them; it only reads them.
        self.fallback_tools = []
        self.perplexity_tool = None

        # Per-instance copy so one tool's templates can be changed in isolation.
        self.search_templates = dict(self.DEFAULT_SEARCH_TEMPLATES)

        logger.info("BrowserSearchTool initialized with memory integration")

    def set_working_memory(self, working_memory: "WorkingMemory", session_id: Optional[str] = None):
        """
        Set or update the working memory instance for this tool.

        Args:
            working_memory: WorkingMemory instance
            session_id: Optional session ID for memory tracking; the current
                session ID is kept when this is falsy
        """
        self.working_memory = working_memory
        if session_id:
            self.session_id = session_id
        logger.info(f"BrowserSearchTool memory integration set: session_id={self.session_id}")

    def _store_event(self, key: str, payload: Dict[str, Any], metadata: Dict[str, Any],
                     description: str, error_description: Optional[str] = None,
                     verify: bool = False, log_traceback: bool = True) -> bool:
        """
        Write one record to working memory without ever raising.

        Args:
            key: Memory key to store under
            payload: Record passed to store_intermediate_result
            metadata: Metadata dict passed alongside the record
            description: Wording used in the "Storing ... in memory" log line
            error_description: Wording for the failure log line when it
                differs from ``description``
            verify: Whether to call _verify_memory_storage after storing
            log_traceback: Whether a failure also logs the traceback

        Returns:
            True if the record was stored, False if there is no working
            memory or storing failed
        """
        if not self.working_memory:
            return False

        try:
            logger.info(f"Storing {description} in memory: key={key}")
            self.working_memory.store_intermediate_result(key, payload, metadata)
            if verify:
                self._verify_memory_storage(key)
            return True
        except Exception as e:
            # Memory is auxiliary: a storage failure must not break the search.
            logger.error(f"Error storing {error_description or description} in memory: {str(e)}")
            if log_traceback:
                logger.error(traceback.format_exc())
            return False

    def search(self, query: str, source: Optional[str] = None, test_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Search a specific website or determine the best site based on the query.
        This method is designed to be used with the browser_action tool.

        No network request is made here: the result only carries the search
        URL and instructions for opening it.

        Args:
            query: The search query
            source: Optional specific source to search (e.g., "wikipedia",
                "arxiv", "nytimes"); matched case-insensitively. A source
                without a template gets the Google search URL.
            test_id: Optional test ID for memory tracking

        Returns:
            List with one search result containing browser_action
            instructions, or one error result (with an "error" key) if
            building the result failed
        """
        start_time = time.time()
        current_time = int(start_time)
        tracking_id = test_id or self.session_id
        is_test = bool(test_id)
        memory_key = f"browser_search_{tracking_id}_{current_time}"

        self._store_event(
            memory_key,
            {
                "action": "search_start",
                "query": query,
                "tool": "browser_search",
                "source": source,
                "timestamp": current_time,
                "test_id": tracking_id,
            },
            {"test": is_test, "timestamp": current_time, "final": False},
            "browser search start",
        )

        try:
            # Percent-encode the query so characters such as "&", "#" or "+"
            # cannot alter the structure of the search URL.
            search_term = quote_plus(query)

            # Templates are keyed in lower case; normalize an explicit source
            # so e.g. "Wikipedia" does not silently fall back to Google.
            if source:
                source = source.strip().lower()
            if not source:
                source = self._detect_source_from_query(query)

            search_url = self._get_search_url(source, search_term)
            instructions = self._get_instructions_for_source(source)
            display_name = source.title()

            results = [{
                "title": f"{display_name} Search: {query}",
                "link": search_url,
                "snippet": f"To search {display_name} for '{query}', use the browser_action tool to open the link.",
                "source": source,
                "relevance_score": 10.0,
                "instructions": instructions,
            }]

            if self.working_memory:
                self._store_event(
                    f"{memory_key}_results",
                    {
                        "action": "search_results",
                        "query": query,
                        "tool": "browser_search",
                        "source": source,
                        "test_id": tracking_id,
                        "results": results,
                        "search_url": search_url,
                        "timestamp": int(time.time()),
                        "search_time": time.time() - start_time,
                    },
                    {"test": is_test, "timestamp": time.time(), "final": True},
                    "browser search results",
                    verify=True,
                )

            return results

        except Exception as e:
            error_msg = f"Error in BrowserSearchTool: {str(e)}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())

            self._store_event(
                f"{memory_key}_error",
                {
                    "action": "search_error",
                    "query": query,
                    "tool": "browser_search",
                    "source": source,
                    "error": str(e),
                    "test_id": tracking_id,
                    "timestamp": int(time.time()),
                },
                {"test": is_test, "timestamp": time.time(), "error": True, "final": True},
                "browser search error",
                error_description="search error",
                log_traceback=False,
            )

            return [{
                "title": "Browser Search Error",
                "link": "https://www.google.com",
                "snippet": f"Error searching: {str(e)}",
                "source": source or "unknown",
                "relevance_score": 0.0,
                "error": str(e),
            }]

    def _verify_memory_storage(self, memory_key: str) -> bool:
        """
        Verify that data was correctly stored in memory.

        A stored key counts as a match when ``memory_key`` is a substring of it.

        Args:
            memory_key: The memory key to verify

        Returns:
            True if verification succeeded, False otherwise (including when
            no working memory is attached or listing keys fails)
        """
        if not self.working_memory:
            return False

        try:
            all_keys = self.working_memory.memory.list_keys()
            matching_keys = [k for k in all_keys if memory_key in k]

            if matching_keys:
                logger.info(f"Memory storage verified: found key {matching_keys[0]}")
                return True

            logger.warning(f"Memory storage verification failed: key {memory_key} not found")
            return False
        except Exception as e:
            logger.error(f"Error verifying memory storage: {str(e)}")
            return False

    def _detect_source_from_query(self, query: str) -> str:
        """
        Detect the most appropriate source based on the query content.

        Matching is by substring on the lower-cased query. A few specific
        combinations are checked first; then the ordered keyword rules apply.

        Args:
            query: The search query

        Returns:
            String identifying the best source for this query ("google" when
            nothing matches)
        """
        query_lower = query.lower()

        def mentions(*needles: str) -> bool:
            return any(needle in query_lower for needle in needles)

        # Special cases take priority over the generic keyword rules below.
        if mentions("spinosaurus") and mentions("wikipedia", "wiki"):
            return "wikipedia"
        if mentions("universe today") or (mentions("nasa") and mentions("award")):
            return "universetoday"
        if mentions("mercedes sosa") and mentions("albums"):
            return "google"
        if mentions("malko"):
            return "malko"

        for source, keywords in self._KEYWORD_SOURCES:
            if mentions(*keywords):
                return source

        return "google"

    def _get_search_url(self, source: str, query: str) -> str:
        """
        Get the search URL for the given source and query.

        Args:
            source: The source to search (e.g., "wikipedia", "arxiv")
            query: The formatted (already URL-encoded) search query

        Returns:
            The complete search URL; the Google template is used when
            ``source`` has no template of its own
        """
        template = self.search_templates.get(source, self.search_templates["google"])
        return template.replace("{query}", query)

    def _get_instructions_for_source(self, source: str) -> str:
        """
        Get browser_action instructions for the given source.

        Args:
            source: The source to get instructions for

        Returns:
            Instructions for using browser_action with this source; a generic
            sentence naming the source when no specific text exists
        """
        return self._SOURCE_INSTRUCTIONS.get(
            source, f"Use browser_action to open the {source} search results."
        )

    def _is_youtube_video_question(self, query: str) -> bool:
        """
        Determine if a query is specifically asking about a YouTube video.

        Args:
            query: The search query

        Returns:
            True if the query contains a YouTube watch/short URL or one of
            the phrases "youtube video", "youtube transcript",
            "youtube channel" (case-insensitive), False otherwise
        """
        query_lower = query.lower()

        if "youtube.com/watch" in query_lower or "youtu.be/" in query_lower:
            return True

        youtube_keywords = ["youtube video", "youtube transcript", "youtube channel"]
        return any(keyword in query_lower for keyword in youtube_keywords)

    def _youtube_transcript_results(self, query: str) -> Optional[List[Dict[str, Any]]]:
        """Return a transcript result for a video URL in the query, or None if unavailable."""
        youtube_tool = next(
            (tool for tool in self.fallback_tools
             if tool.__class__.__name__ == "YouTubeVideoTool"),
            None,
        )
        if youtube_tool is None:
            return None

        try:
            logger.info(f"Using YouTube tool for query: {query}")

            video_id_match = self._YOUTUBE_ID_RE.search(query)
            if not video_id_match:
                return None

            video_id = video_id_match.group(1)
            transcript = youtube_tool.extract_transcript(video_id)
            if not transcript:
                # Nothing usable came back; let the other tools have a go.
                return None

            return [{
                "title": f"YouTube Video Transcript: {video_id}",
                "link": f"https://www.youtube.com/watch?v={video_id}",
                "snippet": transcript[:500] + "..." if len(transcript) > 500 else transcript,
                "source": "youtube",
                "relevance_score": 10.0,
                "full_content": transcript,
            }]
        except Exception as e:
            logger.warning(f"YouTube tool failed: {str(e)}")
            return None

    def _perplexity_results(self, query: str) -> Optional[List[Dict[str, Any]]]:
        """Return a result built from perplexity_tool's "content" answer, or None."""
        try:
            logger.info(f"Using Perplexity for query: {query}")
            perplexity_results = self.perplexity_tool.search(query)

            # Only a dict response carrying a "content" entry is usable.
            if not (perplexity_results
                    and isinstance(perplexity_results, dict)
                    and "content" in perplexity_results):
                return None

            content = perplexity_results["content"]
            return [{
                "title": "Perplexity AI Search Result",
                "link": "https://perplexity.ai/",
                "snippet": content[:500] + "..." if len(content) > 500 else content,
                "source": "perplexity",
                "relevance_score": 10.0,
                "full_content": content,
            }]
        except Exception as e:
            logger.warning(f"Perplexity search failed: {str(e)}")
            return None

    def _first_fallback_results(self, query: str) -> Any:
        """Return the first truthy ``search(query)`` result among fallback_tools, or None."""
        for tool in self.fallback_tools:
            try:
                tool_results = tool.search(query)
                if tool_results:
                    return tool_results
            except Exception as e:
                logger.warning(f"Fallback search tool failed: {str(e)}")
        return None

    def unified_search(self, query: str, test_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Search for the given query using the most appropriate search tools.

        Tools are tried in this order, stopping at the first that yields
        results:
        1. For YouTube video questions, the "YouTubeVideoTool" found in
           fallback_tools (transcript of the video URL in the query)
        2. perplexity_tool, when one is set
        3. Each tool in fallback_tools, in order, via its search() method
        4. A Google search link to open with browser_action

        Args:
            query: The search query
            test_id: Optional test ID for memory tracking

        Returns:
            List of search results; on an unexpected error, a single Google
            fallback result that carries the error message
        """
        start_time = time.time()
        current_time = int(start_time)
        tracking_id = test_id or self.session_id
        is_test = bool(test_id)
        memory_key = f"browser_unified_search_{tracking_id}_{current_time}"

        self._store_event(
            memory_key,
            {
                "action": "unified_search_start",
                "query": query,
                "tool": "browser_unified_search",
                "timestamp": current_time,
                "test_id": tracking_id,
            },
            {"test": is_test, "timestamp": current_time, "final": False},
            "unified search start",
        )

        try:
            results = None

            if self._is_youtube_video_question(query):
                results = self._youtube_transcript_results(query)

            if not results and self.perplexity_tool:
                results = self._perplexity_results(query)

            if not results:
                results = self._first_fallback_results(query)

            if not results:
                logger.warning(f"All search tools failed for query: {query}")
                results = [{
                    "title": f"Google Search: {query}",
                    "link": self._get_search_url("google", quote_plus(query)),
                    "snippet": f"All search tools failed, use browser_action to open Google search for '{query}'.",
                    "source": "google",
                    "relevance_score": 1.0,
                    "fallback": True,
                }]

            if self.working_memory:
                # Fallback tools may return something without a length.
                results_count = len(results) if hasattr(results, "__len__") else None
                self._store_event(
                    f"{memory_key}_results",
                    {
                        "action": "unified_search_results",
                        "query": query,
                        "tool": "browser_unified_search",
                        "test_id": tracking_id,
                        "results": results,
                        "results_count": results_count,
                        "timestamp": int(time.time()),
                        "search_time": time.time() - start_time,
                    },
                    {"test": is_test, "timestamp": time.time(), "final": True},
                    "unified search results",
                    verify=True,
                )

            return results

        except Exception as e:
            error_msg = f"Error in unified_search: {str(e)}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())

            self._store_event(
                f"{memory_key}_error",
                {
                    "action": "unified_search_error",
                    "query": query,
                    "tool": "browser_unified_search",
                    "error": str(e),
                    "test_id": tracking_id,
                    "timestamp": int(time.time()),
                },
                {"test": is_test, "timestamp": time.time(), "error": True, "final": True},
                "unified search error",
                log_traceback=False,
            )

            # str() keeps this handler from raising again when a non-string
            # query is what caused the failure in the first place.
            return [{
                "title": f"Google Search: {query}",
                "link": f"https://www.google.com/search?q={quote_plus(str(query))}",
                "snippet": f"Search tools failed with error: {str(e)}. Try Google search instead.",
                "source": "google",
                "relevance_score": 1.0,
                "error": str(e),
                "fallback": True,
            }]


def create_browser_search(working_memory: Optional[WorkingMemory] = None,
                          session_id: Optional[str] = None) -> BrowserSearchTool:
    """
    Create a BrowserSearchTool instance with memory integration.

    Args:
        working_memory: Optional WorkingMemory instance
        session_id: Optional session ID for memory tracking. It is applied
            whether or not a working memory is supplied; when omitted the
            tool keeps the random session ID it generated for itself.

    Returns:
        Initialized BrowserSearchTool
    """
    tool = BrowserSearchTool()

    if working_memory:
        # set_working_memory keeps the tool's own session ID when session_id
        # is falsy, so no separate branch is needed for that case.
        tool.set_working_memory(working_memory, session_id)
    elif session_id:
        # Honor the requested session ID even without memory, so memory
        # attached later is keyed by it.
        tool.session_id = session_id

    return tool