|
|
|
import requests
|
|
import os
|
|
from typing import List, Dict, Any, Optional
|
|
from .utils import get_env_var, logger
|
|
|
|
class SearchTools:
    """Free and cost-effective search tools with multiple providers.

    Three back-ends are wrapped behind one result shape: DuckDuckGo (no key),
    Tavily (key read from ``TAVILY_API_KEY``) and SerpAPI (key read from
    ``SERPAPI_KEY``).  Every result is a dict with the keys ``title``,
    ``url``, ``content`` and ``source``.

    The provider methods log request/parse failures and return an empty list
    instead of raising, which is what lets ``search`` fall through to the
    next provider.
    """

    def __init__(self):
        # Consulted by ``search``; set to False to skip the DuckDuckGo step.
        self.duckduckgo_enabled = True

        # Optional keys: a missing key simply disables that provider.
        self.tavily_api_key = os.getenv("TAVILY_API_KEY")
        self.serpapi_key = os.getenv("SERPAPI_KEY")

    # ------------------------------------------------------------------
    # Pure helpers (no network, no logging)
    # ------------------------------------------------------------------

    @staticmethod
    def _redact(message: str, *secrets: Optional[str]) -> str:
        """Return ``message`` with every given secret replaced by a marker.

        Exception text produced by ``requests`` embeds the request URL, and
        SerpAPI receives its key as a query parameter, so the raw exception
        text must not be logged as-is.  Both the literal secret and its
        URL-encoded form are masked.
        """
        from urllib.parse import quote_plus

        for secret in secrets:
            if not secret:
                continue
            # Encoded form first so a partially matching raw replacement
            # cannot leave an encoded tail behind.
            for form in (quote_plus(secret), secret):
                message = message.replace(form, "[REDACTED]")
        return message

    @staticmethod
    def _parse_duckduckgo(data: Any, max_results: int) -> List[Dict[str, Any]]:
        """Convert a decoded DuckDuckGo response into at most ``max_results`` results.

        The abstract (when non-empty) comes first, followed by related topics
        that carry a ``Text`` field.  Entries without ``Text`` are skipped
        *before* counting, so they no longer use up result slots.
        """
        limit = max(max_results, 0)
        if limit == 0 or not isinstance(data, dict):
            return []

        results = []
        if data.get('Abstract'):
            results.append({
                'title': data.get('Heading', 'DuckDuckGo Result'),
                'url': data.get('AbstractURL', ''),
                'content': data.get('Abstract', ''),
                'source': 'DuckDuckGo',
            })

        for topic in data.get('RelatedTopics') or []:
            if len(results) >= limit:
                break
            if not isinstance(topic, dict) or 'Text' not in topic:
                continue
            # A null ``Text`` would otherwise break the slice below and
            # discard every result already collected.
            text = topic.get('Text') or ''
            results.append({
                'title': text[:100],
                'url': topic.get('FirstURL', ''),
                'content': text,
                'source': 'DuckDuckGo',
            })

        return results

    @staticmethod
    def _normalize_results(items: Any, url_key: str, content_key: str,
                           source: str, max_results: int) -> List[Dict[str, Any]]:
        """Map provider result dicts onto the common shape, capped at ``max_results``.

        ``url_key`` / ``content_key`` name the provider-specific fields that
        hold the link and the text snippet.  Non-dict entries are ignored.
        """
        limit = max(max_results, 0)
        normalized = [
            {
                'title': item.get('title', ''),
                'url': item.get(url_key, ''),
                'content': item.get(content_key, ''),
                'source': source,
            }
            for item in (items or [])
            if isinstance(item, dict)
        ]
        return normalized[:limit]

    # ------------------------------------------------------------------
    # Providers
    # ------------------------------------------------------------------

    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Free search using DuckDuckGo Instant Answer API

        Args:
            query: Search query
            max_results: Maximum number of results

        Returns:
            List of search results (empty on any failure, which is logged)
        """
        try:
            response = requests.get(
                "https://api.duckduckgo.com/",
                params={
                    'q': query,
                    'format': 'json',
                    'no_html': '1',
                    'skip_disambig': '1',
                },
                timeout=10,
            )
            response.raise_for_status()
            return self._parse_duckduckgo(response.json(), max_results)
        except Exception as e:
            # Deliberately broad: this is a best-effort provider and callers
            # rely on getting a list back rather than an exception.
            logger.error(f"DuckDuckGo search failed: {str(e)}")
            return []

    def search_tavily(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Search using Tavily API (cost-effective)

        Args:
            query: Search query
            max_results: Maximum number of results

        Returns:
            List of search results; empty when no API key is configured or
            when the request fails (both cases are logged)
        """
        if not self.tavily_api_key:
            logger.warning("Tavily API key not provided")
            return []

        try:
            payload = {
                "api_key": self.tavily_api_key,
                "query": query,
                "search_depth": "basic",
                "include_answer": False,
                "include_images": False,
                "include_raw_content": False,
                "max_results": max_results,
            }
            response = requests.post("https://api.tavily.com/search", json=payload, timeout=15)
            response.raise_for_status()

            data = response.json()
            # Cap locally as well so all providers honour max_results alike.
            return self._normalize_results(
                data.get('results'), 'url', 'content', 'Tavily', max_results
            )
        except Exception as e:
            detail = self._redact(str(e), self.tavily_api_key)
            logger.error(f"Tavily search failed: {detail}")
            return []

    def search_serpapi(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Search using SerpAPI (expensive, fallback only)

        Args:
            query: Search query
            max_results: Maximum number of results

        Returns:
            List of search results; empty when no API key is configured or
            when the request fails (both cases are logged)
        """
        if not self.serpapi_key:
            logger.warning("SerpAPI key not provided")
            return []

        try:
            params = {
                'api_key': self.serpapi_key,
                'engine': 'google',
                'q': query,
                'num': max_results,
                'gl': 'us',
                'hl': 'en',
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()

            data = response.json()
            # Cap locally as well so all providers honour max_results alike.
            return self._normalize_results(
                data.get('organic_results'), 'link', 'snippet',
                'Google (SerpAPI)', max_results
            )
        except Exception as e:
            # The key travels in the query string, and request errors quote
            # the URL, so mask it before it reaches the log.
            detail = self._redact(str(e), self.serpapi_key)
            logger.error(f"SerpAPI search failed: {detail}")
            return []

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------

    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Comprehensive search using multiple providers with fallback strategy

        Providers are tried in order DuckDuckGo -> Tavily -> SerpAPI and the
        first non-empty result list wins.  DuckDuckGo is skipped when
        ``duckduckgo_enabled`` is false.

        Args:
            query: Search query
            max_results: Maximum number of results

        Returns:
            List of search results from best available provider; empty for a
            blank or non-string query, or when every provider comes back empty
        """
        if not isinstance(query, str) or not query.strip():
            return []

        providers = []
        # getattr keeps subclasses that skip __init__ working as before.
        if getattr(self, 'duckduckgo_enabled', True):
            providers.append(("DuckDuckGo", self.search_duckduckgo))
        providers.append(("Tavily", self.search_tavily))
        providers.append(("SerpAPI", self.search_serpapi))

        for provider_name, search_func in providers:
            try:
                logger.info(f"Attempting search with {provider_name}")
                results = search_func(query, max_results)
            except Exception as e:
                # Safety net for overridden provider methods that may raise.
                logger.error(f"Error with {provider_name}: {str(e)}")
                continue

            if results:
                logger.info(f"Successfully retrieved {len(results)} results from {provider_name}")
                return results
            logger.warning(f"No results from {provider_name}")

        logger.error("All search providers failed")
        return []

    def search_news(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """Search for news articles by prefixing the query with ``news``."""
        news_query = f"news {query}"
        return self.search(news_query, max_results)

    def search_academic(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """Search for academic content by adding scholarly keywords and ``site:`` filters."""
        academic_query = f"academic research {query} site:scholar.google.com OR site:arxiv.org OR site:researchgate.net"
        return self.search(academic_query, max_results)
|
|
|
|
|
|
def search_web(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Run a one-off web search without keeping a ``SearchTools`` instance.

    A fresh ``SearchTools`` is built on every call and its ``search`` method
    is invoked with the arguments unchanged.

    Args:
        query: Search query
        max_results: Maximum number of results

    Returns:
        Whatever ``SearchTools.search`` returns for these arguments
    """
    return SearchTools().search(query, max_results)
|
|
|
|
def search_news(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Run a one-off news search without keeping a ``SearchTools`` instance.

    A fresh ``SearchTools`` is built on every call and its ``search_news``
    method is invoked with the arguments unchanged.

    Args:
        query: Search query
        max_results: Maximum number of results

    Returns:
        Whatever ``SearchTools.search_news`` returns for these arguments
    """
    return SearchTools().search_news(query, max_results)
|
|
|