"""Google search helper that enriches each result URL with page metadata."""

import re
from functools import lru_cache

import requests
from bs4 import BeautifulSoup
from googlesearch import search

# Clean HTML tags
TAG_CLEANER = re.compile(r"<[^>]+>")

# Headers sent with every metadata request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
    'Accept-Language': 'en-US,en;q=0.9'
}


@lru_cache(maxsize=500)
def _fetch_metadata(url):
    """Download *url* and return its ``(title, description)`` as a tuple.

    Any network, HTTP-status or parsing error propagates to the caller.
    ``lru_cache`` only stores values that are returned, so failed fetches are
    never cached and will be retried on the next call. The result is a tuple
    (immutable) so a cached entry cannot be altered by callers.
    """
    response = requests.get(url, timeout=5, headers=_REQUEST_HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Use get_text() instead of .string: .string is None when <title> is
    # empty or contains child tags, which previously raised AttributeError
    # and turned a successfully fetched page into an "Error: ..." result.
    title_tag = soup.title
    title = title_tag.get_text(strip=True) if title_tag else ""
    # Fall back to the URL when the page has no usable title.
    title = (title or url)[:200]

    # Try to get description
    description = ""
    if meta_desc := soup.find("meta", attrs={"name": "description"}):
        description = meta_desc.get("content", "")[:300]

    return title, description


def extract_metadata(url):
    """Extract title and description from URL.

    Args:
        url: Address of the page to fetch.

    Returns:
        A new dict with keys ``"url"``, ``"title"`` (at most 200 characters;
        the URL itself when the page has no usable title) and
        ``"description"`` (at most 300 characters, ``""`` when the page has
        no ``<meta name="description">``). On any failure the title is
        ``"Error: "`` followed by the first 30 characters of the exception
        message and the description is empty.

    Successful lookups are memoised (up to 500 URLs); failures are not, so a
    transient error does not stick to a URL.
    """
    try:
        title, description = _fetch_metadata(url)
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup and callers expect
        # an error-shaped result rather than an exception.
        return {"url": url, "title": f"Error: {str(e)[:30]}", "description": ""}

    # Build a fresh dict on every call so callers cannot mutate cached data.
    return {
        "url": url,
        "title": title,
        "description": description
    }


# Keep the cache-management helpers that the previously decorated function
# exposed, so existing callers of extract_metadata.cache_clear() still work.
extract_metadata.cache_info = _fetch_metadata.cache_info
extract_metadata.cache_clear = _fetch_metadata.cache_clear


def search_google(query, num_results=5):
    """Search with enhanced result parsing.

    Args:
        query: Search query string passed to ``googlesearch.search``.
        num_results: Maximum number of results to return.

    Returns:
        A list of metadata dicts as produced by ``extract_metadata``, one per
        result URL. If the search fails, the error is printed and an empty
        list is returned.
    """
    try:
        # Get search results
        urls = list(search(query, num_results=num_results, advanced=False))

        # Extract metadata for each URL; extract_metadata always returns a
        # dict (an error-shaped one on failure), so no filtering is needed.
        return [extract_metadata(url) for url in urls[:num_results]]
    except Exception as e:
        print(f"Search error: {e}")
        return []