# Scaper_search/search.py
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import re
from functools import lru_cache
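# NOTE: assumes the googlesearch-python, requests, beautifulsoup4 and lxml
# packages are installed (lxml is required by the 'lxml' parser used below).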
# Clean HTML tags
TAG_CLEANER = re.compile(r"<[^>]+>")


@lru_cache(maxsize=500)
def extract_metadata(url):
    """Extract the title and meta description for a URL."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=5, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # Fall back to the URL when the page has no usable <title> text
        title = soup.title.string.strip() if soup.title and soup.title.string else url
        title = title[:200]

        # Try to get the meta description
        description = ""
        if meta_desc := soup.find("meta", attrs={"name": "description"}):
            description = meta_desc.get("content", "")[:300]

        return {
            "url": url,
            "title": title,
            "description": description
        }
    except Exception as e:
        return {"url": url, "title": f"Error: {str(e)[:30]}", "description": ""}


def search_google(query, num_results=5):
    """Search Google and return title/description metadata for the top results."""
    try:
        # Get plain result URLs (advanced=False yields URL strings, not result objects)
        urls = list(search(query, num_results=num_results, advanced=False))

        # Extract metadata for each URL
        results = []
        for url in urls[:num_results]:
            if metadata := extract_metadata(url):
                results.append(metadata)
        return results
    except Exception as e:
        print(f"Search error: {e}")
        return []
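

# Minimal usage sketch (the query string below is just an example, not part of
# the original module):
if __name__ == "__main__":
    for result in search_google("python web scraping", num_results=3):
        print(result["title"])
        print(result["url"])
        print(result["description"])
        print("-" * 40)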