# ngibs-ai-search / clean_fetch.py
import re
from typing import Dict, Any

import requests
from bs4 import BeautifulSoup
class DataFetcher:
    def __init__(self):
        self.max_content_length = 2000

    def clean_text(self, text: str) -> str:
        """Clean and format text content."""
        if not text:
            return ""

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Remove citations and brackets
        text = re.sub(r'\[[\d\s,]+\]', '', text)
        text = re.sub(r'\[[^\]]*\]', '', text)

        # Remove parenthetical content
        text = re.sub(r'\([^)]*\)', '', text)

        # Clean whitespace
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n+', '\n', text)

        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:\-\n]', '', text)

        return text.strip()
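
    # Example (illustrative only, not part of the original file):
    #   clean_text('<p>Python <b>[1]</b> is a programming language!</p>')
    #   -> 'Python is a programming language!'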
    def fetch_duckduckgo(self, query: str) -> Dict[str, Any]:
        """Fetch search results from DuckDuckGo using scraping method."""
        try:
            print(f"🔍 Searching DuckDuckGo for: {query}")

            # Use DuckDuckGo search URL
            search_url = f"https://duckduckgo.com/html/?q={requests.utils.quote(query)}"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            response = requests.get(search_url, headers=headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract search results
            results = []
            result_divs = soup.find_all('div', class_='result')[:3]  # Get top 3 results

            for div in result_divs:
                title_elem = div.find('a', class_='result__a')
                snippet_elem = div.find('a', class_='result__snippet')

                if title_elem and snippet_elem:
                    title = title_elem.get_text().strip()
                    snippet = snippet_elem.get_text().strip()
                    results.append(f"{title}: {snippet}")

            if not results:
                # Fallback: try to get any text content from the page
                text_content = soup.get_text()
                if len(text_content) > 100:
                    results.append(text_content[:500])
                else:
                    return {
                        "success": False,
                        "content": "",
                        "source": "duckduckgo",
                        "error": "No search results found"
                    }

            # Combine results
            full_content = "\n".join(results)
            cleaned_content = self.clean_text(full_content)

            # Limit content length
            if len(cleaned_content) > self.max_content_length:
                cleaned_content = cleaned_content[:self.max_content_length] + "..."

            print(f"✅ Found {len(results)} search results")

            return {
                "success": True,
                "content": cleaned_content,
                "source": "duckduckgo",
                "url": search_url,
                "error": None
            }

        except Exception as e:
            print(f"❌ DuckDuckGo search failed: {str(e)}")
            return {
                "success": False,
                "content": "",
                "source": "duckduckgo",
                "error": f"Search failed: {str(e)}"
            }
    def fetch_simple_search(self, query: str) -> Dict[str, Any]:
        """Fallback simple search method."""
        try:
            # Return a simulated result for testing when live search is unavailable
            content = (
                f"Search results for: {query}\n\n"
                "This is a simulated search result. The actual web search will "
                "provide real-time information from the internet."
            )
            return {
                "success": True,
                "content": content,
                "source": "simple_search",
                "url": "",
                "error": None
            }
        except Exception as e:
            return {
                "success": False,
                "content": "",
                "source": "simple_search",
                "error": f"Simple search failed: {str(e)}"
            }
# Global instance
data_fetcher = DataFetcher()
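
# Minimal usage sketch (an assumption about how this module is driven, not part
# of the original file): try the live DuckDuckGo search first, then fall back to
# the simulated search if it fails. Requires network access for the live path.
if __name__ == "__main__":
    result = data_fetcher.fetch_duckduckgo("latest python release")
    if not result["success"]:
        result = data_fetcher.fetch_simple_search("latest python release")
    print(result["source"])
    print(result["content"][:200])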