# Spaces: Running
import json
import re
from typing import Dict, Any

import requests
from bs4 import BeautifulSoup
class DataFetcher:
    """Fetch search-result text for a query and normalise it to plain text.

    Every ``fetch_*`` method returns a dict with the same five keys on all
    paths: ``success`` (bool), ``content`` (str), ``source`` (str),
    ``url`` (str, possibly empty) and ``error`` (``None`` on success,
    otherwise a human-readable message). The fetch methods report failures
    through that dict instead of raising.
    """

    # Patterns used by clean_text, compiled once for the whole class.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    # Numeric citations such as "[1]" or "[2, 3]". Applied before the general
    # bracket pattern so a citation nested inside other brackets is removed
    # first and the enclosing pair can then match as a whole.
    _CITATION_RE = re.compile(r'\[[\d\s,]+\]')
    _BRACKETED_RE = re.compile(r'\[[^\]]*\]')
    _PARENTHETICAL_RE = re.compile(r'\([^)]*\)')
    # Everything except word characters, whitespace and basic punctuation.
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,!?;:\-\n]')
    _WHITESPACE_RE = re.compile(r'\s+')

    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
    _REQUEST_TIMEOUT = 15  # seconds, passed to requests.get
    _MAX_RESULTS = 3  # number of title/snippet pairs to keep
    _FALLBACK_MIN_CHARS = 100  # page text must exceed this to be used as fallback
    _FALLBACK_MAX_CHARS = 500  # fallback page text is cut to this many characters

    def __init__(self, max_content_length: int = 2000):
        """Create a fetcher.

        Args:
            max_content_length: Maximum number of characters of cleaned
                content returned by ``fetch_duckduckgo`` before an ellipsis
                is appended.
        """
        self.max_content_length = max_content_length

    @staticmethod
    def _result(success: bool, content: str, source: str,
                url: str = "", error: Any = None) -> Dict[str, Any]:
        """Build a result dict so every return path carries the same keys."""
        return {
            "success": success,
            "content": content,
            "source": source,
            "url": url,
            "error": error,
        }

    def clean_text(self, text: str) -> str:
        """Clean and format text content.

        Removes HTML tags, square-bracketed spans (including numeric
        citations), parenthesised spans and any character that is not a word
        character, whitespace or one of ``. , ! ? ; : -``. All whitespace
        runs, newlines included, are then collapsed to a single space.

        Args:
            text: Raw text; ``None`` or an empty string is accepted.

        Returns:
            The cleaned single-line string, or ``""`` for falsy input.
        """
        if not text:
            return ""

        text = self._HTML_TAG_RE.sub('', text)
        text = self._CITATION_RE.sub('', text)
        text = self._BRACKETED_RE.sub('', text)
        text = self._PARENTHETICAL_RE.sub('', text)
        text = self._DISALLOWED_CHARS_RE.sub('', text)
        # Collapse whitespace last: each removal above can leave two spaces
        # side by side (e.g. "a & b" -> "a  b"), which this pass merges.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def fetch_duckduckgo(self, query: str) -> Dict[str, Any]:
        """Fetch search results from DuckDuckGo using scraping method.

        Requests the DuckDuckGo HTML endpoint, keeps the first
        ``_MAX_RESULTS`` results that have both a title and a snippet, joins
        them, cleans them with ``clean_text`` and truncates the result to
        ``max_content_length`` characters (followed by ``"..."``). When no
        result is found, the first characters of the page text are used
        instead, provided the page text is long enough.

        Args:
            query: The search string.

        Returns:
            A result dict (see the class docstring). ``url`` is the search
            URL once it has been built, otherwise ``""``.
        """
        search_url = ""
        try:
            print(f"π Searching DuckDuckGo for: {query}")

            # Use DuckDuckGo search URL
            search_url = f"https://duckduckgo.com/html/?q={requests.utils.quote(query)}"
            response = requests.get(
                search_url,
                headers={'User-Agent': self._USER_AGENT},
                timeout=self._REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Collect up to _MAX_RESULTS complete results. Incomplete entries
            # (missing title or snippet) are skipped without using up a slot.
            results = []
            for div in soup.find_all('div', class_='result'):
                title_elem = div.find('a', class_='result__a')
                snippet_elem = div.find('a', class_='result__snippet')
                if title_elem is None or snippet_elem is None:
                    continue
                title = title_elem.get_text().strip()
                snippet = snippet_elem.get_text().strip()
                results.append(f"{title}: {snippet}")
                if len(results) >= self._MAX_RESULTS:
                    break

            if not results:
                # Fallback: Try to get any text content
                text_content = soup.get_text()
                if len(text_content) <= self._FALLBACK_MIN_CHARS:
                    return self._result(
                        False, "", "duckduckgo",
                        url=search_url,
                        error="No search results found",
                    )
                results.append(text_content[:self._FALLBACK_MAX_CHARS])

            cleaned_content = self.clean_text("\n".join(results))

            # Limit content length
            if len(cleaned_content) > self.max_content_length:
                cleaned_content = cleaned_content[:self.max_content_length] + "..."

            print(f"β Found {len(results)} search results")
            return self._result(True, cleaned_content, "duckduckgo", url=search_url)
        except Exception as e:
            # Deliberately broad: callers receive every failure (network,
            # HTTP status, parsing) as an error dict rather than an exception.
            print(f"β DuckDuckGo search failed: {str(e)}")
            return self._result(
                False, "", "duckduckgo",
                url=search_url,
                error=f"Search failed: {str(e)}",
            )

    def fetch_simple_search(self, query: str) -> Dict[str, Any]:
        """Fallback simple search method.

        Performs no network access; it returns a fixed, simulated result
        that embeds ``query``.

        Args:
            query: The search string echoed into the simulated content.

        Returns:
            A result dict (see the class docstring) with ``source`` set to
            ``"simple_search"`` and an empty ``url``.
        """
        try:
            # Use a simple search service or return a mock result for testing
            content = f"Search results for: {query}\n\nThis is a simulated search result. The actual web search will provide real-time information from the internet."
            return self._result(True, content, "simple_search")
        except Exception as e:
            # Kept so this method, like fetch_duckduckgo, never raises.
            return self._result(
                False, "", "simple_search",
                error=f"Simple search failed: {str(e)}",
            )
# Global instance: a module-level DataFetcher created at import time.
data_fetcher = DataFetcher()