import json
import re
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup


class DataFetcher:
    """Fetch short text snippets from web search for a query.

    Every ``fetch_*`` method returns a dict with the same keys:
    ``success`` (bool), ``content`` (str), ``source`` (str), ``url`` (str)
    and ``error`` (``None`` on success, otherwise a message).
    """

    def __init__(self, max_content_length: int = 2000):
        """Create a fetcher.

        Args:
            max_content_length: Number of characters of cleaned content kept
                before a ``"..."`` suffix is appended.
        """
        self.max_content_length = max_content_length

    @staticmethod
    def _success(content: str, source: str, url: str = "") -> Dict[str, Any]:
        """Build the result dict for a successful fetch."""
        return {
            "success": True,
            "content": content,
            "source": source,
            "url": url,
            "error": None,
        }

    @staticmethod
    def _failure(source: str, error: str, url: str = "") -> Dict[str, Any]:
        """Build the result dict for a failed fetch (same keys as success)."""
        return {
            "success": False,
            "content": "",
            "source": source,
            "url": url,
            "error": error,
        }

    @staticmethod
    def _extract_results(soup: Any, max_results: int) -> list:
        """Collect up to ``max_results`` ``"title: snippet"`` strings from a parsed page."""
        results = []
        for div in soup.find_all('div', class_='result'):
            if len(results) >= max_results:
                break
            title_elem = div.find('a', class_='result__a')
            snippet_elem = div.find('a', class_='result__snippet')
            # Skip entries missing either part instead of letting them use up
            # one of the available slots.
            if title_elem and snippet_elem:
                title = title_elem.get_text().strip()
                snippet = snippet_elem.get_text().strip()
                results.append(f"{title}: {snippet}")
        return results

    def clean_text(self, text: str) -> str:
        """Clean and format text content.

        Strips HTML tags, bracketed and parenthetical content, and any
        character that is not a word character, whitespace or one of
        ``.,!?;:-``. Runs of horizontal whitespace collapse to one space and
        runs of line breaks collapse to one newline, so line structure is
        preserved.

        Args:
            text: Raw text; ``None`` or an empty string is accepted.

        Returns:
            The cleaned text, or ``""`` when ``text`` is falsy.
        """
        if not text:
            return ""

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Remove citations and brackets
        text = re.sub(r'\[[\d\s,]+\]', '', text)
        text = re.sub(r'\[[^\]]*\]', '', text)

        # Remove parenthetical content
        text = re.sub(r'\([^)]*\)', '', text)

        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:\-\n]', '', text)

        # Normalise whitespace last so the removals above cannot leave double
        # spaces behind. Newlines are excluded from the first pattern so that
        # line breaks between results are kept.
        text = re.sub(r'[^\S\n]+', ' ', text)
        text = re.sub(r' ?\n[ \n]*', '\n', text)

        return text.strip()

    def fetch_duckduckgo(self, query: str, max_results: int = 3) -> Dict[str, Any]:
        """Fetch search results from DuckDuckGo using scraping method.

        Args:
            query: Search terms. An empty or whitespace-only query returns a
                failure result without any network request.
            max_results: Maximum number of result entries to combine.

        Returns:
            A result dict (see the class docstring). Errors, including
            network and HTTP errors, are reported through ``error`` rather
            than raised.
        """
        search_url = ""
        try:
            if not query or not query.strip():
                return self._failure("duckduckgo", "Empty search query")

            print(f"🔍 Searching DuckDuckGo for: {query}")

            # Use DuckDuckGo search URL
            search_url = f"https://duckduckgo.com/html/?q={requests.utils.quote(query)}"

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            response = requests.get(search_url, headers=headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            results = self._extract_results(soup, max_results)

            if not results:
                # Fallback: Try to get any text content
                text_content = soup.get_text()
                if len(text_content) > 100:
                    results.append(text_content[:500])
                else:
                    return self._failure(
                        "duckduckgo", "No search results found", search_url
                    )

            # Combine results
            cleaned_content = self.clean_text("\n".join(results))

            # Limit content length (the "..." marker is appended after the cut)
            if len(cleaned_content) > self.max_content_length:
                cleaned_content = cleaned_content[:self.max_content_length] + "..."

            print(f"✅ Found {len(results)} search results")

            return self._success(cleaned_content, "duckduckgo", search_url)

        except Exception as e:
            # Boundary handler: callers get an error dict, never an exception.
            print(f"❌ DuckDuckGo search failed: {str(e)}")
            return self._failure(
                "duckduckgo", f"Search failed: {str(e)}", search_url
            )

    def fetch_simple_search(self, query: str) -> Dict[str, Any]:
        """Fallback simple search method.

        Performs no network access; returns a fixed placeholder text that
        embeds ``query``.

        Args:
            query: Search terms echoed into the placeholder content.

        Returns:
            A result dict (see the class docstring) with an empty ``url``.
        """
        try:
            # Use a simple search service or return a mock result for testing
            content = f"Search results for: {query}\n\nThis is a simulated search result. The actual web search will provide real-time information from the internet."
            return self._success(content, "simple_search")
        except Exception as e:
            return self._failure(
                "simple_search", f"Simple search failed: {str(e)}"
            )


# Global instance
data_fetcher = DataFetcher()