muhammadnoman76 committed
Commit b3b4203 · 1 Parent(s): 958ee21
Files changed (1)
  1. app/services/websearch.py +258 -51
app/services/websearch.py CHANGED
@@ -5,11 +5,12 @@ from bs4 import BeautifulSoup
 import urllib.parse
 import time
 import random
+from urllib.parse import urlparse, parse_qs
 
 warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
 
 class WebSearch:
-    def __init__(self, num_results=4, max_chars_per_page=6000 , max_images=10):
+    def __init__(self, num_results=4, max_chars_per_page=6000, max_images=10):
         self.num_results = num_results
         self.max_chars_per_page = max_chars_per_page
         self.reference = []
@@ -23,13 +24,138 @@ class WebSearch:
             'DNT': '1',
             'Connection': 'keep-alive',
         }
+        # Common domains for direct content
+        self.content_domains = [
+            "wikipedia.org", "webmd.com", "mayoclinic.org", "healthline.com", "nih.gov",
+            "clevelandclinic.org", "nhs.uk", "cdc.gov", "medlineplus.gov", "hopkinsmedicine.org"
+        ]
+        # Ad and tracking domains to filter out
+        self.blocked_domains = [
+            "ad.doubleclick.net", "googleadservices.com", "bing.com/aclick", "duckduckgo.com/y.js",
+            "amazon.com/s", "ads.google.com", "analytics", "tracker", "pixel", "adservice"
+        ]
+
+    def is_valid_url(self, url):
+        """Check if URL is valid and not an ad/tracking URL"""
+        if not url or len(url) < 10:
+            return False
+
+        try:
+            parsed = urlparse(url)
+
+            # Check if URL has a valid scheme and netloc
+            if not all([parsed.scheme, parsed.netloc]):
+                return False
+
+            # Filter out ad/tracking URLs
+            domain = parsed.netloc.lower()
+            path = parsed.path.lower()
+            query = parsed.query.lower()
+
+            # Block URLs containing ad-related indicators
+            for blocked in self.blocked_domains:
+                if blocked in domain or blocked in path:
+                    return False
+
+            # Block URLs with ad-related query parameters
+            if any(param in query for param in ["ad", "click", "track", "clkid", "msclkid"]):
+                return False
+
+            # Extra check for redirect URLs
+            if "redirect" in path or "goto" in path or "go.php" in path:
+                return False
+
+            # Reject extremely long URLs (often tracking)
+            if len(url) > 500:
+                return False
+
+            return True
+
+        except Exception:
+            return False
+
+    def clean_url(self, url):
+        """Clean the URL by removing tracking parameters"""
+        try:
+            parsed = urlparse(url)
+
+            # List of known tracking parameters to remove
+            tracking_params = [
+                'utm_', 'ref_', 'ref=', 'refid', 'fbclid', 'gclid', 'msclkid', 'dclid',
+                'zanpid', 'icid', 'igshid', 'mc_eid', '_hsenc', 'mkt_tok', 'yclid'
+            ]
+
+            # Parse query parameters
+            query_params = parse_qs(parsed.query)
+
+            # Remove tracking parameters
+            filtered_params = {
+                k: v for k, v in query_params.items()
+                if not any(tracker in k.lower() for tracker in tracking_params)
+            }
+
+            # Rebuild query string
+            clean_query = urllib.parse.urlencode(filtered_params, doseq=True) if filtered_params else ""
+
+            # Reconstruct URL
+            clean_url = urllib.parse.urlunparse((
+                parsed.scheme,
+                parsed.netloc,
+                parsed.path,
+                parsed.params,
+                clean_query,
+                ""  # Remove fragment
+            ))
+
+            return clean_url
+
+        except Exception:
+            # If any error occurs, return the original URL
+            return url
+
+    def extract_real_url_from_redirect(self, url):
+        """Extract the actual URL from a redirect URL"""
+        try:
+            parsed = urlparse(url)
+
+            # Handle DuckDuckGo redirects
+            if "duckduckgo.com" in parsed.netloc and "u3=" in parsed.query:
+                params = parse_qs(parsed.query)
+                if "u3" in params and params["u3"]:
+                    redirect_url = params["u3"][0]
+                    # Handle nested redirects (like Bing redirects inside DuckDuckGo)
+                    if "bing.com/aclick" in redirect_url:
+                        bing_parsed = urlparse(redirect_url)
+                        bing_params = parse_qs(bing_parsed.query)
+                        if "u" in bing_params and bing_params["u"]:
+                            decoded_url = urllib.parse.unquote(bing_params["u"][0])
+                            return self.clean_url(decoded_url)
+                    return self.clean_url(redirect_url)
+
+            # Handle Bing redirects
+            if "bing.com/aclick" in url:
+                params = parse_qs(parsed.query)
+                if "u" in params and params["u"]:
+                    return self.clean_url(urllib.parse.unquote(params["u"][0]))
+
+            return url
+
+        except Exception:
+            return url
 
     def extract_text_from_webpage(self, html_content):
         soup = BeautifulSoup(html_content, "html.parser")
-        for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg"]):
+
+        # Remove non-content elements
+        for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg",
+                         "aside", "iframe", "noscript", "img", "figure", "button"]):
             tag.extract()
-        visible_text = soup.get_text(strip=True)
-        return visible_text
+
+        # Extract text and normalize spacing
+        text = ' '.join(soup.stripped_strings)
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        return text
 
     def search(self, query):
         results = []
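The hunk above adds three URL-hygiene helpers to WebSearch. As a quick illustration of what they do (a sketch, not part of the commit: the URLs below are invented, and the import path assumes the app package is importable as app.services.websearch):

    from app.services.websearch import WebSearch

    ws = WebSearch()

    # clean_url() drops known tracking parameters and the fragment, keeping functional parameters
    ws.clean_url("https://example.com/article?id=42&utm_source=news&gclid=abc123")
    # -> "https://example.com/article?id=42"

    # extract_real_url_from_redirect() unwraps DuckDuckGo's u3= redirect wrapper
    ws.extract_real_url_from_redirect("https://duckduckgo.com/y.js?u3=https%3A%2F%2Fexample.com%2Farticle")
    # -> "https://example.com/article"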
@@ -42,32 +168,86 @@ class WebSearch:
 
             response = session.get(url, timeout=10)
             soup = BeautifulSoup(response.text, 'html.parser')
-            search_results = soup.find_all('div', class_='result')[:self.num_results]
+
+            # Getting more results than needed to account for filtering
+            search_results = soup.find_all('div', class_='result')[:self.num_results * 2]
+            links = []
+
+            # Extract and process links
+            for result in search_results:
+                link_tag = result.find('a', class_='result__a')
+                if not link_tag or not link_tag.get('href'):
+                    continue
+
+                original_link = link_tag['href']
+
+                # Process link to get the actual URL
+                clean_link = self.extract_real_url_from_redirect(original_link)
+
+                # Validate the URL
+                if self.is_valid_url(clean_link):
+                    links.append(clean_link)
+
+            # Prioritize content domains
+            prioritized_links = []
+            other_links = []
+
+            for link in links:
+                if any(domain in link for domain in self.content_domains):
+                    prioritized_links.append(link)
+                else:
+                    other_links.append(link)
+
+            # Combine prioritized links first, then others
+            final_links = prioritized_links + other_links
+
+            # Limit to unique links up to num_results
+            unique_links = []
+            seen_domains = set()
+
+            for link in final_links:
+                domain = urlparse(link).netloc
+                if domain not in seen_domains and len(unique_links) < self.num_results:
+                    unique_links.append(link)
+                    seen_domains.add(domain)
 
             from concurrent.futures import ThreadPoolExecutor, as_completed
 
             def fetch_page(link):
                 try:
-                    time.sleep(random.uniform(0.2, 0.5))
-                    page_response = session.get(link, timeout=5)
+                    # Random delay to avoid being blocked
+                    time.sleep(random.uniform(0.5, 1.5))
+
+                    # Set a longer timeout for reliable fetching
+                    page_response = session.get(link, timeout=10, verify=False)
+
+                    # Only process HTML content
+                    if 'text/html' not in page_response.headers.get('Content-Type', ''):
+                        return None
+
                     page_soup = BeautifulSoup(page_response.text, 'lxml')
-
-                    [tag.decompose() for tag in page_soup(['script', 'style', 'header', 'footer', 'nav'])]
-
+
+                    # Remove non-content elements
+                    [tag.decompose() for tag in page_soup(['script', 'style', 'header', 'footer',
+                                                           'nav', 'form', 'iframe', 'noscript'])]
+
+                    # Extract text with better formatting
                     text = ' '.join(page_soup.stripped_strings)
+                    text = re.sub(r'\s+', ' ', text).strip()
+
+                    title = page_soup.title.string if page_soup.title else "Untitled Page"
+
                     return {
                         'link': link,
+                        'title': title,
                         'text': text[:self.max_chars_per_page]
                     }
                 except Exception as e:
+                    print(f"Error fetching {link}: {str(e)}")
                    return None
 
-            links = [result.find('a', class_='result__a')['href']
-                     for result in search_results
-                     if result.find('a', class_='result__a')]
-
-            with ThreadPoolExecutor(max_workers=min(len(links), 4)) as executor:
-                future_to_url = {executor.submit(fetch_page, link): link for link in links}
+            with ThreadPoolExecutor(max_workers=min(len(unique_links), 4)) as executor:
+                future_to_url = {executor.submit(fetch_page, link): link for link in unique_links}
 
             for future in as_completed(future_to_url):
                 result = future.result()
@@ -77,10 +257,12 @@ class WebSearch:
             return results
 
         except Exception as e:
+            print(f"Search error: {str(e)}")
             return []
 
     def search_images(self, query):
         images = []
+        encoded_query = urllib.parse.quote(query)
 
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
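The rewritten search() keeps at most one result per domain and lists content_domains first. A standalone paraphrase of that selection loop, with sample links invented for illustration:

    from urllib.parse import urlparse

    num_results = 4
    final_links = [
        "https://en.wikipedia.org/wiki/Migraine",
        "https://en.wikipedia.org/wiki/Headache",  # same domain, skipped
        "https://www.mayoclinic.org/diseases-conditions/migraine-headache",
    ]

    unique_links, seen_domains = [], set()
    for link in final_links:
        domain = urlparse(link).netloc
        if domain not in seen_domains and len(unique_links) < num_results:
            unique_links.append(link)
            seen_domains.add(domain)

    # unique_links -> one en.wikipedia.org link plus the mayoclinic.org link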
@@ -92,45 +274,70 @@ class WebSearch:
             'Upgrade-Insecure-Requests': '1'
         }
 
-        url = f"https://www.google.com/search?q={query}&tbm=isch&hl=en"
-        response = requests.get(url, headers=headers, verify=False)
-
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        for img in soup.find_all('img'):
-            src = img.get('src', '')
-            if src.startswith('http') and self.is_image_url(src):
-                images.append(src)
-
-        for script in soup.find_all('script'):
-            if script.string:
-                urls = re.findall(r'https?://[^\s<>"\']+?(?:\.(?:jpg|jpeg|png|gif|bmp|webp))', script.string)
-                for url in urls:
-                    if self.is_image_url(url):
-                        images.append(url)
-
-        alternative_url = f"https://www.google.com/search?q={query}&source=lnms&tbm=isch"
-        response = requests.get(alternative_url, headers=headers, verify=False)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        for script in soup.find_all('script'):
-            if script.string and 'AF_initDataCallback' in script.string:
-                matches = re.findall(r'https?://[^\s<>"\']+?(?:\.(?:jpg|jpeg|png|gif|bmp|webp))', script.string)
-                for url in matches:
-                    if self.is_image_url(url):
-                        images.append(url)
-
-        images = [self.clean_url(url) for url in images]
+        # Try multiple sources for better results
+        image_sources = [
+            f"https://www.google.com/search?q={encoded_query}&tbm=isch&hl=en",
+            f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1",
+            f"https://duckduckgo.com/?q={encoded_query}&iar=images&iax=images&ia=images"
+        ]
+
+        for source_url in image_sources:
+            try:
+                time.sleep(random.uniform(0.5, 1.0))  # Polite delay
+                response = requests.get(source_url, headers=headers, verify=False, timeout=10)
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Extract image URLs from img tags
+                for img in soup.find_all('img'):
+                    src = img.get('src', '')
+                    if src and src.startswith('http') and self.is_image_url(src):
+                        cleaned_url = self.clean_url(src)
+                        if self.is_valid_image(cleaned_url):
+                            images.append(cleaned_url)
+
+                # Extract image URLs from scripts (useful for Google Images)
+                for script in soup.find_all('script'):
+                    if script.string:
+                        urls = re.findall(r'https?://[^\s<>"\']+?(?:\.(?:jpg|jpeg|png|gif|bmp|webp))', script.string)
+                        for url in urls:
+                            cleaned_url = self.clean_url(url)
+                            if self.is_valid_image(cleaned_url):
+                                images.append(cleaned_url)
+
+            except Exception as e:
+                print(f"Error searching images at {source_url}: {str(e)}")
+                continue
 
+        # Remove duplicates while preserving order
         seen = set()
-        images = [x for x in images if not (x in seen or seen.add(x))]
-
-        return images[:self.max_images]
+        unique_images = [x for x in images if not (x in seen or seen.add(x))]
+
+        # Filter out small images and suspicious URLs
+        filtered_images = [img for img in unique_images if self.is_valid_image(img)]
+
+        return filtered_images[:self.max_images]
 
     def is_image_url(self, url):
+        """Check if URL points to an image file"""
         image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
         return any(url.lower().endswith(ext) for ext in image_extensions)
 
-    def clean_url(self, url):
-        base_url = url.split('?')[0]
-        return base_url
+    def is_valid_image(self, url):
+        """Additional validation for image URLs"""
+        try:
+            # Reject tiny images (often icons) and tracking pixels
+            if re.search(r'(?:icon|pixel|tracker|thumb|logo|button)\d*\.(?:jpg|png|gif)', url.lower()):
+                return False
+
+            # Avoid suspicious domains for images
+            parsed = urlparse(url)
+            if any(bad in parsed.netloc.lower() for bad in ["tracker", "pixel", "counter", "ad."]):
+                return False
+
+            # Avoid very short URLs (likely not valid images)
+            if len(url) < 30:
+                return False
+
+            return True
+        except:
+            return False
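After this commit, each result from search() carries 'link', 'title', and 'text', and search_images() queries Google, Bing, and DuckDuckGo before deduplicating and filtering. A minimal usage sketch, assuming the module is importable as app.services.websearch and outbound network access is available; the queries are placeholders:

    from app.services.websearch import WebSearch

    searcher = WebSearch(num_results=4, max_chars_per_page=6000, max_images=10)

    # Text search: list of dicts with 'link', 'title', and 'text' (truncated to max_chars_per_page)
    for page in searcher.search("seasonal allergy treatments"):
        print(page['title'], page['link'])

    # Image search: up to max_images cleaned, deduplicated image URLs
    for url in searcher.search_images("pollen grains under microscope"):
        print(url)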