Commit b3b4203 · 1 Parent(s): 958ee21 · update

app/services/websearch.py  +258 -51  (CHANGED)
@@ -5,11 +5,12 @@ from bs4 import BeautifulSoup
import urllib.parse
import time
import random
+from urllib.parse import urlparse, parse_qs

warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)

class WebSearch:
-    def __init__(self, num_results=4, max_chars_per_page=6000):
+    def __init__(self, num_results=4, max_chars_per_page=6000, max_images=10):
        self.num_results = num_results
        self.max_chars_per_page = max_chars_per_page
        self.reference = []
@@ -23,13 +24,138 @@ class WebSearch:
            'DNT': '1',
            'Connection': 'keep-alive',
        }
+        # Common domains for direct content
+        self.content_domains = [
+            "wikipedia.org", "webmd.com", "mayoclinic.org", "healthline.com", "nih.gov",
+            "clevelandclinic.org", "nhs.uk", "cdc.gov", "medlineplus.gov", "hopkinsmedicine.org"
+        ]
+        # Ad and tracking domains to filter out
+        self.blocked_domains = [
+            "ad.doubleclick.net", "googleadservices.com", "bing.com/aclick", "duckduckgo.com/y.js",
+            "amazon.com/s", "ads.google.com", "analytics", "tracker", "pixel", "adservice"
+        ]
+
+    def is_valid_url(self, url):
+        """Check if URL is valid and not an ad/tracking URL"""
+        if not url or len(url) < 10:
+            return False
+
+        try:
+            parsed = urlparse(url)
+
+            # Check if URL has a valid scheme and netloc
+            if not all([parsed.scheme, parsed.netloc]):
+                return False
+
+            # Filter out ad/tracking URLs
+            domain = parsed.netloc.lower()
+            path = parsed.path.lower()
+            query = parsed.query.lower()
+
+            # Block URLs containing ad-related indicators
+            for blocked in self.blocked_domains:
+                if blocked in domain or blocked in path:
+                    return False
+
+            # Block URLs with ad-related query parameters
+            if any(param in query for param in ["ad", "click", "track", "clkid", "msclkid"]):
+                return False
+
+            # Extra check for redirect URLs
+            if "redirect" in path or "goto" in path or "go.php" in path:
+                return False
+
+            # Reject extremely long URLs (often tracking)
+            if len(url) > 500:
+                return False
+
+            return True
+
+        except Exception:
+            return False
+
+    def clean_url(self, url):
+        """Clean the URL by removing tracking parameters"""
+        try:
+            parsed = urlparse(url)
+
+            # List of known tracking parameters to remove
+            tracking_params = [
+                'utm_', 'ref_', 'ref=', 'refid', 'fbclid', 'gclid', 'msclkid', 'dclid',
+                'zanpid', 'icid', 'igshid', 'mc_eid', '_hsenc', 'mkt_tok', 'yclid'
+            ]
+
+            # Parse query parameters
+            query_params = parse_qs(parsed.query)
+
+            # Remove tracking parameters
+            filtered_params = {
+                k: v for k, v in query_params.items()
+                if not any(tracker in k.lower() for tracker in tracking_params)
+            }
+
+            # Rebuild query string
+            clean_query = urllib.parse.urlencode(filtered_params, doseq=True) if filtered_params else ""
+
+            # Reconstruct URL
+            clean_url = urllib.parse.urlunparse((
+                parsed.scheme,
+                parsed.netloc,
+                parsed.path,
+                parsed.params,
+                clean_query,
+                ""  # Remove fragment
+            ))
+
+            return clean_url
+
+        except Exception:
+            # If any error occurs, return the original URL
+            return url
+
+    def extract_real_url_from_redirect(self, url):
+        """Extract the actual URL from a redirect URL"""
+        try:
+            parsed = urlparse(url)
+
+            # Handle DuckDuckGo redirects
+            if "duckduckgo.com" in parsed.netloc and "u3=" in parsed.query:
+                params = parse_qs(parsed.query)
+                if "u3" in params and params["u3"]:
+                    redirect_url = params["u3"][0]
+                    # Handle nested redirects (like Bing redirects inside DuckDuckGo)
+                    if "bing.com/aclick" in redirect_url:
+                        bing_parsed = urlparse(redirect_url)
+                        bing_params = parse_qs(bing_parsed.query)
+                        if "u" in bing_params and bing_params["u"]:
+                            decoded_url = urllib.parse.unquote(bing_params["u"][0])
+                            return self.clean_url(decoded_url)
+                    return self.clean_url(redirect_url)
+
+            # Handle Bing redirects
+            if "bing.com/aclick" in url:
+                params = parse_qs(parsed.query)
+                if "u" in params and params["u"]:
+                    return self.clean_url(urllib.parse.unquote(params["u"][0]))
+
+            return url
+
+        except Exception:
+            return url

    def extract_text_from_webpage(self, html_content):
        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove non-content elements
+        for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg",
+                         "aside", "iframe", "noscript", "img", "figure", "button"]):
            tag.extract()
+
+        # Extract text and normalize spacing
+        text = ' '.join(soup.stripped_strings)
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        return text

    def search(self, query):
        results = []
@@ -42,32 +168,86 @@ class WebSearch:

            response = session.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Getting more results than needed to account for filtering
+            search_results = soup.find_all('div', class_='result')[:self.num_results * 2]
+            links = []
+
+            # Extract and process links
+            for result in search_results:
+                link_tag = result.find('a', class_='result__a')
+                if not link_tag or not link_tag.get('href'):
+                    continue
+
+                original_link = link_tag['href']
+
+                # Process link to get the actual URL
+                clean_link = self.extract_real_url_from_redirect(original_link)
+
+                # Validate the URL
+                if self.is_valid_url(clean_link):
+                    links.append(clean_link)
+
+            # Prioritize content domains
+            prioritized_links = []
+            other_links = []
+
+            for link in links:
+                if any(domain in link for domain in self.content_domains):
+                    prioritized_links.append(link)
+                else:
+                    other_links.append(link)
+
+            # Combine prioritized links first, then others
+            final_links = prioritized_links + other_links
+
+            # Limit to unique links up to num_results
+            unique_links = []
+            seen_domains = set()
+
+            for link in final_links:
+                domain = urlparse(link).netloc
+                if domain not in seen_domains and len(unique_links) < self.num_results:
+                    unique_links.append(link)
+                    seen_domains.add(domain)

            from concurrent.futures import ThreadPoolExecutor, as_completed

            def fetch_page(link):
                try:
+                    # Random delay to avoid being blocked
+                    time.sleep(random.uniform(0.5, 1.5))
+
+                    # Set a longer timeout for reliable fetching
+                    page_response = session.get(link, timeout=10, verify=False)
+
+                    # Only process HTML content
+                    if 'text/html' not in page_response.headers.get('Content-Type', ''):
+                        return None
+
                    page_soup = BeautifulSoup(page_response.text, 'lxml')
+
+                    # Remove non-content elements
+                    [tag.decompose() for tag in page_soup(['script', 'style', 'header', 'footer',
+                                                           'nav', 'form', 'iframe', 'noscript'])]
+
+                    # Extract text with better formatting
                    text = ' '.join(page_soup.stripped_strings)
+                    text = re.sub(r'\s+', ' ', text).strip()
+
+                    title = page_soup.title.string if page_soup.title else "Untitled Page"
+
                    return {
                        'link': link,
+                        'title': title,
                        'text': text[:self.max_chars_per_page]
                    }
                except Exception as e:
+                    print(f"Error fetching {link}: {str(e)}")
                    return None

-                     if result.find('a', class_='result__a')]
-            with ThreadPoolExecutor(max_workers=min(len(links), 4)) as executor:
-                future_to_url = {executor.submit(fetch_page, link): link for link in links}
+            with ThreadPoolExecutor(max_workers=min(len(unique_links), 4)) as executor:
+                future_to_url = {executor.submit(fetch_page, link): link for link in unique_links}

            for future in as_completed(future_to_url):
                result = future.result()
@@ -77,10 +257,12 @@ class WebSearch:
            return results

        except Exception as e:
+            print(f"Search error: {str(e)}")
            return []

    def search_images(self, query):
        images = []
+        encoded_query = urllib.parse.quote(query)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -92,45 +274,70 @@ class WebSearch:
            'Upgrade-Insecure-Requests': '1'
        }

+        # Try multiple sources for better results
+        image_sources = [
+            f"https://www.google.com/search?q={encoded_query}&tbm=isch&hl=en",
+            f"https://www.bing.com/images/search?q={encoded_query}&form=HDRSC2&first=1",
+            f"https://duckduckgo.com/?q={encoded_query}&iar=images&iax=images&ia=images"
+        ]
+
+        for source_url in image_sources:
+            try:
+                time.sleep(random.uniform(0.5, 1.0))  # Polite delay
+                response = requests.get(source_url, headers=headers, verify=False, timeout=10)
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Extract image URLs from img tags
+                for img in soup.find_all('img'):
+                    src = img.get('src', '')
+                    if src and src.startswith('http') and self.is_image_url(src):
+                        cleaned_url = self.clean_url(src)
+                        if self.is_valid_image(cleaned_url):
+                            images.append(cleaned_url)
+
+                # Extract image URLs from scripts (useful for Google Images)
+                for script in soup.find_all('script'):
+                    if script.string:
+                        urls = re.findall(r'https?://[^\s<>"\']+?(?:\.(?:jpg|jpeg|png|gif|bmp|webp))', script.string)
+                        for url in urls:
+                            cleaned_url = self.clean_url(url)
+                            if self.is_valid_image(cleaned_url):
+                                images.append(cleaned_url)
+
+            except Exception as e:
+                print(f"Error searching images at {source_url}: {str(e)}")
+                continue

+        # Remove duplicates while preserving order
        seen = set()
+        unique_images = [x for x in images if not (x in seen or seen.add(x))]
+
+        # Filter out small images and suspicious URLs
+        filtered_images = [img for img in unique_images if self.is_valid_image(img)]
+
+        return filtered_images[:self.max_images]

    def is_image_url(self, url):
+        """Check if URL points to an image file"""
        image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
        return any(url.lower().endswith(ext) for ext in image_extensions)

+    def is_valid_image(self, url):
+        """Additional validation for image URLs"""
+        try:
+            # Reject tiny images (often icons) and tracking pixels
+            if re.search(r'(?:icon|pixel|tracker|thumb|logo|button)\d*\.(?:jpg|png|gif)', url.lower()):
+                return False
+
+            # Avoid suspicious domains for images
+            parsed = urlparse(url)
+            if any(bad in parsed.netloc.lower() for bad in ["tracker", "pixel", "counter", "ad."]):
+                return False
+
+            # Avoid very short URLs (likely not valid images)
+            if len(url) < 30:
+                return False
+
+            return True
+        except:
+            return False
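For context, a minimal usage sketch of the class as it stands after this commit. The snippet is illustrative only and not part of the diff; it assumes the module keeps its app/services/websearch.py path, that requests, beautifulsoup4, and lxml are installed, and that self.max_images is assigned in the unshown part of __init__ (the diff shows the max_images parameter and its use in search_images). The query strings are made up.

# Hypothetical usage sketch, not part of this commit.
from app.services.websearch import WebSearch

searcher = WebSearch(num_results=4, max_chars_per_page=6000, max_images=10)

# Text search: each result is the dict built by fetch_page in this diff
# ('link', 'title', and 'text' truncated to max_chars_per_page).
for page in searcher.search("mayo clinic migraine treatment"):
    print(page["link"], page["title"])
    print(page["text"][:200])

# Image search: returns de-duplicated, filtered image URLs, capped at max_images.
for image_url in searcher.search_images("migraine aura diagram"):
    print(image_url)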