Spaces:

azzandr
/

ID-gambling-website-detection

Running

App Files Files Community

Azzan Dwi Riski committed on May 23

Commit

c0af825

1 Parent(s): bf89d59

Update the code to handle ads and the Cloudflare challenge (fixed)

Browse files

Files changed (1) hide show

app.py +105 -39

app.py CHANGED Viewed

@@ -24,10 +24,15 @@ import traceback
 # CONFIGURATION
 # =============================================
-BLOCK_PATTERNS = ["doubleclick", "adservice", "googlesyndication", "ads", "adserver", "cookie", "consent"]
-PAGE_TIMEOUT = 60000  # 60 seconds
-WAIT_FOR_LOAD_TIMEOUT = 10000  # 10 seconds extra wait after load
 CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]
 # =============================================
 # HELPER FUNCTIONS
@@ -51,8 +56,17 @@ def block_ads_and_cookies(page):
 def wait_for_page_stable(page):
     try:
-        page.wait_for_load_state('networkidle', timeout=PAGE_TIMEOUT)
-        time.sleep(WAIT_FOR_LOAD_TIMEOUT / 1000)  # extra wait
     except Exception as e:
         print(f"⚠️  Page not fully stable: {e}")
@@ -220,7 +234,57 @@ def clean_text(text):
         return ""  # empty return to use image-only
     return text
-# Fungsi untuk mengambil screenshot viewport
 def take_screenshot(url):
     url = ensure_http(url)
     filename = sanitize_filename(url) + '.png'
@@ -228,52 +292,54 @@ def take_screenshot(url):
     try:
         print(f"\n=== [START SCREENSHOT] URL: {url} ===")
         with sync_playwright() as p:
-            print("Launching Playwright Chromium...")
-            browser = p.chromium.launch(
-                args=['--disable-features=IsolateOrigins,site-per-process']  # Disable site isolation
-            )
-            context = browser.new_context(
-                viewport={"width": 1280, "height": 800},
-                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
-            )
-            page = context.new_page()
-            page.set_default_timeout(PAGE_TIMEOUT)
-            page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
-            # Block ads and tracking
-            print("Setting up ad and tracking blockers...")
-            block_ads_and_cookies(page)
             try:
-                print("Navigating to URL...")
-                page.goto(url, wait_until="domcontentloaded", timeout=PAGE_TIMEOUT)
-                # Handle potential Cloudflare protection
-                detect_and_bypass_cloudflare(page)
-                # Wait for page to be stable
                 wait_for_page_stable(page)
-                print("Taking screenshot (viewport only)...")
                 page.screenshot(path=filepath)
             except Exception as nav_error:
                 print(f"Navigation error: {nav_error}")
-                # Try to take screenshot anyway if page partially loaded
-                if page.url != "about:blank":
-                    page.screenshot(path=filepath)
-                else:
                     raise nav_error
-            context.close()
-            browser.close()
-            print(f"Screenshot saved to {filepath}")
-        print(f"=== [END SCREENSHOT] ===\n")
-        return filepath
     except Exception as e:
         print(f"[ERROR] Failed to take screenshot for URL: {url}")

 # CONFIGURATION
 # =============================================
+BLOCK_PATTERNS = [
+    "doubleclick", "adservice", "googlesyndication", "ads", "adserver", "cookie", "consent",
+    "analytics", "tracker", "tracking", "stats", "metric", "telemetry", "social", "facebook",
+    "twitter", "linkedin", "pinterest", "popup", "notification", "banner"
+]
+PAGE_TIMEOUT = 30000  # reduced to 30 seconds
+WAIT_FOR_LOAD_TIMEOUT = 5000  # reduced to 5 seconds
 CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]
+MAX_REDIRECTS = 5  # Maximum number of redirects to follow
 # =============================================
 # HELPER FUNCTIONS
 def wait_for_page_stable(page):
     try:
+        # First wait for DOM content
+        page.wait_for_load_state('domcontentloaded', timeout=PAGE_TIMEOUT)
+        # Then wait for network to be idle
+        try:
+            page.wait_for_load_state('networkidle', timeout=WAIT_FOR_LOAD_TIMEOUT)
+        except:
+            print("Network not fully idle, continuing anyway...")
+        # Small additional wait
+        time.sleep(2)
     except Exception as e:
         print(f"⚠️  Page not fully stable: {e}")
         return ""  # empty return to use image-only
     return text
+def create_browser_context(playwright):
+    """Launch Chromium and return a new browser context for page captures.
+    The browser is started with the command-line flags listed in
+    ``launch_flags``. The context uses a 1280x800 viewport, a fixed desktop
+    Chrome user agent, ignores HTTPS errors, keeps JavaScript enabled,
+    bypasses CSP and sends the headers in ``request_headers`` on every request.
+    Only the context is returned; the browser object is not handed back to
+    the caller.
+    """
+    launch_flags = [
+        '--disable-features=IsolateOrigins,site-per-process',
+        '--disable-web-security',
+        '--disable-site-isolation-trials',
+        '--disable-setuid-sandbox',
+        '--no-sandbox',
+        '--disable-gpu',
+        '--disable-dev-shm-usage',
+    ]
+    request_headers = {
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Connection": "keep-alive",
+        "DNT": "1",
+    }
+    browser = playwright.chromium.launch(args=launch_flags)
+    return browser.new_context(
+        viewport={"width": 1280, "height": 800},
+        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+        ignore_https_errors=True,
+        java_script_enabled=True,
+        bypass_csp=True,
+        extra_http_headers=request_headers,
+    )
+def setup_request_interception(page):
+    """Register a catch-all route on ``page`` that filters outgoing requests.
+    For each intercepted request, in order:
+      1. abort it if its lower-cased URL contains any entry of BLOCK_PATTERNS;
+      2. abort it if it was reached through more than MAX_REDIRECTS redirects;
+      3. otherwise let it continue unchanged.
+    The redirect limit is evaluated per request (length of its own redirect
+    chain) rather than through a counter shared by every request on the page.
+    """
+    def count_redirect_hops(request):
+        # Playwright's Python Request has no `redirect_chain` attribute; the
+        # chain is exposed as a linked list through `redirected_from`.
+        hops = 0
+        previous = request.redirected_from
+        while previous is not None:
+            hops += 1
+            previous = previous.redirected_from
+        return hops
+    def handle_request(route):
+        request = route.request
+        # Block known ad/tracking patterns
+        lowered_url = request.url.lower()
+        if any(pattern in lowered_url for pattern in BLOCK_PATTERNS):
+            print(f"Blocking request to: {request.url}")
+            route.abort()
+            return
+        # Handle redirects
+        # NOTE(review): Playwright's docs state that route handlers run only
+        # for the first URL of a redirect chain, so this guard may never
+        # trigger -- confirm against the installed Playwright version.
+        redirect_count = count_redirect_hops(request)
+        if redirect_count > MAX_REDIRECTS:
+            print(f"Too many redirects ({redirect_count}), aborting request")
+            route.abort()
+            return
+        # Continue with the request
+        route.continue_()
+    page.route("**/*", handle_request)
 def take_screenshot(url):
     url = ensure_http(url)
     filename = sanitize_filename(url) + '.png'
     try:
         print(f"\n=== [START SCREENSHOT] URL: {url} ===")
         with sync_playwright() as p:
+            print("Launching browser with custom configuration...")
+            context = create_browser_context(p)
+            page = context.new_page()
+            print("Setting up request interception...")
+            setup_request_interception(page)
             try:
+                print("Attempting to navigate to URL...")
+                response = page.goto(
+                    url,
+                    wait_until="commit",  # Changed to commit instead of domcontentloaded
+                    timeout=PAGE_TIMEOUT
+                )
+                if not response:
+                    print("No response received, attempting to continue...")
+                elif response.status >= 400:
+                    print(f"Received error status code: {response.status}")
+                # Try to wait for the page to be more stable
                 wait_for_page_stable(page)
+                # Take screenshot even if page might not be fully loaded
+                print("Taking screenshot...")
                 page.screenshot(path=filepath)
             except Exception as nav_error:
                 print(f"Navigation error: {nav_error}")
+                # Try to take screenshot anyway if we have any content
+                try:
+                    if page.url != "about:blank":
+                        print("Taking screenshot of partial page...")
+                        page.screenshot(path=filepath)
+                    else:
+                        raise nav_error
+                except:
                     raise nav_error
+            finally:
+                context.close()
+        if os.path.exists(filepath):
+            print(f"Screenshot saved successfully to {filepath}")
+            return filepath
+        else:
+            raise Exception("Screenshot file was not created")
     except Exception as e:
         print(f"[ERROR] Failed to take screenshot for URL: {url}")