Azzan Dwi Riski committed on
Commit
c0af825
·
1 Parent(s): bf89d59

Update the code to handle ads and fix the Cloudflare challenge handling

Browse files
Files changed (1) hide show
  1. app.py +105 -39
app.py CHANGED
@@ -24,10 +24,15 @@ import traceback
24
  # CONFIGURATION
25
  # =============================================
26
 
27
- BLOCK_PATTERNS = ["doubleclick", "adservice", "googlesyndication", "ads", "adserver", "cookie", "consent"]
28
- PAGE_TIMEOUT = 60000 # 60 seconds
29
- WAIT_FOR_LOAD_TIMEOUT = 10000 # 10 seconds extra wait after load
 
 
 
 
30
  CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]
 
31
 
32
  # =============================================
33
  # HELPER FUNCTIONS
@@ -51,8 +56,17 @@ def block_ads_and_cookies(page):
51
 
52
  def wait_for_page_stable(page):
53
  try:
54
- page.wait_for_load_state('networkidle', timeout=PAGE_TIMEOUT)
55
- time.sleep(WAIT_FOR_LOAD_TIMEOUT / 1000) # extra wait
 
 
 
 
 
 
 
 
 
56
  except Exception as e:
57
  print(f"⚠️ Page not fully stable: {e}")
58
 
@@ -220,7 +234,57 @@ def clean_text(text):
220
  return "" # empty return to use image-only
221
  return text
222
 
223
- # Fungsi untuk mengambil screenshot viewport
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  def take_screenshot(url):
225
  url = ensure_http(url)
226
  filename = sanitize_filename(url) + '.png'
@@ -228,52 +292,54 @@ def take_screenshot(url):
228
 
229
  try:
230
  print(f"\n=== [START SCREENSHOT] URL: {url} ===")
 
231
  with sync_playwright() as p:
232
- print("Launching Playwright Chromium...")
233
- browser = p.chromium.launch(
234
- args=['--disable-features=IsolateOrigins,site-per-process'] # Disable site isolation
235
- )
236
 
237
- context = browser.new_context(
238
- viewport={"width": 1280, "height": 800},
239
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
240
- )
241
 
242
- page = context.new_page()
243
- page.set_default_timeout(PAGE_TIMEOUT)
244
- page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
245
-
246
- # Block ads and tracking
247
- print("Setting up ad and tracking blockers...")
248
- block_ads_and_cookies(page)
249
-
250
  try:
251
- print("Navigating to URL...")
252
- page.goto(url, wait_until="domcontentloaded", timeout=PAGE_TIMEOUT)
 
 
 
 
253
 
254
- # Handle potential Cloudflare protection
255
- detect_and_bypass_cloudflare(page)
 
 
256
 
257
- # Wait for page to be stable
258
  wait_for_page_stable(page)
259
 
260
- print("Taking screenshot (viewport only)...")
 
261
  page.screenshot(path=filepath)
262
 
263
  except Exception as nav_error:
264
  print(f"Navigation error: {nav_error}")
265
- # Try to take screenshot anyway if page partially loaded
266
- if page.url != "about:blank":
267
- page.screenshot(path=filepath)
268
- else:
 
 
 
 
269
  raise nav_error
270
-
271
- context.close()
272
- browser.close()
273
- print(f"Screenshot saved to {filepath}")
274
-
275
- print(f"=== [END SCREENSHOT] ===\n")
276
- return filepath
 
277
 
278
  except Exception as e:
279
  print(f"[ERROR] Failed to take screenshot for URL: {url}")
 
24
  # CONFIGURATION
25
  # =============================================
26
 
27
+ BLOCK_PATTERNS = [
28
+ "doubleclick", "adservice", "googlesyndication", "ads", "adserver", "cookie", "consent",
29
+ "analytics", "tracker", "tracking", "stats", "metric", "telemetry", "social", "facebook",
30
+ "twitter", "linkedin", "pinterest", "popup", "notification", "banner"
31
+ ]
32
+ PAGE_TIMEOUT = 30000 # reduced to 30 seconds
33
+ WAIT_FOR_LOAD_TIMEOUT = 5000 # reduced to 5 seconds
34
  CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]
35
+ MAX_REDIRECTS = 5 # Maximum number of redirects to follow
36
 
37
  # =============================================
38
  # HELPER FUNCTIONS
 
56
 
def wait_for_page_stable(page):
    """Give a freshly navigated page a bounded chance to settle.

    The wait happens in three steps:

    1. Block until the ``domcontentloaded`` load state, for at most
       ``PAGE_TIMEOUT`` milliseconds.
    2. Try to reach ``networkidle`` for at most ``WAIT_FOR_LOAD_TIMEOUT``
       milliseconds. Failing to reach it is only logged.
    3. Sleep two more seconds.

    Args:
        page: The page being loaded; it must provide
            ``wait_for_load_state(state, timeout=...)``. The caller in this
            file passes a page created through ``sync_playwright()``.

    Returns:
        None. Any ``Exception`` raised while waiting is printed rather than
        propagated, so the caller can still attempt a screenshot.
    """
    try:
        # First wait for DOM content
        page.wait_for_load_state('domcontentloaded', timeout=PAGE_TIMEOUT)

        # Then wait for network to be idle. Catch Exception rather than using
        # a bare except so that KeyboardInterrupt / SystemExit still stop the
        # program instead of being reported as "network not idle".
        try:
            page.wait_for_load_state('networkidle', timeout=WAIT_FOR_LOAD_TIMEOUT)
        except Exception:
            print("Network not fully idle, continuing anyway...")

        # Small additional wait
        time.sleep(2)
    except Exception as e:
        print(f"⚠️ Page not fully stable: {e}")
72
 
 
234
  return "" # empty return to use image-only
235
  return text
236
 
def create_browser_context(playwright):
    """Launch Chromium and return a browser context configured for screenshots.

    Args:
        playwright: The object yielded by ``sync_playwright()``; only its
            ``chromium.launch`` entry point is used.

    Returns:
        The new browser context (1280x800 viewport, desktop Chrome user
        agent, HTTPS errors ignored, CSP bypassed, extra request headers).
        The browser that owns it is not returned separately.
        NOTE(review): Playwright exposes it as ``context.browser`` - confirm
        callers close it, or rely on leaving the ``sync_playwright()`` block.

    Raises:
        Exception: Whatever ``launch`` or ``new_context`` raises. If context
            creation fails, the browser launched here is closed first so no
            Chromium process is left behind.
    """
    # NOTE(review): '--disable-web-security' and '--no-sandbox' weaken
    # Chromium's isolation while arbitrary user-supplied URLs are loaded;
    # confirm this trade-off is acceptable for the deployment environment.
    browser = playwright.chromium.launch(
        args=[
            '--disable-features=IsolateOrigins,site-per-process',
            '--disable-web-security',
            '--disable-site-isolation-trials',
            '--disable-setuid-sandbox',
            '--no-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
        ]
    )
    try:
        return browser.new_context(
            viewport={"width": 1280, "height": 800},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
            ignore_https_errors=True,
            java_script_enabled=True,
            bypass_csp=True,
            extra_http_headers={
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Connection": "keep-alive",
                "DNT": "1",
            },
        )
    except Exception:
        # The browser handle would otherwise be lost together with the
        # failed call, leaving the process running.
        browser.close()
        raise
261
+
def setup_request_interception(page):
    """Install a catch-all route on ``page`` that filters outgoing requests.

    Every request matching ``**/*`` is passed to a handler that:

    * aborts it when its lower-cased URL contains any ``BLOCK_PATTERNS``
      entry as a substring;
    * aborts it when it sits at the end of a redirect chain longer than
      ``MAX_REDIRECTS``;
    * otherwise lets it continue unchanged.

    Args:
        page: The page to instrument; it must provide ``route(pattern,
            handler)``. The caller in this file passes a Playwright page.

    Returns:
        None. The only effect is the registered route handler.
    """
    # NOTE(review): matching is plain substring search, so short patterns
    # such as "ads" also match unrelated URLs (e.g. ".../downloads/..."),
    # including the page being captured. Confirm this is intended.

    def _redirect_depth(request):
        # Playwright links each redirected request to its predecessor via
        # `redirected_from` (None for the first request in a chain), so the
        # number of hops back is the number of redirects followed so far.
        depth = 0
        previous = request.redirected_from
        while previous is not None:
            depth += 1
            previous = previous.redirected_from
        return depth

    def handle_request(route):
        request = route.request

        # Block known ad/tracking patterns
        url_lower = request.url.lower()
        if any(pattern in url_lower for pattern in BLOCK_PATTERNS):
            print(f"Blocking request to: {request.url}")
            route.abort()
            return

        # Handle redirects. Depth is measured per request chain, so many
        # unrelated resources that each redirect once do not add up to
        # the limit.
        redirect_count = _redirect_depth(request)
        if redirect_count > MAX_REDIRECTS:
            print(f"Too many redirects ({redirect_count}), aborting request")
            route.abort()
            return

        # Continue with the request
        route.continue_()

    page.route("**/*", handle_request)
287
+
288
  def take_screenshot(url):
289
  url = ensure_http(url)
290
  filename = sanitize_filename(url) + '.png'
 
292
 
293
  try:
294
  print(f"\n=== [START SCREENSHOT] URL: {url} ===")
295
+
296
  with sync_playwright() as p:
297
+ print("Launching browser with custom configuration...")
298
+ context = create_browser_context(p)
299
+ page = context.new_page()
 
300
 
301
+ print("Setting up request interception...")
302
+ setup_request_interception(page)
 
 
303
 
 
 
 
 
 
 
 
 
304
  try:
305
+ print("Attempting to navigate to URL...")
306
+ response = page.goto(
307
+ url,
308
+ wait_until="commit", # Changed to commit instead of domcontentloaded
309
+ timeout=PAGE_TIMEOUT
310
+ )
311
 
312
+ if not response:
313
+ print("No response received, attempting to continue...")
314
+ elif response.status >= 400:
315
+ print(f"Received error status code: {response.status}")
316
 
317
+ # Try to wait for the page to be more stable
318
  wait_for_page_stable(page)
319
 
320
+ # Take screenshot even if page might not be fully loaded
321
+ print("Taking screenshot...")
322
  page.screenshot(path=filepath)
323
 
324
  except Exception as nav_error:
325
  print(f"Navigation error: {nav_error}")
326
+ # Try to take screenshot anyway if we have any content
327
+ try:
328
+ if page.url != "about:blank":
329
+ print("Taking screenshot of partial page...")
330
+ page.screenshot(path=filepath)
331
+ else:
332
+ raise nav_error
333
+ except:
334
  raise nav_error
335
+ finally:
336
+ context.close()
337
+
338
+ if os.path.exists(filepath):
339
+ print(f"Screenshot saved successfully to {filepath}")
340
+ return filepath
341
+ else:
342
+ raise Exception("Screenshot file was not created")
343
 
344
  except Exception as e:
345
  print(f"[ERROR] Failed to take screenshot for URL: {url}")