Azzan Dwi Riski committed on
Commit
0d1f775
·
1 Parent(s): 7646a9b

update the code to handle ads and cloudflare challenge fixed3

Browse files
Files changed (1) hide show
  1. app.py +123 -59
app.py CHANGED
@@ -243,7 +243,16 @@ def create_browser_context(playwright):
243
  '--disable-setuid-sandbox',
244
  '--no-sandbox',
245
  '--disable-gpu',
246
- '--disable-dev-shm-usage'
 
 
 
 
 
 
 
 
 
247
  ]
248
  ).new_context(
249
  viewport={"width": 1280, "height": 800},
@@ -255,7 +264,8 @@ def create_browser_context(playwright):
255
  "Accept-Language": "en-US,en;q=0.9",
256
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
257
  "Connection": "keep-alive",
258
- "DNT": "1"
 
259
  }
260
  )
261
 
@@ -292,75 +302,129 @@ def setup_request_interception(page):
292
  page.on("response", handle_response)
293
  page.route("**/*", handle_request)
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  def take_screenshot(url):
296
  url = ensure_http(url)
297
  filename = sanitize_filename(url) + '.png'
298
  filepath = os.path.join(SCREENSHOT_DIR, filename)
299
 
300
- try:
301
- print(f"\n=== [START SCREENSHOT] URL: {url} ===")
302
-
303
- with sync_playwright() as p:
304
- print("Launching browser with custom configuration...")
305
- context = create_browser_context(p)
306
- page = context.new_page()
307
-
308
- print("Setting up request interception...")
309
- setup_request_interception(page)
310
 
311
- try:
312
- print("Attempting to navigate to URL...")
313
- response = page.goto(
314
- url,
315
- wait_until="commit",
316
- timeout=PAGE_TIMEOUT
317
- )
318
 
319
- if not response:
320
- print("No response received, attempting to continue...")
321
- elif response.status >= 300 and response.status <= 399:
322
- print(f"Received redirect status code: {response.status}")
323
- elif response.status >= 400:
324
- print(f"Received error status code: {response.status}")
 
 
 
 
325
 
326
- # Try to wait for the page to be more stable
327
  try:
328
- wait_for_page_stable(page)
329
- except Exception as e:
330
- print(f"Page stability warning: {e}")
331
-
332
- # Take screenshot even if page might not be fully loaded
333
- print("Taking screenshot...")
334
- page.screenshot(path=filepath)
335
-
336
- except Exception as nav_error:
337
- print(f"Navigation error: {nav_error}")
338
- # Try to take screenshot anyway if we have any content
339
- try:
340
- if page.url != "about:blank":
341
- print("Taking screenshot of partial page...")
342
- page.screenshot(path=filepath)
343
  else:
344
- raise nav_error
345
- except:
346
- raise nav_error
347
- finally:
348
- try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  context.close()
350
- except Exception as close_error:
351
- print(f"Warning: Error while closing context: {close_error}")
352
-
353
- if os.path.exists(filepath):
354
- print(f"Screenshot saved successfully to {filepath}")
355
- return filepath
356
- else:
357
- raise Exception("Screenshot file was not created")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
- except Exception as e:
360
- print(f"[ERROR] Failed to take screenshot for URL: {url}")
361
- print(f"Exception: {e}")
362
- traceback.print_exc()
363
- return None
 
 
 
 
 
 
 
364
 
365
  def resize_if_needed(image_path, max_mb=1, target_width=720):
366
  file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
 
243
  '--disable-setuid-sandbox',
244
  '--no-sandbox',
245
  '--disable-gpu',
246
+ '--disable-dev-shm-usage',
247
+ '--disable-extensions',
248
+ '--disable-plugins',
249
+ '--disable-background-timer-throttling',
250
+ '--disable-backgrounding-occluded-windows',
251
+ '--disable-renderer-backgrounding',
252
+ '--no-first-run',
253
+ '--no-default-browser-check',
254
+ '--disable-translate',
255
+ '--disable-ipc-flooding-protection'
256
  ]
257
  ).new_context(
258
  viewport={"width": 1280, "height": 800},
 
264
  "Accept-Language": "en-US,en;q=0.9",
265
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
266
  "Connection": "keep-alive",
267
+ "DNT": "1",
268
+ "Cache-Control": "no-cache"
269
  }
270
  )
271
 
 
302
  page.on("response", handle_response)
303
  page.route("**/*", handle_request)
304
 
def try_navigation_strategies(page, url):
    """Navigate ``page`` to ``url``, falling back through several wait modes.

    The strategies are tried in order: ``commit``, ``domcontentloaded``,
    ``load`` and finally ``networkidle``, each with its own timeout passed
    straight to ``page.goto`` (Playwright interprets it as milliseconds).
    The first strategy that does not raise wins.

    Args:
        page: Object exposing ``goto(url, wait_until=..., timeout=...)``.
        url: Address to navigate to.

    Returns:
        Whatever ``page.goto`` returned for the first successful strategy.

    Raises:
        Exception: The error raised by the final strategy when every
            strategy failed, so the caller always sees the real cause
            (including redirect-loop errors).
    """
    strategies = [
        {"wait_until": "commit", "timeout": 15000},
        {"wait_until": "domcontentloaded", "timeout": 10000},
        {"wait_until": "load", "timeout": 20000},
        {"wait_until": "networkidle", "timeout": 30000},
    ]

    last_error = None
    for number, strategy in enumerate(strategies, start=1):
        try:
            print(f"Trying navigation strategy {number}: {strategy}")
            response = page.goto(url, **strategy)
        except Exception as e:
            print(f"Strategy {number} failed: {e}")
            last_error = e
            # Only announce a fallback when there is one left to try.
            if "ERR_TOO_MANY_REDIRECTS" in str(e) and number < len(strategies):
                print("Redirect error detected, trying next strategy...")
        else:
            print(f"Navigation successful with strategy {number}")
            return response

    if last_error is not None:
        # Surface the genuine failure instead of a generic placeholder.
        raise last_error
    # Defensive: only reachable if the strategy list is ever emptied.
    raise Exception("All navigation strategies failed")
329
+
def _block_unwanted_requests(page):
    """Route every request on ``page`` and abort those matching BLOCK_PATTERNS."""
    def simple_block(route):
        url_lower = route.request.url.lower()
        if any(pattern in url_lower for pattern in BLOCK_PATTERNS):
            route.abort()
        else:
            route.continue_()

    page.route("**/*", simple_block)


def _navigate_for_attempt(page, url, attempt):
    """Navigate using the approach assigned to the given zero-based attempt."""
    if attempt == 0:
        # First attempt: walk through every wait strategy.
        return try_navigation_strategies(page, url)
    if attempt == 1:
        # Second attempt: return as soon as the navigation is committed.
        print("Trying minimal navigation approach...")
        return page.goto(url, wait_until="commit", timeout=10000)
    # Third attempt: default wait mode, just try to load anything.
    print("Trying basic navigation...")
    return page.goto(url, timeout=15000)


def _close_context_quietly(context):
    """Close the browser context, reporting (not raising) any close error."""
    try:
        context.close()
    except Exception as close_error:
        print(f"Warning: Error while closing context: {close_error}")


def take_screenshot(url):
    """Capture a PNG screenshot of ``url``, retrying with simpler approaches.

    Up to three attempts are made, each in a fresh Playwright session:
    the first blocks requests matching ``BLOCK_PATTERNS`` and tries every
    navigation strategy, the second waits only for ``commit``, and the
    third uses a plain ``goto``. If navigation or the screenshot fails but
    the page has left ``about:blank``, a screenshot of the partial page is
    attempted before retrying.

    The browser context is closed exactly once per attempt in a ``finally``
    block, so it is released even when page creation or routing fails and
    a close error can no longer turn a successful capture into a failure.

    Args:
        url: Address to capture; it is passed through ``ensure_http`` first
            (presumably to add a missing scheme -- confirm against helper).

    Returns:
        The path of the saved screenshot inside ``SCREENSHOT_DIR``, or
        ``None`` when every attempt failed.
    """
    url = ensure_http(url)
    filename = sanitize_filename(url) + '.png'
    filepath = os.path.join(SCREENSHOT_DIR, filename)

    max_retries = 3

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            print(f"\n=== [SCREENSHOT ATTEMPT {attempt + 1}/{max_retries}] URL: {url} ===")

            with sync_playwright() as p:
                print("Launching browser with aggressive configuration...")
                context = create_browser_context(p)
                try:
                    page = context.new_page()

                    # Request blocking is only installed on the first attempt.
                    if attempt == 0:
                        print("Setting up basic request interception...")
                        _block_unwanted_requests(page)

                    try:
                        response = _navigate_for_attempt(page, url, attempt)

                        if response:
                            print(f"Response status: {response.status}")

                        # Give the page a moment to render some content.
                        try:
                            page.wait_for_timeout(3000)  # Just wait 3 seconds
                            if attempt == 0:
                                wait_for_page_stable(page)
                        except Exception as e:
                            print(f"Page stability warning: {e}")

                        print("Taking screenshot...")
                        page.screenshot(path=filepath)

                        print(f"Screenshot saved successfully to {filepath}")
                        return filepath

                    except Exception as nav_error:
                        print(f"Navigation error on attempt {attempt + 1}: {nav_error}")

                        # Try to take a screenshot of whatever we have.
                        try:
                            if page.url != "about:blank":
                                print("Taking screenshot of partial page...")
                                page.screenshot(path=filepath)
                                if os.path.exists(filepath):
                                    print(f"Partial screenshot saved to {filepath}")
                                    return filepath
                        except Exception as screenshot_error:
                            print(f"Failed to take partial screenshot: {screenshot_error}")

                        # On the last attempt let the outer handler report it.
                        if is_last_attempt:
                            raise

                        print("Retrying with different approach...")
                finally:
                    _close_context_quietly(context)

            # Only reached after a handled navigation failure on a
            # non-final attempt: pause briefly, then retry.
            time.sleep(2)

        except Exception as e:
            print(f"[ERROR] Attempt {attempt + 1} failed: {e}")
            if is_last_attempt:
                print(f"All {max_retries} attempts failed for URL: {url}")
                traceback.print_exc()
                return None
            print("Waiting before next attempt...")
            time.sleep(3)

    return None
428
 
429
  def resize_if_needed(image_path, max_mb=1, target_width=720):
430
  file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB