Azzan Dwi Riski
committed on
Commit
·
0d1f775
1
Parent(s):
7646a9b
update the code to handle ads and cloudflare challenge fixed3
Browse files
app.py
CHANGED
@@ -243,7 +243,16 @@ def create_browser_context(playwright):
|
|
243 |
'--disable-setuid-sandbox',
|
244 |
'--no-sandbox',
|
245 |
'--disable-gpu',
|
246 |
-
'--disable-dev-shm-usage'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
]
|
248 |
).new_context(
|
249 |
viewport={"width": 1280, "height": 800},
|
@@ -255,7 +264,8 @@ def create_browser_context(playwright):
|
|
255 |
"Accept-Language": "en-US,en;q=0.9",
|
256 |
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
257 |
"Connection": "keep-alive",
|
258 |
-
"DNT": "1"
|
|
|
259 |
}
|
260 |
)
|
261 |
|
@@ -292,75 +302,129 @@ def setup_request_interception(page):
|
|
292 |
page.on("response", handle_response)
|
293 |
page.route("**/*", handle_request)
|
294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
def take_screenshot(url):
|
296 |
url = ensure_http(url)
|
297 |
filename = sanitize_filename(url) + '.png'
|
298 |
filepath = os.path.join(SCREENSHOT_DIR, filename)
|
299 |
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
print("
|
305 |
-
context = create_browser_context(p)
|
306 |
-
page = context.new_page()
|
307 |
-
|
308 |
-
print("Setting up request interception...")
|
309 |
-
setup_request_interception(page)
|
310 |
|
311 |
-
|
312 |
-
print("
|
313 |
-
|
314 |
-
|
315 |
-
wait_until="commit",
|
316 |
-
timeout=PAGE_TIMEOUT
|
317 |
-
)
|
318 |
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
|
|
|
|
|
|
|
|
325 |
|
326 |
-
# Try to wait for the page to be more stable
|
327 |
try:
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
except Exception as nav_error:
|
337 |
-
print(f"Navigation error: {nav_error}")
|
338 |
-
# Try to take screenshot anyway if we have any content
|
339 |
-
try:
|
340 |
-
if page.url != "about:blank":
|
341 |
-
print("Taking screenshot of partial page...")
|
342 |
-
page.screenshot(path=filepath)
|
343 |
else:
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
context.close()
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
|
365 |
def resize_if_needed(image_path, max_mb=1, target_width=720):
|
366 |
file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
|
|
|
243 |
'--disable-setuid-sandbox',
|
244 |
'--no-sandbox',
|
245 |
'--disable-gpu',
|
246 |
+
'--disable-dev-shm-usage',
|
247 |
+
'--disable-extensions',
|
248 |
+
'--disable-plugins',
|
249 |
+
'--disable-background-timer-throttling',
|
250 |
+
'--disable-backgrounding-occluded-windows',
|
251 |
+
'--disable-renderer-backgrounding',
|
252 |
+
'--no-first-run',
|
253 |
+
'--no-default-browser-check',
|
254 |
+
'--disable-translate',
|
255 |
+
'--disable-ipc-flooding-protection'
|
256 |
]
|
257 |
).new_context(
|
258 |
viewport={"width": 1280, "height": 800},
|
|
|
264 |
"Accept-Language": "en-US,en;q=0.9",
|
265 |
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
266 |
"Connection": "keep-alive",
|
267 |
+
"DNT": "1",
|
268 |
+
"Cache-Control": "no-cache"
|
269 |
}
|
270 |
)
|
271 |
|
|
|
302 |
page.on("response", handle_response)
|
303 |
page.route("**/*", handle_request)
|
304 |
|
305 |
+
def try_navigation_strategies(page, url):
    """Navigate ``page`` to ``url``, falling back through several wait modes.

    Each strategy is a dict of keyword arguments forwarded verbatim to
    ``page.goto``. They are tried in order and the first call that does not
    raise wins.

    Args:
        page: Object exposing ``goto(url, **kwargs)`` (presumably a Playwright
            page -- confirm against the caller).
        url: Address to navigate to.

    Returns:
        Whatever ``page.goto`` returned for the first successful strategy.

    Raises:
        Exception: The error from the final strategy is re-raised as-is when
            it is not a redirect-loop error. If the final strategy failed with
            ``ERR_TOO_MANY_REDIRECTS``, a generic "All navigation strategies
            failed" exception is raised with the last error chained as its
            ``__cause__`` so the root cause is not lost.
    """
    strategies = [
        {"wait_until": "commit", "timeout": 15000},
        {"wait_until": "domcontentloaded", "timeout": 10000},
        {"wait_until": "load", "timeout": 20000},
        {"wait_until": "networkidle", "timeout": 30000},
    ]
    last_error = None

    for i, strategy in enumerate(strategies):
        try:
            print(f"Trying navigation strategy {i+1}: {strategy}")
            response = page.goto(url, **strategy)
        except Exception as e:
            last_error = e
            print(f"Strategy {i+1} failed: {e}")
            if "ERR_TOO_MANY_REDIRECTS" in str(e):
                # Redirect loops always fall through to the next strategy,
                # even on the last one (handled after the loop).
                print("Redirect error detected, trying next strategy...")
            elif i == len(strategies) - 1:  # Last strategy
                raise
        else:
            print(f"Navigation successful with strategy {i+1}")
            return response

    # Only reached when every strategy failed; keep the root cause attached.
    raise Exception("All navigation strategies failed") from last_error
|
329 |
+
|
330 |
def _close_context_quietly(context):
    """Close a browser context, logging (not raising) if the close fails."""
    try:
        context.close()
    except Exception as close_error:
        print(f"Failed to close browser context: {close_error}")


def _navigate_for_attempt(page, url, attempt):
    """Navigate ``page`` to ``url`` using the approach assigned to ``attempt``."""
    if attempt == 0:
        # First attempt: aggressive but safe
        return try_navigation_strategies(page, url)
    if attempt == 1:
        # Second attempt: minimal approach
        print("Trying minimal navigation approach...")
        return page.goto(url, wait_until="commit", timeout=10000)
    # Third attempt: just try to load anything
    print("Trying basic navigation...")
    return page.goto(url, timeout=15000)


def take_screenshot(url):
    """Capture a PNG screenshot of ``url``, retrying with simpler approaches.

    Up to three attempts are made, each in a fresh browser context. The first
    attempt installs request blocking based on ``BLOCK_PATTERNS``, navigates
    via ``try_navigation_strategies`` and calls ``wait_for_page_stable``;
    later attempts use plain ``page.goto`` calls. If navigation fails but the
    page has left ``about:blank``, a screenshot of the partial page is
    attempted before moving on to the next attempt.

    The browser context is closed exactly once per attempt, on every path.

    Args:
        url: Address to capture; normalised through ``ensure_http`` first.

    Returns:
        The path of the saved screenshot inside ``SCREENSHOT_DIR``, or
        ``None`` when every attempt failed.
    """
    url = ensure_http(url)
    filename = sanitize_filename(url) + '.png'
    filepath = os.path.join(SCREENSHOT_DIR, filename)

    max_retries = 3

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            print(f"\n=== [SCREENSHOT ATTEMPT {attempt + 1}/{max_retries}] URL: {url} ===")

            with sync_playwright() as p:
                print("Launching browser with aggressive configuration...")
                context = create_browser_context(p)
                try:
                    page = context.new_page()

                    # Only set up basic request blocking for the first attempt
                    if attempt == 0:
                        print("Setting up basic request interception...")

                        def simple_block(route):
                            url_lower = route.request.url.lower()
                            if any(pattern in url_lower for pattern in BLOCK_PATTERNS):
                                route.abort()
                            else:
                                route.continue_()

                        page.route("**/*", simple_block)

                    try:
                        response = _navigate_for_attempt(page, url, attempt)

                        if response:
                            print(f"Response status: {response.status}")

                        # Try to wait for some content; failures here are
                        # only warnings, the screenshot is still attempted.
                        try:
                            page.wait_for_timeout(3000)  # Just wait 3 seconds
                            if attempt == 0:
                                wait_for_page_stable(page)
                        except Exception as e:
                            print(f"Page stability warning: {e}")

                        print("Taking screenshot...")
                        page.screenshot(path=filepath)

                        print(f"Screenshot saved successfully to {filepath}")
                        return filepath

                    except Exception as nav_error:
                        print(f"Navigation error on attempt {attempt + 1}: {nav_error}")

                        # Try to take screenshot of whatever we have
                        try:
                            if page.url != "about:blank":
                                print("Taking screenshot of partial page...")
                                page.screenshot(path=filepath)
                                if os.path.exists(filepath):
                                    print(f"Partial screenshot saved to {filepath}")
                                    return filepath
                        except Exception as screenshot_error:
                            print(f"Failed to take partial screenshot: {screenshot_error}")

                        # On the last attempt, surface the navigation error to
                        # the outer handler, which reports it and returns None.
                        if is_last_attempt:
                            raise
                finally:
                    # Single cleanup point: runs on success, partial success,
                    # and failure, so the context is never closed twice.
                    _close_context_quietly(context)

            # Reached only when navigation failed on a non-final attempt.
            print("Retrying with different approach...")
            time.sleep(2)  # Wait before retry

        except Exception as e:
            print(f"[ERROR] Attempt {attempt + 1} failed: {e}")
            if is_last_attempt:
                print(f"All {max_retries} attempts failed for URL: {url}")
                traceback.print_exc()
                return None
            print("Waiting before next attempt...")
            time.sleep(3)

    return None
|
428 |
|
429 |
def resize_if_needed(image_path, max_mb=1, target_width=720):
|
430 |
file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
|