Azzan Dwi Riski committed on
Commit
bf89d59
·
1 Parent(s): 277db83

Update the code to handle ads and Cloudflare challenges

Browse files
Files changed (1) hide show
  1. app.py +82 -11
app.py CHANGED
@@ -20,6 +20,53 @@ from pathlib import Path
20
  import subprocess
21
  import traceback
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # --- Setup ---
24
 
25
  # Device setup
@@ -175,29 +222,53 @@ def clean_text(text):
175
 
176
  # Fungsi untuk mengambil screenshot viewport
177
  def take_screenshot(url):
178
- filename = url.replace('https://', '').replace('http://', '').replace('/', '_').replace('.', '_') + '.png'
 
179
  filepath = os.path.join(SCREENSHOT_DIR, filename)
180
 
181
  try:
182
  print(f"\n=== [START SCREENSHOT] URL: {url} ===")
183
- from playwright.sync_api import sync_playwright
184
-
185
  with sync_playwright() as p:
186
  print("Launching Playwright Chromium...")
187
- browser = p.chromium.launch()
188
- page = browser.new_page(
 
 
 
189
  viewport={"width": 1280, "height": 800},
190
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
191
  )
192
- page.set_default_timeout(60000)
 
 
193
  page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
194
 
195
- print("Navigating to URL...")
196
- page.goto(url, wait_until="networkidle", timeout=60000)
197
- page.wait_for_timeout(3000)
198
 
199
- print("Taking screenshot (viewport only)...")
200
- page.screenshot(path=filepath)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  browser.close()
202
  print(f"Screenshot saved to {filepath}")
203
 
 
20
  import subprocess
21
  import traceback
22
 
23
+ # =============================================
24
+ # CONFIGURATION
25
+ # =============================================
26
+
27
+ BLOCK_PATTERNS = ["doubleclick", "adservice", "googlesyndication", "ads", "adserver", "cookie", "consent"]  # substrings looked up in the lowercased URL of every intercepted request; NOTE(review): "ads" and "cookie" are plain substrings, so they also match URLs such as ".../downloads" - confirm this breadth is intended
28
+ PAGE_TIMEOUT = 60000 # 60 seconds, expressed in milliseconds (passed as a Playwright timeout)
29
+ WAIT_FOR_LOAD_TIMEOUT = 10000 # 10 seconds (in milliseconds) of extra wait after the page reaches network idle
30
+ CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]  # phrases searched case-insensitively in the page HTML to spot a challenge page
31
+
32
+ # =============================================
33
+ # HELPER FUNCTIONS
34
+ # =============================================
35
+
36
def ensure_http(url):
    """Return *url* with an explicit HTTP(S) scheme.

    Surrounding whitespace is stripped first. URL schemes are
    case-insensitive, so ``HTTP://`` and ``HTTPS://`` prefixes are recognised
    as well as the lowercase forms.

    Args:
        url: Address to normalise, with or without a scheme.

    Returns:
        The stripped URL unchanged when it already starts with ``http://`` or
        ``https://`` (in any letter case); otherwise the stripped URL
        prefixed with ``http://``.
    """
    url = url.strip()
    # Test a lowercased copy so the caller's original casing is preserved in
    # the returned value.
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'http://' + url
40
+
41
def sanitize_filename(url, max_length=200):
    """Turn *url* into a string that is safe to use as a file name.

    Every character that is not a word character, ``-``, ``_``, ``.`` or a
    space is replaced by ``_``. Results longer than *max_length* characters
    are truncated and suffixed with a short hash of the full URL, so that
    very long URLs neither exceed file-system name limits nor collide with
    each other after truncation.

    Args:
        url: The URL (or any text) to convert.
        max_length: Maximum number of characters in the returned name. The
            caller is expected to leave room for any extension it appends.

    Returns:
        The sanitised name; identical to the plain substitution result
        whenever that result already fits within *max_length*.
    """
    import hashlib  # local import: only needed on the truncation path

    safe = re.sub(r'[^\w\-_\. ]', '_', url)
    if len(safe) <= max_length:
        return safe

    # Hash the original URL (not the sanitised text) so two URLs that differ
    # only in replaced characters still get distinct names.
    digest = hashlib.sha1(url.encode('utf-8', errors='replace')).hexdigest()[:10]
    keep = max(max_length - len(digest) - 1, 0)
    return f"{safe[:keep]}_{digest}"
43
+
44
def block_ads_and_cookies(page):
    """Abort requests on *page* whose URL matches a blocked pattern.

    Registers a catch-all route on the page. Each intercepted request is
    aborted when its lowercased URL contains any entry of ``BLOCK_PATTERNS``
    and continued otherwise. The page's own top-level navigation is always
    continued: the patterns are plain substrings, so a target URL such as
    ``.../downloads`` would otherwise be aborted and the navigation would
    fail.

    Args:
        page: The Playwright page to install the route on.
    """
    def is_main_frame_navigation(request):
        # Only document navigations of the top-level frame are exempt;
        # navigations inside (ad) iframes remain blockable.
        if not request.is_navigation_request():
            return False
        try:
            return request.frame == page.main_frame
        except Exception:
            # Playwright raises when the frame is not available yet (a
            # navigation issued before its frame exists). The main frame
            # always exists, so this cannot be the main navigation.
            return False

    def route_intercept(route):
        request = route.request
        request_url = request.url.lower()
        blocked = any(pattern in request_url for pattern in BLOCK_PATTERNS)
        if blocked and not is_main_frame_navigation(request):
            route.abort()
        else:
            route.continue_()

    page.route("**/*", route_intercept)
51
+
52
def wait_for_page_stable(page):
    """Wait until *page* is network-idle, then pause a little longer.

    Best effort: any error (typically a timeout while waiting for network
    idle) is printed and swallowed so the caller can still proceed with
    whatever has loaded. When the network-idle wait fails, the extra pause
    is skipped.

    Args:
        page: The Playwright page to wait on.
    """
    try:
        page.wait_for_load_state('networkidle', timeout=PAGE_TIMEOUT)
        # Pause through Playwright rather than time.sleep(): sleeping blocks
        # the sync API's event dispatch, so route handlers could not run and
        # intercepted requests would stall for the whole pause.
        page.wait_for_timeout(WAIT_FOR_LOAD_TIMEOUT)  # extra wait
    except Exception as e:
        print(f"⚠️ Page not fully stable: {e}")
58
+
59
def detect_and_bypass_cloudflare(page):
    """Give a Cloudflare challenge page time to clear, then reload.

    The page HTML is searched case-insensitively for any entry of
    ``CLOUDFLARE_CHECK_KEYWORDS``. On a match the function waits five
    seconds, reloads the page and waits for it to become stable again.
    Best effort: any error is printed and swallowed.

    NOTE(review): the match is a plain substring search over the whole
    document, so a normal page that merely mentions one of the keywords
    would presumably also trigger the wait-and-reload - confirm whether
    that is acceptable.

    Args:
        page: The Playwright page that has just been navigated.
    """
    try:
        # Lowercase the document once instead of once per keyword.
        content = page.content().lower()
        if any(keyword.lower() in content for keyword in CLOUDFLARE_CHECK_KEYWORDS):
            print("⚡ Detected Cloudflare challenge, waiting 5 seconds...")
            # Wait through Playwright instead of time.sleep() so the sync
            # API keeps dispatching events (including route handlers) while
            # the challenge runs.
            page.wait_for_timeout(5000)
            page.reload()
            wait_for_page_stable(page)
    except Exception as e:
        print(f"⚠️ Failed to bypass Cloudflare: {e}")
69
+
70
  # --- Setup ---
71
 
72
  # Device setup
 
222
 
223
  # Fungsi untuk mengambil screenshot viewport
224
  def take_screenshot(url):
225
+ url = ensure_http(url)
226
+ filename = sanitize_filename(url) + '.png'
227
  filepath = os.path.join(SCREENSHOT_DIR, filename)
228
 
229
  try:
230
  print(f"\n=== [START SCREENSHOT] URL: {url} ===")
 
 
231
  with sync_playwright() as p:
232
  print("Launching Playwright Chromium...")
233
+ browser = p.chromium.launch(
234
+ args=['--disable-features=IsolateOrigins,site-per-process'] # Disable site isolation
235
+ )
236
+
237
+ context = browser.new_context(
238
  viewport={"width": 1280, "height": 800},
239
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
240
  )
241
+
242
+ page = context.new_page()
243
+ page.set_default_timeout(PAGE_TIMEOUT)
244
  page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
245
 
246
+ # Block ads and tracking
247
+ print("Setting up ad and tracking blockers...")
248
+ block_ads_and_cookies(page)
249
 
250
+ try:
251
+ print("Navigating to URL...")
252
+ page.goto(url, wait_until="domcontentloaded", timeout=PAGE_TIMEOUT)
253
+
254
+ # Handle potential Cloudflare protection
255
+ detect_and_bypass_cloudflare(page)
256
+
257
+ # Wait for page to be stable
258
+ wait_for_page_stable(page)
259
+
260
+ print("Taking screenshot (viewport only)...")
261
+ page.screenshot(path=filepath)
262
+
263
+ except Exception as nav_error:
264
+ print(f"Navigation error: {nav_error}")
265
+ # Try to take screenshot anyway if page partially loaded
266
+ if page.url != "about:blank":
267
+ page.screenshot(path=filepath)
268
+ else:
269
+ raise nav_error
270
+
271
+ context.close()
272
  browser.close()
273
  print(f"Screenshot saved to {filepath}")
274