Azzan Dwi Riski
committed on
Commit
·
c0af825
1
Parent(s):
bf89d59
update the code to handle ads and cloudflare challenge fixed
Browse files
app.py
CHANGED
@@ -24,10 +24,15 @@ import traceback
|
|
24 |
# CONFIGURATION
|
25 |
# =============================================
|
26 |
|
27 |
-
BLOCK_PATTERNS = [
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
30 |
CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]
|
|
|
31 |
|
32 |
# =============================================
|
33 |
# HELPER FUNCTIONS
|
@@ -51,8 +56,17 @@ def block_ads_and_cookies(page):
|
|
51 |
|
52 |
def wait_for_page_stable(page):
|
53 |
try:
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
except Exception as e:
|
57 |
print(f"⚠️ Page not fully stable: {e}")
|
58 |
|
@@ -220,7 +234,57 @@ def clean_text(text):
|
|
220 |
return "" # empty return to use image-only
|
221 |
return text
|
222 |
|
223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
def take_screenshot(url):
|
225 |
url = ensure_http(url)
|
226 |
filename = sanitize_filename(url) + '.png'
|
@@ -228,52 +292,54 @@ def take_screenshot(url):
|
|
228 |
|
229 |
try:
|
230 |
print(f"\n=== [START SCREENSHOT] URL: {url} ===")
|
|
|
231 |
with sync_playwright() as p:
|
232 |
-
print("Launching
|
233 |
-
|
234 |
-
|
235 |
-
)
|
236 |
|
237 |
-
|
238 |
-
|
239 |
-
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
|
240 |
-
)
|
241 |
|
242 |
-
page = context.new_page()
|
243 |
-
page.set_default_timeout(PAGE_TIMEOUT)
|
244 |
-
page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
|
245 |
-
|
246 |
-
# Block ads and tracking
|
247 |
-
print("Setting up ad and tracking blockers...")
|
248 |
-
block_ads_and_cookies(page)
|
249 |
-
|
250 |
try:
|
251 |
-
print("
|
252 |
-
page.goto(
|
|
|
|
|
|
|
|
|
253 |
|
254 |
-
|
255 |
-
|
|
|
|
|
256 |
|
257 |
-
#
|
258 |
wait_for_page_stable(page)
|
259 |
|
260 |
-
|
|
|
261 |
page.screenshot(path=filepath)
|
262 |
|
263 |
except Exception as nav_error:
|
264 |
print(f"Navigation error: {nav_error}")
|
265 |
-
# Try to take screenshot anyway if
|
266 |
-
|
267 |
-
page.
|
268 |
-
|
|
|
|
|
|
|
|
|
269 |
raise nav_error
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
|
|
277 |
|
278 |
except Exception as e:
|
279 |
print(f"[ERROR] Failed to take screenshot for URL: {url}")
|
|
|
24 |
# CONFIGURATION
|
25 |
# =============================================
|
26 |
|
27 |
+
# Substrings that mark a request as blockable. They are compared against the
# lower-cased request URL, so short entries such as "ads" also match inside
# longer words.
BLOCK_PATTERNS = [
    # Ad networks / ad servers
    "doubleclick",
    "adservice",
    "googlesyndication",
    "ads",
    "adserver",
    # Cookie and consent prompts
    "cookie",
    "consent",
    # Analytics and tracking
    "analytics",
    "tracker",
    "tracking",
    "stats",
    "metric",
    "telemetry",
    # Social widgets
    "social",
    "facebook",
    "twitter",
    "linkedin",
    "pinterest",
    # Overlays
    "popup",
    "notification",
    "banner",
]

# Timeouts are in milliseconds.
PAGE_TIMEOUT = 30000  # 30 seconds
WAIT_FOR_LOAD_TIMEOUT = 5000  # 5 seconds

# NOTE(review): presumably matched against page text to detect a Cloudflare
# interstitial; the consumer is not visible here - confirm.
CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]

# Upper bound on the number of redirects to follow.
MAX_REDIRECTS = 5
|
36 |
|
37 |
# =============================================
|
38 |
# HELPER FUNCTIONS
|
|
|
56 |
|
57 |
def wait_for_page_stable(page):
    """Best-effort wait for *page* to settle before it is captured.

    Waits for the ``domcontentloaded`` load state (up to ``PAGE_TIMEOUT`` ms),
    then for ``networkidle`` (up to ``WAIT_FOR_LOAD_TIMEOUT`` ms), and finally
    pauses for two more seconds. Failures are printed rather than raised, so
    the caller can continue with a partially loaded page.

    Args:
        page: The Playwright page to wait on.
    """
    try:
        # First wait for DOM content
        page.wait_for_load_state('domcontentloaded', timeout=PAGE_TIMEOUT)

        # Then wait for network to be idle. Pages that keep polling never go
        # idle, so a failure here is expected and not treated as an error.
        try:
            page.wait_for_load_state('networkidle', timeout=WAIT_FOR_LOAD_TIMEOUT)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            print("Network not fully idle, continuing anyway...")

        # Small additional wait. Uses Playwright's own timer instead of
        # time.sleep() so the sync API keeps processing events (including
        # route handlers) while waiting.
        page.wait_for_timeout(2000)
    except Exception as e:
        print(f"⚠️ Page not fully stable: {e}")
|
72 |
|
|
|
234 |
return "" # empty return to use image-only
|
235 |
return text
|
236 |
|
237 |
+
def create_browser_context(playwright):
    """Launch Chromium and return a fresh browser context for page capture.

    Args:
        playwright: Object exposing ``chromium.launch`` (the value yielded by
            ``sync_playwright()`` in the caller).

    Returns:
        The context created by ``new_context`` on the launched browser. The
        browser object itself is not returned.
    """
    # Chromium command-line switches: site isolation, web security, the
    # sandbox, GPU and /dev/shm usage are all switched off.
    launch_flags = [
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-web-security',
        '--disable-site-isolation-trials',
        '--disable-setuid-sandbox',
        '--no-sandbox',
        '--disable-gpu',
        '--disable-dev-shm-usage',
    ]

    # Headers attached to every request made from the context.
    request_headers = {
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "DNT": "1",
    }

    browser = playwright.chromium.launch(args=launch_flags)
    return browser.new_context(
        viewport={"width": 1280, "height": 800},
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        ignore_https_errors=True,
        java_script_enabled=True,
        bypass_csp=True,
        extra_http_headers=request_headers,
    )
|
261 |
+
|
262 |
+
def setup_request_interception(page, block_patterns=None, max_redirects=None):
    """Install a route handler on *page* that filters outgoing requests.

    The handler, registered for every URL (``**/*``), applies these rules in
    order:

    1. A request whose lower-cased URL contains any blocked substring is
       aborted - unless it is the navigation of the top-level frame, which
       must go through or there would be no page to capture.
    2. A request that was reached through more than ``max_redirects``
       redirects is aborted.
    3. Everything else continues unchanged.

    Args:
        page: Playwright page to attach the handler to.
        block_patterns: Substrings to block; defaults to ``BLOCK_PATTERNS``.
        max_redirects: Longest redirect chain allowed; defaults to
            ``MAX_REDIRECTS``.
    """
    patterns = BLOCK_PATTERNS if block_patterns is None else block_patterns
    redirect_limit = MAX_REDIRECTS if max_redirects is None else max_redirects

    def is_top_level_navigation(request):
        # True only for the document request of the page's main frame.
        if not request.is_navigation_request():
            return False
        try:
            return request.frame.parent_frame is None
        except Exception:
            # Playwright documents that `request.frame` can be unavailable
            # (e.g. the frame has not been created yet); such a request is
            # not the already-existing main frame.
            return False

    def count_redirects(request):
        # Length of the redirect chain that led to this request.
        hops = 0
        previous = request.redirected_from
        while previous is not None:
            hops += 1
            previous = previous.redirected_from
        return hops

    def handle_request(route):
        request = route.request

        # Block known ad/tracking patterns, but never the page itself.
        if not is_top_level_navigation(request) and any(
            pattern in request.url.lower() for pattern in patterns
        ):
            print(f"Blocking request to: {request.url}")
            route.abort()
            return

        # Handle redirects, counted per request chain.
        # NOTE(review): Playwright's docs say a route handler is only called
        # for the first URL of a redirected response, so this guard may rarely
        # trigger in practice - confirm before relying on it.
        redirect_count = count_redirects(request)
        if redirect_count > redirect_limit:
            print(f"Too many redirects ({redirect_count}), aborting request")
            route.abort()
            return

        # Continue with the request
        route.continue_()

    page.route("**/*", handle_request)
|
287 |
+
|
288 |
def take_screenshot(url):
|
289 |
url = ensure_http(url)
|
290 |
filename = sanitize_filename(url) + '.png'
|
|
|
292 |
|
293 |
try:
|
294 |
print(f"\n=== [START SCREENSHOT] URL: {url} ===")
|
295 |
+
|
296 |
with sync_playwright() as p:
|
297 |
+
print("Launching browser with custom configuration...")
|
298 |
+
context = create_browser_context(p)
|
299 |
+
page = context.new_page()
|
|
|
300 |
|
301 |
+
print("Setting up request interception...")
|
302 |
+
setup_request_interception(page)
|
|
|
|
|
303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
try:
|
305 |
+
print("Attempting to navigate to URL...")
|
306 |
+
response = page.goto(
|
307 |
+
url,
|
308 |
+
wait_until="commit", # Changed to commit instead of domcontentloaded
|
309 |
+
timeout=PAGE_TIMEOUT
|
310 |
+
)
|
311 |
|
312 |
+
if not response:
|
313 |
+
print("No response received, attempting to continue...")
|
314 |
+
elif response.status >= 400:
|
315 |
+
print(f"Received error status code: {response.status}")
|
316 |
|
317 |
+
# Try to wait for the page to be more stable
|
318 |
wait_for_page_stable(page)
|
319 |
|
320 |
+
# Take screenshot even if page might not be fully loaded
|
321 |
+
print("Taking screenshot...")
|
322 |
page.screenshot(path=filepath)
|
323 |
|
324 |
except Exception as nav_error:
|
325 |
print(f"Navigation error: {nav_error}")
|
326 |
+
# Try to take screenshot anyway if we have any content
|
327 |
+
try:
|
328 |
+
if page.url != "about:blank":
|
329 |
+
print("Taking screenshot of partial page...")
|
330 |
+
page.screenshot(path=filepath)
|
331 |
+
else:
|
332 |
+
raise nav_error
|
333 |
+
except:
|
334 |
raise nav_error
|
335 |
+
finally:
|
336 |
+
context.close()
|
337 |
+
|
338 |
+
if os.path.exists(filepath):
|
339 |
+
print(f"Screenshot saved successfully to {filepath}")
|
340 |
+
return filepath
|
341 |
+
else:
|
342 |
+
raise Exception("Screenshot file was not created")
|
343 |
|
344 |
except Exception as e:
|
345 |
print(f"[ERROR] Failed to take screenshot for URL: {url}")
|