Azzan Dwi Riski
committed on
Commit
·
bf89d59
1
Parent(s):
277db83
update the code to handle ads and cloudflare challenge
Browse files
app.py
CHANGED
@@ -20,6 +20,53 @@ from pathlib import Path
|
|
20 |
import subprocess
|
21 |
import traceback
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# --- Setup ---
|
24 |
|
25 |
# Device setup
|
@@ -175,29 +222,53 @@ def clean_text(text):
|
|
175 |
|
176 |
# Fungsi untuk mengambil screenshot viewport
|
177 |
def take_screenshot(url):
|
178 |
-
|
|
|
179 |
filepath = os.path.join(SCREENSHOT_DIR, filename)
|
180 |
|
181 |
try:
|
182 |
print(f"\n=== [START SCREENSHOT] URL: {url} ===")
|
183 |
-
from playwright.sync_api import sync_playwright
|
184 |
-
|
185 |
with sync_playwright() as p:
|
186 |
print("Launching Playwright Chromium...")
|
187 |
-
browser = p.chromium.launch(
|
188 |
-
|
|
|
|
|
|
|
189 |
viewport={"width": 1280, "height": 800},
|
190 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
|
191 |
)
|
192 |
-
|
|
|
|
|
193 |
page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
page
|
198 |
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
browser.close()
|
202 |
print(f"Screenshot saved to {filepath}")
|
203 |
|
|
|
20 |
import subprocess
|
21 |
import traceback
|
22 |
|
23 |
+
# =============================================
|
24 |
+
# CONFIGURATION
|
25 |
+
# =============================================
|
26 |
+
|
27 |
+
BLOCK_PATTERNS = ["doubleclick", "adservice", "googlesyndication", "ads", "adserver", "cookie", "consent"]
|
28 |
+
PAGE_TIMEOUT = 60000 # 60 seconds
|
29 |
+
WAIT_FOR_LOAD_TIMEOUT = 10000 # 10 seconds extra wait after load
|
30 |
+
CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]
|
31 |
+
|
32 |
+
# =============================================
|
33 |
+
# HELPER FUNCTIONS
|
34 |
+
# =============================================
|
35 |
+
|
36 |
+
def ensure_http(url):
    """Return *url* with an explicit scheme, defaulting to ``http://``.

    Surrounding whitespace is stripped first, and the scheme check is
    case-insensitive so that inputs such as ``HTTPS://example.com`` are
    recognised as already having a scheme instead of being prefixed again.

    Args:
        url: The address as a string, with or without a scheme.

    Returns:
        The stripped URL, prefixed with ``http://`` when it did not already
        start with ``http://`` or ``https://`` (in any letter case).
    """
    url = url.strip()
    # Compare on a lowered copy only; the URL itself is returned unchanged.
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'http://' + url
|
40 |
+
|
41 |
+
def sanitize_filename(url, max_length=200):
    """Turn *url* into a string that is safe to use as a file name.

    Every character that is not a word character, ``-``, ``_``, ``.`` or a
    space is replaced with ``_``. Results longer than *max_length* characters
    are truncated and suffixed with a short hash of the full URL, so that
    very long URLs neither exceed file-name length limits nor collide with
    each other after truncation.

    Args:
        url: The URL (or any string) to convert.
        max_length: Maximum number of characters in the returned name.

    Returns:
        The sanitized name, at most *max_length* characters long.
    """
    # Local import: hashlib is only needed by this helper.
    import hashlib

    safe = re.sub(r'[^\w\-_\. ]', '_', url)
    if len(safe) <= max_length:
        return safe

    # Keep truncated names unique by appending a digest of the original URL.
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:12]
    keep = max(max_length - len(digest) - 1, 0)
    return (safe[:keep] + '_' + digest)[:max_length]
|
43 |
+
|
44 |
+
def block_ads_and_cookies(page, patterns=None):
    """Install a request interceptor on *page* that aborts matching requests.

    A request is aborted when its lower-cased URL contains any of the
    patterns as a substring. The navigation request of the page's top-level
    frame is never aborted: the patterns are plain substrings, so a target
    URL that happens to contain one of them would otherwise cancel the very
    page load the caller asked for.

    Args:
        page: Playwright page object; only ``page.route`` is used here.
        patterns: Iterable of substrings to block. Defaults to the
            module-level ``BLOCK_PATTERNS``.
    """
    if patterns is None:
        patterns = BLOCK_PATTERNS
    needles = tuple(pattern.lower() for pattern in patterns)

    def is_main_navigation(request):
        # request.frame can be unavailable for some requests; in that case
        # the request is not treated as the main document.
        try:
            return (
                request.is_navigation_request()
                and request.frame.parent_frame is None
            )
        except Exception:
            return False

    def route_intercept(route):
        request = route.request
        target = request.url.lower()
        matched = any(needle in target for needle in needles)
        if matched and not is_main_navigation(request):
            route.abort()
        else:
            route.continue_()

    page.route("**/*", route_intercept)
|
51 |
+
|
52 |
+
def wait_for_page_stable(page, timeout=None, settle_ms=None):
    """Wait for network idle on *page*, then pause briefly for late rendering.

    This is best-effort: any exception raised while waiting (for example a
    timeout) is printed and swallowed so the caller can continue.

    The extra pause uses ``page.wait_for_timeout`` rather than ``time.sleep``
    so that Playwright keeps processing events while waiting.

    Args:
        page: Playwright page object.
        timeout: Milliseconds to wait for the ``networkidle`` load state.
            Defaults to the module-level ``PAGE_TIMEOUT``.
        settle_ms: Extra milliseconds to wait after network idle. Defaults
            to the module-level ``WAIT_FOR_LOAD_TIMEOUT``.
    """
    if timeout is None:
        timeout = PAGE_TIMEOUT
    if settle_ms is None:
        settle_ms = WAIT_FOR_LOAD_TIMEOUT

    try:
        page.wait_for_load_state('networkidle', timeout=timeout)
        page.wait_for_timeout(settle_ms)  # extra wait
    except Exception as e:
        print(f"⚠️ Page not fully stable: {e}")
|
58 |
+
|
59 |
+
def detect_and_bypass_cloudflare(page, keywords=None):
    """Reload *page* once if its HTML looks like a Cloudflare challenge.

    The page content is searched case-insensitively for each keyword. On a
    match, the function waits five seconds, reloads the page, and waits for
    it to stabilise again. This is best-effort: any exception is printed and
    swallowed.

    The pause uses ``page.wait_for_timeout`` rather than ``time.sleep`` so
    that Playwright keeps processing events while the challenge runs.

    Args:
        page: Playwright page object.
        keywords: Iterable of marker strings. Defaults to the module-level
            ``CLOUDFLARE_CHECK_KEYWORDS``.
    """
    if keywords is None:
        keywords = CLOUDFLARE_CHECK_KEYWORDS

    try:
        # Lower-case the HTML once instead of once per keyword.
        content = page.content().lower()
        if any(keyword.lower() in content for keyword in keywords):
            print("⚡ Detected Cloudflare challenge, waiting 5 seconds...")
            page.wait_for_timeout(5000)
            page.reload()
            wait_for_page_stable(page)
    except Exception as e:
        print(f"⚠️ Failed to bypass Cloudflare: {e}")
|
69 |
+
|
70 |
# --- Setup ---
|
71 |
|
72 |
# Device setup
|
|
|
222 |
|
223 |
# Fungsi untuk mengambil screenshot viewport
|
224 |
def take_screenshot(url):
|
225 |
+
url = ensure_http(url)
|
226 |
+
filename = sanitize_filename(url) + '.png'
|
227 |
filepath = os.path.join(SCREENSHOT_DIR, filename)
|
228 |
|
229 |
try:
|
230 |
print(f"\n=== [START SCREENSHOT] URL: {url} ===")
|
|
|
|
|
231 |
with sync_playwright() as p:
|
232 |
print("Launching Playwright Chromium...")
|
233 |
+
browser = p.chromium.launch(
|
234 |
+
args=['--disable-features=IsolateOrigins,site-per-process'] # Disable site isolation
|
235 |
+
)
|
236 |
+
|
237 |
+
context = browser.new_context(
|
238 |
viewport={"width": 1280, "height": 800},
|
239 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
|
240 |
)
|
241 |
+
|
242 |
+
page = context.new_page()
|
243 |
+
page.set_default_timeout(PAGE_TIMEOUT)
|
244 |
page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
|
245 |
|
246 |
+
# Block ads and tracking
|
247 |
+
print("Setting up ad and tracking blockers...")
|
248 |
+
block_ads_and_cookies(page)
|
249 |
|
250 |
+
try:
|
251 |
+
print("Navigating to URL...")
|
252 |
+
page.goto(url, wait_until="domcontentloaded", timeout=PAGE_TIMEOUT)
|
253 |
+
|
254 |
+
# Handle potential Cloudflare protection
|
255 |
+
detect_and_bypass_cloudflare(page)
|
256 |
+
|
257 |
+
# Wait for page to be stable
|
258 |
+
wait_for_page_stable(page)
|
259 |
+
|
260 |
+
print("Taking screenshot (viewport only)...")
|
261 |
+
page.screenshot(path=filepath)
|
262 |
+
|
263 |
+
except Exception as nav_error:
|
264 |
+
print(f"Navigation error: {nav_error}")
|
265 |
+
# Try to take screenshot anyway if page partially loaded
|
266 |
+
if page.url != "about:blank":
|
267 |
+
page.screenshot(path=filepath)
|
268 |
+
else:
|
269 |
+
raise nav_error
|
270 |
+
|
271 |
+
context.close()
|
272 |
browser.close()
|
273 |
print(f"Screenshot saved to {filepath}")
|
274 |
|