from __future__ import annotations

import sys
import time
from dataclasses import dataclass, asdict
from typing import List
from urllib.parse import quote_plus

from bs4 import BeautifulSoup  # type: ignore
import pandas as pd  # type: ignore
from selenium import webdriver  # type: ignore
from selenium.webdriver.common.by import By  # type: ignore
from selenium.webdriver.chrome.service import Service  # type: ignore
from selenium.webdriver.chrome.options import Options  # type: ignore
from selenium.webdriver.support.ui import WebDriverWait  # type: ignore
from selenium.webdriver.support import expected_conditions as EC  # type: ignore
# -----------------------------
# Data model
# -----------------------------
@dataclass
class AdRecord:
    source: str            # "facebook_ad_library" / "html_sample"
    advertiser: str        # Page/Account name if detectable
    ad_text: str           # Primary text we could capture
    ad_link: str           # Link to the ad details or outbound
    media_urls: List[str]  # Images/videos if easily captured
    timestamp: float       # When we scraped
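# Records are plain dataclasses, so they convert straight to dicts/DataFrames.
# A minimal sketch (field values are illustrative only):
#
#   rec = AdRecord(source="html_sample", advertiser="SomeBrand", ad_text="...",
#                  ad_link="https://example.com", media_urls=[], timestamp=time.time())
#   asdict(rec)  # -> {"source": "html_sample", "advertiser": "SomeBrand", ...}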
# -----------------------------
# Pure HTML parsing utils (unit-testable, no Selenium)
# -----------------------------
SAMPLE_AD_HTML = """
<div class="ad-card">
  <div class="header"><span>Sponsored</span> · <a href="https://facebook.com/SomeBrand">SomeBrand</a></div>
  <div class="body">Glow faster with our GenZ serum! #skincare #genz</div>
  <div class="footer"><a href="https://example.com/buy-now">Shop Now</a></div>
</div>
<div class="ad-card">
  <div class="header"><span>Sponsored</span> · <a href="https://facebook.com/AnotherBrand">AnotherBrand</a></div>
  <div class="body">Meet the new foam cleanser — gentle, effective, and clean.</div>
  <div class="footer"><a href="https://example.com/learn-more">Learn More</a></div>
</div>
"""
def extract_ads_from_html(html: str) -> List[AdRecord]:
    """Best-effort extraction from generic ad-like HTML (for testing).

    Looks for blocks that contain the word 'Sponsored'.
    """
    soup = BeautifulSoup(html, "html.parser")
    ads: List[AdRecord] = []
    candidates = soup.find_all(string=lambda s: isinstance(s, str) and "sponsored" in s.lower())
    seen_blocks = set()
    for node in candidates:
        block = node
        # Climb a few levels from the text node to reach the card container
        for _ in range(3):
            if block and block.parent:
                block = block.parent
        if not block or id(block) in seen_blocks:
            continue
        seen_blocks.add(id(block))
        # Advertiser: first link in the block (the header link in the sample)
        advertiser = ""
        adv_a = block.find("a")
        if adv_a and adv_a.text:
            advertiser = adv_a.text.strip()
        # Ad text: prefer an explicit body element, else the whole block's text
        text_parts = []
        body = block.find(class_="body")
        if body and body.text:
            text_parts.append(body.text.strip())
        else:
            text_parts.append(block.get_text(" ", strip=True))
        ad_text = " ".join(text_parts)[:5000]
        # Outbound link: prefer an anchor other than the advertiser link,
        # falling back to the first href found
        link = ""
        a_tags = block.find_all("a", href=True)
        for a_tag in a_tags:
            if a_tag is not adv_a:
                link = a_tag["href"]
                break
        else:
            if a_tags:
                link = a_tags[0]["href"]
        ads.append(
            AdRecord(
                source="html_sample",
                advertiser=advertiser,
                ad_text=ad_text,
                ad_link=link,
                media_urls=[],
                timestamp=time.time(),
            )
        )
    return ads
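# Browser-free usage sketch (only bs4 needed; exercised by run_unit_tests below):
#
#   ads = extract_ads_from_html(SAMPLE_AD_HTML)
#   [a.advertiser for a in ads]  # -> ["SomeBrand", "AnotherBrand"]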
# -----------------------------
# Selenium scraping (Meta Ad Library)
# -----------------------------
def _build_chrome(headless: bool = True) -> webdriver.Chrome:
    opts = Options()
    if headless:
        # "new" headless mode is more stable in recent Chrome
        opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1600,1200")
    # 1) Try the system ChromeDriver first (recommended for CI/sandboxes without ssl)
    try:
        return webdriver.Chrome(options=opts)
    except Exception as e1:
        # 2) Fall back to webdriver_manager (requires internet & ssl), also guarded
        try:
            from webdriver_manager.chrome import ChromeDriverManager  # type: ignore
            return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
        except Exception as e2:
            raise RuntimeError(
                "Failed to start ChromeDriver. Ensure Chrome/Chromium + ChromeDriver are installed "
                "and on PATH, or run in an environment with internet/SSL for webdriver_manager.\n"
                f"System driver error: {e1}\nManager error: {e2}"
            ) from e2
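# Hedged sketch: if ChromeDriver's location is already known, both fallbacks can
# be bypassed by pointing Service at the binary (path below is hypothetical):
#
#   opts = Options()
#   opts.add_argument("--headless=new")
#   driver = webdriver.Chrome(service=Service("/usr/local/bin/chromedriver"), options=opts)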
def scrape_meta_ad_library(
    keyword: str,
    country: str = "IN",
    max_ads: int = 20,
    headless: bool = True,
    scroll_rounds: int = 8,
    per_scroll_pause: float = 2.5,
) -> List[AdRecord]:
    """Scrape Facebook Ad Library search results for a keyword.

    NOTE: Selectors on facebook.com change frequently; this is best-effort and may
    require updates. Works best when you're logged in and have accepted cookies.
    """
    driver = _build_chrome(headless=headless)
    try:
        base = (
            "https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country="
            f"{country}&q={quote_plus(keyword)}&search_type=keyword"
        )
        driver.get(base)
        # Give time for cookie banners or initial JS
        time.sleep(4)
        # Accept cookies if a consent button exists
        try:
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Allow')] | //button[contains(., 'Accept')]"))
            ).click()
            time.sleep(2)
        except Exception:
            pass
        # Scroll to load more results; stop early once the page height stops growing
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(scroll_rounds):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(per_scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        # Heuristic selectors (may require tweaks over time)
        selectors = [
            "div[role='article']",    # generic card
            "div.x1lliihq.x1n2onr6",  # fb new class soup (example)
            "div._99s5",              # legacy ad card
        ]
        cards = []
        for sel in selectors:
            cards.extend(driver.find_elements(By.CSS_SELECTOR, sel))
        records: List[AdRecord] = []
        seen = set()
        for el in cards:
            try:
                text = el.text.strip()
                if not text:
                    continue
                # Require some signal that it's an ad
                if "Sponsored" not in text and "Ad details" not in text and "Why am I seeing this ad" not in text:
                    continue
                advertiser = ""
                try:
                    adv = el.find_element(By.XPATH, ".//a[starts-with(@href, 'https://www.facebook.com/')]")
                    advertiser = adv.text.strip()
                except Exception:
                    pass
                ad_link = ""
                try:
                    link = el.find_element(By.XPATH, ".//a[@href and contains(@href, 'ads/library')]")
                    ad_link = link.get_attribute("href")
                except Exception:
                    # Fallback: first link in the card
                    try:
                        link = el.find_element(By.XPATH, ".//a[@href]")
                        ad_link = link.get_attribute("href")
                    except Exception:
                        pass
                # Deduplicate cards matched by more than one selector
                key = (advertiser, ad_link, hash(text))
                if key in seen:
                    continue
                seen.add(key)
                records.append(
                    AdRecord(
                        source="facebook_ad_library",
                        advertiser=advertiser,
                        ad_text=text[:5000],
                        ad_link=ad_link,
                        media_urls=[],
                        timestamp=time.time(),
                    )
                )
                if len(records) >= max_ads:
                    break
            except Exception:
                # Stale elements after scrolling are expected; skip them
                continue
        return records
    finally:
        driver.quit()
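# Usage sketch (needs a working Chrome/ChromeDriver; keyword is illustrative):
#
#   records = scrape_meta_ad_library("GenZ skin care brand", country="IN", max_ads=10)
#   print(f"collected {len(records)} ads")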
# -----------------------------
# CSV/DF helpers
# -----------------------------
def records_to_dataframe(records: List[AdRecord]) -> pd.DataFrame:
    return pd.DataFrame([asdict(r) for r in records])

def save_records_csv(records: List[AdRecord], path: str = "ads_results.csv") -> str:
    df = records_to_dataframe(records)
    df.to_csv(path, index=False)
    return path
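# Hedged extra: a JSON sibling of save_records_csv(). save_records_json is a
# hypothetical helper, shown because AdRecord serializes cleanly via asdict().
def save_records_json(records: List[AdRecord], path: str = "ads_results.json") -> str:
    import json  # local import; only needed for this optional helper
    with open(path, "w", encoding="utf-8") as f:
        json.dump([asdict(r) for r in records], f, ensure_ascii=False, indent=2)
    return path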
# -----------------------------
# Optional Gradio UI (lazy import to avoid ssl at import time)
# -----------------------------
def launch_gradio_ui():
    try:
        import gradio as gr  # type: ignore
    except Exception as e:  # Includes ModuleNotFoundError: ssl
        print(
            "[WARN] Gradio could not be imported (likely due to missing ssl in this environment).\n"
            "       You can still use the CLI: python ad_scraper.py --keyword 'your term'\n"
            f"       Import error: {e}"
        )
        return

    def _scrape(keyword: str, country: str, max_ads: int, headless: bool):
        if not keyword.strip():
            return pd.DataFrame(), None
        try:
            # Cast max_ads: some Gradio versions deliver slider values as floats
            records = scrape_meta_ad_library(keyword=keyword.strip(), country=country, max_ads=int(max_ads), headless=headless)
        except Exception as e:
            # Surface the error in the results table instead of crashing the UI
            err_df = pd.DataFrame([[str(e)]], columns=["Error"])
            return err_df, None
        if not records:
            return pd.DataFrame({"Info": [f"No results for '{keyword}'"]}), None
        csv_path = save_records_csv(records)
        return records_to_dataframe(records), csv_path

    with gr.Blocks() as demo:
        gr.Markdown("# 📢 Meta Ad Library Scraper (Selenium)\nEnter a keyword to fetch matching ads.")
        with gr.Row():
            kw = gr.Textbox(label="Keyword", value="GenZ skin care brand")
        with gr.Row():
            country = gr.Dropdown(["IN", "US", "GB", "CA", "AU"], value="IN", label="Country")
            max_ads = gr.Slider(1, 100, value=20, step=1, label="Max Ads")
            headless = gr.Checkbox(value=True, label="Headless browser")
        btn = gr.Button("Scrape")
        out_df = gr.Dataframe(label="Results", interactive=False)
        out_file = gr.File(label="Download CSV")
        btn.click(_scrape, inputs=[kw, country, max_ads, headless], outputs=[out_df, out_file])
    demo.launch()
# -----------------------------
# CLI Interface
# -----------------------------
def main_cli(argv: List[str]):
    import argparse
    p = argparse.ArgumentParser(description="Meta Ad Library scraper with Selenium (Gradio optional)")
    p.add_argument("--keyword", "-k", type=str, default="GenZ skin care brand", help="Search keyword")
    p.add_argument("--country", "-c", type=str, default="IN", help="Country code (e.g., IN, US)")
    p.add_argument("--max-ads", type=int, default=20, help="Max ads to collect")
    p.add_argument("--no-headless", action="store_true", help="Run browser with a window")
    p.add_argument("--gradio", action="store_true", help="Launch Gradio UI (requires ssl)")
    p.add_argument("--test", action="store_true", help="Run unit tests for HTML parsing")
    args = p.parse_args(argv)

    if args.gradio:
        launch_gradio_ui()
        return
    if args.test:
        run_unit_tests()
        return

    print(f"[INFO] Scraping Meta Ad Library for keyword='{args.keyword}' in country='{args.country}'...")
    try:
        records = scrape_meta_ad_library(
            keyword=args.keyword,
            country=args.country,
            max_ads=args.max_ads,
            headless=not args.no_headless,
        )
    except Exception as e:
        print("[ERROR] Scrape failed:", e)
        print("Tip: Ensure Chrome + ChromeDriver are installed and on PATH, or re-run with --gradio in an env that has ssl.")
        return
    if not records:
        print("[INFO] No ads found.")
        return
    csv_path = save_records_csv(records)
    print(f"[INFO] Saved {len(records)} ads to {csv_path}")
# -----------------------------
# Tests (no network/browser needed)
# -----------------------------
def run_unit_tests():
    print("[TEST] Running HTML parsing tests...")
    ads = extract_ads_from_html(SAMPLE_AD_HTML)
    assert len(ads) >= 2, "Expected at least 2 ads from sample HTML"
    assert any("GenZ serum" in a.ad_text for a in ads), "Should capture sample ad body text"
    assert any("SomeBrand" in a.advertiser for a in ads), "Should capture advertiser name"
    print("[TEST] OK — basic HTML extraction works.")
if __name__ == "__main__":
    main_cli(sys.argv[1:])