from __future__ import annotations import sys import os import time import csv import json import dataclasses from dataclasses import dataclass, asdict from typing import List, Dict, Optional, Tuple from urllib.parse import quote_plus from bs4 import BeautifulSoup # type: ignore import pandas as pd # type: ignore from selenium import webdriver # type: ignore from selenium.webdriver.common.by import By # type: ignore from selenium.webdriver.chrome.service import Service # type: ignore from selenium.webdriver.chrome.options import Options # type: ignore from selenium.webdriver.support.ui import WebDriverWait # type: ignore from selenium.webdriver.support import expected_conditions as EC # type: ignore from selenium.common.exceptions import TimeoutException, NoSuchElementException # type: ignore # ----------------------------- # Data model # ----------------------------- @dataclass class AdRecord: source: str # "facebook_ad_library" / "instagram" advertiser: str # Page/Account name if detectable ad_text: str # Primary text we could capture ad_link: str # Link to the ad details or outbound media_urls: List[str] # Images/videos if easily captured timestamp: float # When we scraped # ----------------------------- # Pure HTML parsing utils (unit-testable, no Selenium) # ----------------------------- SAMPLE_AD_HTML = """
Sponsored · SomeBrand
Glow faster with our GenZ serum! #skincare #genz
Sponsored · AnotherBrand
Meet the new foam cleanser — gentle, effective, and clean.
""" def extract_ads_from_html(html: str) -> List[AdRecord]: """Best-effort extraction from generic ad-like HTML (for testing). Looks for blocks that contain the word 'Sponsored'. """ soup = BeautifulSoup(html, "html.parser") ads: List[AdRecord] = [] candidates = soup.find_all(string=lambda s: isinstance(s, str) and "sponsored" in s.lower()) seen_blocks = set() for node in candidates: block = node # Climb to a container block for _ in range(3): if block and block.parent: block = block.parent if not block or id(block) in seen_blocks: continue seen_blocks.add(id(block)) advertiser = "" adv_a = block.find("a") if adv_a and adv_a.text: advertiser = adv_a.text.strip() text_parts = [] body = block.find(class_="body") if body and body.text: text_parts.append(body.text.strip()) else: text_parts.append(block.get_text(" ", strip=True)) ad_text = " ".join(text_parts)[:5000] link = "" a_tag = block.find("a", href=True) if a_tag: link = a_tag["href"] ads.append( AdRecord( source="html_sample", advertiser=advertiser, ad_text=ad_text, ad_link=link, media_urls=[], timestamp=time.time(), ) ) return ads # ----------------------------- # Selenium scraping (Meta Ad Library) # ----------------------------- def _build_chrome(headless: bool = True) -> webdriver.Chrome: opts = Options() if headless: # "new" headless is more stable in recent Chrome opts.add_argument("--headless=new") opts.add_argument("--no-sandbox") opts.add_argument("--disable-dev-shm-usage") opts.add_argument("--window-size=1600,1200") # 1) Try system ChromeDriver first (recommended for CI/sandboxes without ssl) try: return webdriver.Chrome(options=opts) except Exception as e1: # 2) Fallback to webdriver_manager (requires internet & ssl). Do inside try. try: from webdriver_manager.chrome import ChromeDriverManager # type: ignore return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts) except Exception as e2: raise RuntimeError( "Failed to start ChromeDriver. Ensure Chrome/Chromium + ChromeDriver are installed " "and on PATH, or run in an environment with internet/SSL for webdriver_manager.\n" f"System driver error: {e1}\nManager error: {e2}" ) def scrape_meta_ad_library( keyword: str, country: str = "IN", max_ads: int = 20, headless: bool = True, scroll_rounds: int = 8, per_scroll_pause: float = 2.5, ) -> List[AdRecord]: """Scrape Facebook Ad Library search results for a keyword. NOTE: Selectors on facebook.com change frequently; this is best-effort and may require updates. Works best when you're logged in and have accepted cookies. """ driver = _build_chrome(headless=headless) try: base = ( "https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=" f"{country}&q={quote_plus(keyword)}&search_type=keyword" ) driver.get(base) # Give time for cookie banners or initial JS time.sleep(4) # Try to accept cookies if a button exists try: WebDriverWait(driver, 5).until( EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Allow')] | //button[contains(., 'Accept')]")) ).click() time.sleep(2) except Exception: pass # Scroll to load more results last_height = driver.execute_script("return document.body.scrollHeight") for _ in range(scroll_rounds): driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(per_scroll_pause) new_height = driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height cards = [] # Heuristic selectors (may require tweaks over time) selectors = [ "div[role='article']", # generic card "div.x1lliihq.x1n2onr6", # fb new class soup (example) "div._99s5", # legacy ad card ] seen = set() for sel in selectors: cards.extend(driver.find_elements(By.CSS_SELECTOR, sel)) records: List[AdRecord] = [] for el in cards: try: text = el.text.strip() if not text: continue # Require some signal that it's an ad if "Sponsored" not in text and "Ad details" not in text and "Why am I seeing this ad" not in text: continue advertiser = "" try: adv = el.find_element(By.XPATH, ".//a[starts-with(@href, 'https://www.facebook.com/')]") advertiser = adv.text.strip() except Exception: pass ad_link = "" try: link = el.find_element(By.XPATH, ".//a[@href and contains(@href, 'ads/library')]") ad_link = link.get_attribute("href") except Exception: # fallback: first link try: link = el.find_element(By.XPATH, ".//a[@href]") ad_link = link.get_attribute("href") except Exception: pass key = (advertiser, ad_link, hash(text)) if key in seen: continue seen.add(key) records.append( AdRecord( source="facebook_ad_library", advertiser=advertiser, ad_text=text[:5000], ad_link=ad_link, media_urls=[], timestamp=time.time(), ) ) if len(records) >= max_ads: break except Exception: continue return records finally: driver.quit() # ----------------------------- # CSV/DF helpers # ----------------------------- def records_to_dataframe(records: List[AdRecord]) -> pd.DataFrame: return pd.DataFrame([asdict(r) for r in records]) def save_records_csv(records: List[AdRecord], path: str = "ads_results.csv") -> str: df = records_to_dataframe(records) df.to_csv(path, index=False) return path # ----------------------------- # Optional Gradio UI (lazy import to avoid ssl at import time) # ----------------------------- def launch_gradio_ui(): try: import gradio as gr # type: ignore except Exception as e: # Includes ModuleNotFoundError: ssl print( "[WARN] Gradio could not be imported (likely due to missing ssl in this environment).\n" " You can still use the CLI: python ad_scraper.py --keyword 'your term'\n" f" Import error: {e}" ) return def _scrape(keyword: str, country: str, max_ads: int, headless: bool): if not keyword.strip(): return pd.DataFrame(), None try: records = scrape_meta_ad_library(keyword=keyword.strip(), country=country, max_ads=max_ads, headless=headless) except Exception as e: # Show error in a friendly way err_df = pd.DataFrame([[str(e)]], columns=["Error"]) return err_df, None if not records: return pd.DataFrame(columns=[f"No results for '{keyword}'"]), None csv_path = save_records_csv(records) return records_to_dataframe(records), csv_path with gr.Blocks() as demo: gr.Markdown("# 📢 Meta Ad Library Scraper (Selenium)\nEnter a keyword to fetch matching ads.") with gr.Row(): kw = gr.Textbox(label="Keyword", value="GenZ skin care brand") with gr.Row(): country = gr.Dropdown(["IN","US","GB","CA","AU"], value="IN", label="Country") max_ads = gr.Slider(1, 100, value=20, step=1, label="Max Ads") headless = gr.Checkbox(value=True, label="Headless browser") btn = gr.Button("Scrape") out_df = gr.Dataframe(label="Results", interactive=False) out_file = gr.File(label="Download CSV") btn.click(_scrape, inputs=[kw, country, max_ads, headless], outputs=[out_df, out_file]) demo.launch() # ----------------------------- # CLI Interface # ----------------------------- def main_cli(argv: List[str]): import argparse p = argparse.ArgumentParser(description="Meta Ad Library scraper with Selenium (Gradio optional)") p.add_argument("--keyword", "-k", type=str, default="GenZ skin care brand", help="Search keyword") p.add_argument("--country", "-c", type=str, default="IN", help="Country code (e.g., IN, US)") p.add_argument("--max-ads", type=int, default=20, help="Max ads to collect") p.add_argument("--no-headless", action="store_true", help="Run browser with a window") p.add_argument("--gradio", action="store_true", help="Launch Gradio UI (requires ssl)") p.add_argument("--test", action="store_true", help="Run unit tests for HTML parsing") args = p.parse_args(argv) if args.gradio: launch_gradio_ui() return if args.test: run_unit_tests() return print(f"[INFO] Scraping Meta Ad Library for keyword='{args.keyword}' in country='{args.country}'...") try: records = scrape_meta_ad_library( keyword=args.keyword, country=args.country, max_ads=args.max_ads, headless=not args.no_headless, ) except Exception as e: print("[ERROR] Scrape failed:", e) print("Tip: Ensure Chrome + ChromeDriver are installed and on PATH, or re-run with --gradio in an env that has ssl.") return if not records: print("[INFO] No ads found.") return csv_path = save_records_csv(records) print(f"[INFO] Saved {len(records)} ads to {csv_path}") # ----------------------------- # Tests (no network/browser needed) # ----------------------------- def run_unit_tests(): print("[TEST] Running HTML parsing tests...") ads = extract_ads_from_html(SAMPLE_AD_HTML) assert len(ads) >= 2, "Expected at least 2 ads from sample HTML" assert any("GenZ serum" in a.ad_text for a in ads), "Should capture sample ad body text" assert any("SomeBrand" in a.advertiser for a in ads), "Should capture advertiser name" print("[TEST] OK — basic HTML extraction works.") if __name__ == "__main__": main_cli(sys.argv[1:])