import cloudscraper
from bs4 import BeautifulSoup
import time

BASE_URL = "https://www.lightreading.com"
AUTHOR_URL = f"{BASE_URL}/author/iain-morris"
TARGET_COUNT = 100
DELAY = 1  # polite wait between requests

# Create a scraper that bypasses Cloudflare protection
scraper = cloudscraper.create_scraper()
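# Hedged alternative (an assumption, not part of the original script):
# create_scraper() also accepts a browser profile, which can help if the
# default client fingerprint gets challenged by Cloudflare.
# scraper = cloudscraper.create_scraper(
#     browser={"browser": "chrome", "platform": "windows", "mobile": False}
# )
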
def fetch_page(url):
    """Fetch and parse a page from the given URL."""
    resp = scraper.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

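# A minimal retry sketch (illustrative; not wired into the main flow below).
# The retry count and backoff base are assumptions, not values from the
# original script.
def fetch_page_with_retries(url, retries=3, backoff=2):
    """Fetch a page, retrying with exponential backoff on transient errors."""
    for attempt in range(retries):
        try:
            return fetch_page(url)
        except Exception as exc:
            if attempt == retries - 1:
                raise
            wait = backoff ** attempt
            print(f"Fetch failed ({exc}); retrying in {wait}s …")
            time.sleep(wait)
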
def extract_article_links(soup):
    """Extract valid article links from a BeautifulSoup object."""
    links = []
    # Use the correct selector based on the actual HTML structure
    for a in soup.select('a.ListPreview-Title[data-testid="preview-default-title"]'):
        href = a["href"]
        if href.startswith("/author/"):
            continue  # skip author links
        full_url = BASE_URL + href if href.startswith("/") else href
        links.append(full_url)
    return links

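# Fallback sketch (hypothetical — the primary selector above is tied to the
# site's current markup and may break): a looser pass that drops the
# data-testid constraint and keeps any ListPreview-Title anchor with an href.
def extract_article_links_loose(soup):
    """Looser extraction to try only if the primary selector finds nothing."""
    links = []
    for a in soup.select("a.ListPreview-Title[href]"):
        href = a["href"]
        if href.startswith("/author/"):
            continue
        links.append(BASE_URL + href if href.startswith("/") else href)
    return links
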
def scrape_latest_urls():
    """Scrape up to TARGET_COUNT article URLs from paginated author pages."""
    urls, seen = [], set()
    page_num = 1
    while len(urls) < TARGET_COUNT:
        page_url = f"{AUTHOR_URL}?page={page_num}"
        print(f"Fetching {page_url} …")
        soup = fetch_page(page_url)
        found = extract_article_links(soup)
        if not found:
            print("No more articles found; stopping.")
            break
        for u in found:
            if u not in seen:
                seen.add(u)
                urls.append(u)
                if len(urls) >= TARGET_COUNT:
                    break
        page_num += 1
        time.sleep(DELAY)
    return urls

if __name__ == "__main__":
    urls = scrape_latest_urls()
    print(f"\n✅ Collected {len(urls)} article URLs:\n")
    for idx, url in enumerate(urls, 1):
        print(f"{idx}. {url}")