Spaces:
Runtime error
Runtime error
import time | |
from urllib.parse import quote, urljoin | |
from playwright.sync_api import sync_playwright | |
from youtube_utils import get_duration_in_seconds, parse_year_from_text | |
def build_search_url(course_name: str) -> str: | |
modified = f"{course_name} in English -Hindi -हिन्दी" | |
return ( | |
"https://www.youtube.com/results" | |
f"?search_query={quote(modified)}" | |
"&sp=EgIQAw%253D%253D" | |
) | |
def extract_playlist_view_count(playlist_page): | |
view_spans = playlist_page.locator( | |
"yt-content-metadata-view-model span.yt-core-attributed-string" | |
) | |
for i in range(view_spans.count()): | |
text = view_spans.nth(i).text_content().strip() | |
if "views" in text.lower(): | |
import re | |
m = re.search(r"(\d[\d,]*)", text) | |
if m: | |
return int(m.group(1).replace(",", "")) | |
return 0 | |
def extract_first_video_year(playlist_page): | |
stats = playlist_page.locator( | |
"yt-formatted-string#video-info span.style-scope.yt-formatted-string" | |
) | |
texts = [stats.nth(i).text_content().strip() for i in range(stats.count())] | |
return parse_year_from_text(texts) | |
def scrape_playlists(course_name: str, headless: bool = True) -> dict: | |
search_url = build_search_url(course_name) | |
base = "https://www.youtube.com" | |
out = {"search_url": search_url, "playlists": []} | |
with sync_playwright() as p: | |
browser = p.chromium.launch(headless=headless) | |
page = browser.new_page() | |
page.goto(search_url) | |
page.wait_for_load_state("networkidle") | |
#page_scroll | |
for _ in range(20): | |
page.keyboard.press("PageDown") | |
time.sleep(0.3) | |
cards = page.locator("yt-lockup-view-model.ytd-item-section-renderer.lockup") | |
total = cards.count() | |
print(f"▶️ Found {total} playlist cards") | |
for i in range(total): | |
card = cards.nth(i) | |
tloc = card.locator("h3 a.yt-lockup-metadata-view-model-wiz__title") | |
title = (tloc.get_attribute("title") or tloc.text_content() or "").strip() | |
href = tloc.get_attribute("href") or "" | |
url = urljoin(base, href) | |
badge = card.locator("div.badge-shape-wiz__text").first | |
raw = badge.text_content().strip() if badge.count() else "" | |
import re | |
m = re.search(r"(\d+)", raw.replace(",", "")) | |
count = int(m.group(1)) if m else 0 | |
if count > 25: | |
continue | |
view_link = card.locator("a.yt-core-attributed-string__link", has_text="View full playlist").first | |
if not view_link.count(): | |
continue | |
full_playlist_url = urljoin(base, view_link.get_attribute("href")) | |
playlist_page = browser.new_page() | |
playlist_page.goto(full_playlist_url) | |
playlist_page.wait_for_timeout(3000) | |
playlist_page.keyboard.press("PageDown") | |
time.sleep(1) | |
durations = playlist_page.locator( | |
"div.thumbnail-overlay-badge-shape.style-scope.ytd-thumbnail-overlay-time-status-renderer >> div.badge-shape-wiz__text" | |
) | |
long_video_found = False | |
for j in range(durations.count()): | |
text = durations.nth(j).text_content().strip() | |
if get_duration_in_seconds(text) > 1800: | |
long_video_found = True | |
break | |
if not long_video_found: | |
views = extract_playlist_view_count(playlist_page) | |
year = extract_first_video_year(playlist_page) | |
out["playlists"].append({ | |
"title": title, | |
"url": url, | |
"video_count": count, | |
"full_playlist_url": full_playlist_url, | |
"views": views, | |
"year": year | |
}) | |
playlist_page.close() | |
browser.close() | |
out["playlists"].sort(key=lambda x: (-x["views"], -x["year"])) | |
return out |