import json
import time
from typing import List, Dict

from crawler import crawl_pagalgana_site, load_crawl_state
from metadata_extractor import extract_song_metadata
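
# Two-phase pipeline: Phase 1 crawls pagalgana.com and records song page
# URLs; Phase 2 visits each recorded page and extracts its metadata to JSON.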


def main():
    images = []  # image records handed to the crawler and returned updated

    # Crawl seed, depth limit, and checkpoint/output file locations.
    BASE_URL = "https://pagalgana.com/12-baje-le-chalau-blender-2025-raushan-rohi-bhojp-uuw.html"
    MAX_CRAWL_DEPTH = 10
    CRAWL_STATE_FILE = "bollywood_crawl_state.json"
    SONG_PAGES_FILE = "bollywood_song_pages.json"
    METADATA_OUTPUT_FILE = "bollywood_song_metadata.json"
    CRAWLER_SAVE_INTERVAL = 10   # checkpoint crawl state every 10 pages
    METADATA_SAVE_INTERVAL = 50  # checkpoint metadata every 50 extractions

    print("Starting Pagalgana Web Scraper and Metadata Extractor.")
    print("\n## Phase 1: Discovering Song Page URLs ##")
    discovered_song_urls, images = crawl_pagalgana_site(
        base_url=BASE_URL,
        song_pages_json_file=SONG_PAGES_FILE,
        state_filename=CRAWL_STATE_FILE,
        max_crawl_depth=MAX_CRAWL_DEPTH,
        save_interval=CRAWLER_SAVE_INTERVAL,
        images=images,
    )
    print(f"\nPhase 1 Complete. Found {len(discovered_song_urls)} unique song page URLs.")
    print("\n## Phase 2: Extracting Metadata from Song Pages ##")

    # load_crawl_state is reused here only to read previously saved metadata;
    # the dummy state filename keeps it away from the real crawl state file.
    _, _, _, existing_metadata = load_crawl_state(
        state_filename="dummy_state_for_metadata_load.json",
        song_pages_json_file=SONG_PAGES_FILE,
        metadata_json_file=METADATA_OUTPUT_FILE,
    )

    # URLs whose metadata has already been extracted and saved.
    processed_metadata_urls = {
        entry.get("URL")
        for entry in existing_metadata
        if isinstance(entry, dict) and "URL" in entry
    }

    metadata_extracted_count = 0
    new_metadata_entries: List[Dict] = []
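
    # Walk the discovered URLs, extracting metadata only for unseen pages.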
    for url in discovered_song_urls:
        if url in processed_metadata_urls:
            print(f" Metadata for {url} already extracted. Skipping.")
            continue

        metadata = extract_song_metadata(url)
        new_metadata_entries.append(metadata)
        metadata_extracted_count += 1
        processed_metadata_urls.add(url)
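
        # Periodically flush combined results so an interrupted run loses at
        # most METADATA_SAVE_INTERVAL extractions.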
        if metadata_extracted_count % METADATA_SAVE_INTERVAL == 0:
            combined_metadata = existing_metadata + new_metadata_entries
            try:
                with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
                    json.dump(combined_metadata, f, indent=4, ensure_ascii=False)
                print(f" --- Saved {len(combined_metadata)} metadata entries to '{METADATA_OUTPUT_FILE}'. ---")
            except IOError as e:
                print(f" Error saving metadata periodically: {e}")

        time.sleep(0.5)  # throttle requests to the site

    # Final write of everything extracted during this run.
    final_metadata = existing_metadata + new_metadata_entries
    try:
        with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_metadata, f, indent=4, ensure_ascii=False)
    except IOError as e:
        print(f"Error saving final metadata to '{METADATA_OUTPUT_FILE}': {e}")

    print(f"\nPhase 2 Complete. Extracted metadata for {len(new_metadata_entries)} new song pages.")
    print(f"Total {len(final_metadata)} unique song metadata entries saved to '{METADATA_OUTPUT_FILE}'.")
    print("\nScraping process finished.")


if __name__ == "__main__":
    main()