import json
import time
from typing import List, Dict

# Import functions from your separate files
from crawler import crawl_pagalgana_site, load_crawl_state, save_crawl_state
from metadata_extractor import extract_song_metadata


def main():
    images = []  # Collected image URLs; passed into and returned by the crawler

    # --- Configuration ---
    BASE_URL = "https://pagalgana.com/12-baje-le-chalau-blender-2025-raushan-rohi-bhojp-uuw.html"
    MAX_CRAWL_DEPTH = 10  # Adjust this for how deep you want to crawl
    CRAWL_STATE_FILE = "bollywood_crawl_state.json"
    SONG_PAGES_FILE = "bollywood_song_pages.json"  # Output from crawler
    METADATA_OUTPUT_FILE = "bollywood_song_metadata.json"  # Final output with detailed metadata
    CRAWLER_SAVE_INTERVAL = 10  # Save crawler state every X pages
    METADATA_SAVE_INTERVAL = 50  # Save metadata periodically every X songs extracted

    print("Starting Pagalgana Web Scraper and Metadata Extractor.")

    # --- Phase 1: Crawl the site to find song page URLs ---
    print("\n## Phase 1: Discovering Song Page URLs ##")

    # This function handles loading/saving its own state
    discovered_song_urls, images = crawl_pagalgana_site(
        base_url=BASE_URL,
        song_pages_json_file=SONG_PAGES_FILE,
        state_filename=CRAWL_STATE_FILE,
        max_crawl_depth=MAX_CRAWL_DEPTH,
        save_interval=CRAWLER_SAVE_INTERVAL,
        images=images,
    )

    print(f"\nPhase 1 Complete. Found {len(discovered_song_urls)} unique song page URLs.")

    # --- Phase 2: Extract metadata from discovered song URLs ---
    print("\n## Phase 2: Extracting Metadata from Song Pages ##")

    # Load previously extracted metadata so this phase can resume.
    # A dummy state filename is passed because we only need the metadata list.
    _, _, _, existing_metadata = load_crawl_state(
        state_filename="dummy_state_for_metadata_load.json",  # This specific file won't be used by the crawler
        song_pages_json_file=SONG_PAGES_FILE,  # This is loaded by the crawler
        metadata_json_file=METADATA_OUTPUT_FILE,  # This is the file we care about loading here
    )

    # Create a set of URLs for which we already have metadata
    processed_metadata_urls = {
        entry.get("URL")
        for entry in existing_metadata
        if isinstance(entry, dict) and "URL" in entry
    }

    metadata_extracted_count = 0
    new_metadata_entries: List[Dict] = []  # To store new entries from this run

    # Iterate through each discovered song URL
    for url in discovered_song_urls:
        if url in processed_metadata_urls:
            print(f" Metadata for {url} already extracted. Skipping.")
            continue

        metadata = extract_song_metadata(url)
        new_metadata_entries.append(metadata)
        metadata_extracted_count += 1
        # Add the URL to our tracking set to avoid duplicates in this run
        processed_metadata_urls.add(url)

        # Save metadata periodically
        if metadata_extracted_count % METADATA_SAVE_INTERVAL == 0:
            # Combine existing and new metadata for the periodic save
            combined_metadata = existing_metadata + new_metadata_entries
            try:
                with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
                    json.dump(combined_metadata, f, indent=4, ensure_ascii=False)
                print(f" --- Saved {len(combined_metadata)} metadata entries to '{METADATA_OUTPUT_FILE}'. ---")
            except IOError as e:
                print(f" Error saving metadata periodically: {e}")

        time.sleep(0.5)  # Be kind to the server: small delay between fetches

    # Final save of all metadata
    final_metadata = existing_metadata + new_metadata_entries
    try:
        with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_metadata, f, indent=4, ensure_ascii=False)
    except IOError as e:
        print(f"Error saving final metadata to '{METADATA_OUTPUT_FILE}': {e}")

    print(f"\nPhase 2 Complete. Extracted metadata for {len(new_metadata_entries)} new song pages.")
    print(f"Total {len(final_metadata)} unique song metadata entries saved to '{METADATA_OUTPUT_FILE}'.")
    print("\nScraping process finished.")


if __name__ == "__main__":
    main()
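

# ---------------------------------------------------------------------------
# Illustrative only: a minimal sketch of what extract_song_metadata() in
# metadata_extractor.py *might* look like. This is not the author's
# implementation; the requests + BeautifulSoup approach and the field names
# below are assumptions, and real song/artist/album fields would need
# site-specific selectors for pagalgana.com pages. The only contract main()
# actually relies on is that each entry is a dict with a "URL" key, which
# drives the resume/de-duplication logic above.

import requests
from bs4 import BeautifulSoup


def extract_song_metadata_sketch(url: str) -> Dict:
    """Hypothetical stand-in for metadata_extractor.extract_song_metadata."""
    entry: Dict = {"URL": url}  # "URL" is the key main() de-duplicates on
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        # Record the failure in the entry instead of crashing the whole run
        entry["Error"] = str(e)
        return entry

    soup = BeautifulSoup(resp.text, "html.parser")
    # The <title> tag is the only field extractable generically; anything
    # richer depends on the page structure handled in metadata_extractor.py.
    if soup.title is not None:
        entry["Title"] = soup.title.get_text(strip=True)
    return entry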
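

# ---------------------------------------------------------------------------
# Also illustrative: main() assumes load_crawl_state() returns a 4-tuple whose
# last element is the previously saved metadata list. A plausible JSON-backed
# version is sketched below; the real state layout (visited set, frontier,
# etc.) lives in crawler.py, and the first three fields here are placeholders.


def load_crawl_state_sketch(
    state_filename: str,
    song_pages_json_file: str,
    metadata_json_file: str,
):
    """Hypothetical stand-in for crawler.load_crawl_state."""

    def _load_json_list(path: str) -> List:
        # Missing or corrupt files degrade to an empty list, so a fresh run
        # and a resumed run go through the same code path in main().
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return data if isinstance(data, list) else []
        except (IOError, json.JSONDecodeError):
            return []

    visited: List = []   # Placeholder: URLs already crawled (from state_filename)
    frontier: List = []  # Placeholder: URLs still queued for crawling
    song_pages = _load_json_list(song_pages_json_file)
    metadata = _load_json_list(metadata_json_file)
    return visited, frontier, song_pages, metadata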