import json
import time
from typing import List, Dict

# Import functions from your separate files
from crawler import crawl_pagalgana_site, load_crawl_state, save_crawl_state
from metadata_extractor import extract_song_metadata
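
# The helper modules above are assumed to expose the following interfaces,
# inferred purely from the call sites in main() below (treat this as a sketch
# of the expected contract, not verified signatures):
#   crawl_pagalgana_site(base_url, song_pages_json_file, state_filename,
#                        max_crawl_depth, save_interval, images)
#       -> (song_page_urls, images)
#   load_crawl_state(state_filename, song_pages_json_file, metadata_json_file)
#       -> 4-tuple whose last element is the previously saved metadata list
#   extract_song_metadata(url) -> dict containing at least a "URL" key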


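def save_metadata(entries: List[Dict], output_file: str) -> None:
    """Write a metadata list to disk as pretty-printed JSON.

    A small helper (not part of the original modules) factored out so the
    periodic and final saves in main() share one code path; ensure_ascii=False
    keeps non-Latin song titles readable in the output file.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(entries, f, indent=4, ensure_ascii=False)
        print(f"  --- Saved {len(entries)} metadata entries to '{output_file}'. ---")
    except IOError as e:
        print(f"  Error saving metadata to '{output_file}': {e}")

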
def main():
    images = []  # Filled in by the crawler with discovered image info; unused after Phase 1
    # --- Configuration ---
    # Seed page the crawl starts from; the crawler follows links outward from here
    BASE_URL = "https://pagalgana.com/12-baje-le-chalau-blender-2025-raushan-rohi-bhojp-uuw.html"
    MAX_CRAWL_DEPTH = 10  # Adjust this for how deep you want to crawl
    CRAWL_STATE_FILE = "bollywood_crawl_state.json"
    SONG_PAGES_FILE = "bollywood_song_pages.json"  # Output from crawler
    METADATA_OUTPUT_FILE = "bollywood_song_metadata.json"  # Final output with detailed metadata
    CRAWLER_SAVE_INTERVAL = 10  # Save crawler state every X pages
    METADATA_SAVE_INTERVAL = 50  # Save metadata periodically every X songs extracted

    print("Starting Pagalgana Web Scraper and Metadata Extractor.")

    # --- Phase 1: Crawl the site to find song page URLs ---
    print("\n## Phase 1: Discovering Song Page URLs ##")
    # This function will handle loading/saving its own state
    discovered_song_urls, images = crawl_pagalgana_site(
        base_url=BASE_URL,
        song_pages_json_file=SONG_PAGES_FILE,
        state_filename=CRAWL_STATE_FILE,
        max_crawl_depth=MAX_CRAWL_DEPTH,
        save_interval=CRAWLER_SAVE_INTERVAL,
        images=images
    )
    print(f"\nPhase 1 Complete. Found {len(discovered_song_urls)} unique song page URLs.")

    # --- Phase 2: Extract metadata from discovered song URLs ---
    print("\n## Phase 2: Extracting Metadata from Song Pages ##")

    # Load previously extracted metadata so this phase can resume after an
    # interruption. Only the metadata list from the returned tuple is needed,
    # so a throwaway state filename is passed in.
    _, _, _, existing_metadata = load_crawl_state(
        state_filename="dummy_state_for_metadata_load.json",  # placeholder; not used by the crawler
        song_pages_json_file=SONG_PAGES_FILE,
        metadata_json_file=METADATA_OUTPUT_FILE  # the file this phase actually resumes from
    )

    # Create a set of URLs for which we already have metadata
    processed_metadata_urls = {entry.get("URL") for entry in existing_metadata if
                               isinstance(entry, dict) and "URL" in entry}

    metadata_extracted_count = 0
    new_metadata_entries: List[Dict] = []  # To store new entries from this run

    # Iterate through each discovered song URL
    for url in discovered_song_urls:
        if url in processed_metadata_urls:
            print(f"  Metadata for {url} already extracted. Skipping.")
            continue

        metadata = extract_song_metadata(url)
        if not metadata:
            # Skip failed extractions (assumes a falsy return value on failure)
            continue
        new_metadata_entries.append(metadata)
        metadata_extracted_count += 1

        # Add the URL to our tracking set to avoid duplicates in this run
        processed_metadata_urls.add(url)

        # Save periodically so progress survives a crash or interruption
        if metadata_extracted_count % METADATA_SAVE_INTERVAL == 0:
            save_metadata(existing_metadata + new_metadata_entries, METADATA_OUTPUT_FILE)

        time.sleep(0.5)  # Small delay between fetches to be kind to the server

    # Final save of everything extracted so far (previous runs plus this one)
    final_metadata = existing_metadata + new_metadata_entries
    save_metadata(final_metadata, METADATA_OUTPUT_FILE)

    print(f"\nPhase 2 Complete. Extracted metadata for {len(new_metadata_entries)} new song pages.")
    print(f"Total {len(final_metadata)} unique song metadata entries saved to '{METADATA_OUTPUT_FILE}'.")
    print("\nScraping process finished.")


if __name__ == "__main__":
    main()