# selenium-screenshot-gradio / main_script.py
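"""Run the Pagalgana scraper end to end.

Phase 1 crawls the site to discover song page URLs (crawler.py); Phase 2
extracts metadata from each discovered page (metadata_extractor.py) and writes
it to a JSON file. Both phases persist progress so an interrupted run can resume.
"""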
import json
import os
import time
from typing import List, Dict
# Import functions from your separate files
from crawler import crawl_pagalgana_site, load_crawl_state, save_crawl_state
from metadata_extractor import extract_song_metadata
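
# Sketch of the assumed interfaces of the imported helpers, inferred only from
# how they are called in this script (not authoritative):
#   crawl_pagalgana_site(base_url, song_pages_json_file, state_filename,
#                        max_crawl_depth, save_interval, images)
#       -> (discovered song page URLs, screenshot images); the screenshots are
#          presumably consumed by the Gradio app elsewhere in this Space
#   load_crawl_state(state_filename, song_pages_json_file, metadata_json_file)
#       -> 4-tuple whose last element is the previously saved metadata list
#   extract_song_metadata(url) -> dict of metadata fields for one song page
#   save_crawl_state is imported for completeness but not called in this file.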


def main():
    images = []  # screenshot images collected by the crawler (see Phase 1 below)

    # --- Configuration ---
    BASE_URL = "https://pagalgana.com/12-baje-le-chalau-blender-2025-raushan-rohi-bhojp-uuw.html"
    MAX_CRAWL_DEPTH = 10  # Adjust this for how deep you want to crawl
    CRAWL_STATE_FILE = "bollywood_crawl_state.json"
    SONG_PAGES_FILE = "bollywood_song_pages.json"  # Output from the crawler
    METADATA_OUTPUT_FILE = "bollywood_song_metadata.json"  # Final output with detailed metadata
    CRAWLER_SAVE_INTERVAL = 10  # Save crawler state every X pages
    METADATA_SAVE_INTERVAL = 50  # Save metadata every X songs extracted
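
    # Note: BASE_URL points at a single song page; the crawler presumably
    # expands outward from it, following links up to MAX_CRAWL_DEPTH levels deep.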
print("Starting Pagalgana Web Scraper and Metadata Extractor.")
# --- Phase 1: Crawl the site to find song page URLs ---
print("\n## Phase 1: Discovering Song Page URLs ##")
# This function will handle loading/saving its own state
discovered_song_urls,images = crawl_pagalgana_site(
base_url=BASE_URL,
song_pages_json_file=SONG_PAGES_FILE,
state_filename=CRAWL_STATE_FILE,
max_crawl_depth=MAX_CRAWL_DEPTH,
save_interval=CRAWLER_SAVE_INTERVAL,
images=images
)
print(f"\nPhase 1 Complete. Found {len(discovered_song_urls)} unique song page URLs.")

    # --- Phase 2: Extract metadata from discovered song URLs ---
    print("\n## Phase 2: Extracting Metadata from Song Pages ##")
    # Load previously extracted metadata so this phase can resume where it left
    # off. A dummy state filename is passed because only the metadata list is
    # needed here.
    _, _, _, existing_metadata = load_crawl_state(
        state_filename="dummy_state_for_metadata_load.json",  # not used by the crawler
        song_pages_json_file=SONG_PAGES_FILE,  # loaded by the crawler
        metadata_json_file=METADATA_OUTPUT_FILE,  # the file we actually want to load here
    )

    # Build a set of URLs for which metadata has already been extracted.
    processed_metadata_urls = {
        entry.get("URL")
        for entry in existing_metadata
        if isinstance(entry, dict) and "URL" in entry
    }
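    # Each entry is assumed to be a dict containing at least a "URL" key,
    # e.g. {"URL": "https://pagalgana.com/...", ...}; the remaining fields
    # depend on what extract_song_metadata returns.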

    metadata_extracted_count = 0
    new_metadata_entries: List[Dict] = []  # new entries collected during this run

    # Iterate through each discovered song URL.
    for url in discovered_song_urls:
        if url in processed_metadata_urls:
            print(f" Metadata for {url} already extracted. Skipping.")
            continue

        metadata = extract_song_metadata(url)
        new_metadata_entries.append(metadata)
        metadata_extracted_count += 1
        # Track the URL so it is not processed twice within this run.
        processed_metadata_urls.add(url)

        # Save metadata periodically.
        if metadata_extracted_count % METADATA_SAVE_INTERVAL == 0:
            # Combine existing and new metadata for the periodic save.
            combined_metadata = existing_metadata + new_metadata_entries
            try:
                with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
                    json.dump(combined_metadata, f, indent=4, ensure_ascii=False)
                print(f" --- Saved {len(combined_metadata)} metadata entries to '{METADATA_OUTPUT_FILE}'. ---")
            except IOError as e:
                print(f" Error saving metadata periodically: {e}")

        time.sleep(0.5)  # Be kind to the server: small delay between fetches
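
    # Note: extract_song_metadata() is called without a try/except above, so a
    # single failing page would stop the run before the final save below; the
    # periodic saves bound how much extracted metadata could be lost in that case.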

    # Final save of all metadata.
    final_metadata = existing_metadata + new_metadata_entries
    try:
        with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_metadata, f, indent=4, ensure_ascii=False)
    except IOError as e:
        print(f"Error saving final metadata to '{METADATA_OUTPUT_FILE}': {e}")

    print(f"\nPhase 2 Complete. Extracted metadata for {len(new_metadata_entries)} new song pages.")
    print(f"Total {len(final_metadata)} unique song metadata entries saved to '{METADATA_OUTPUT_FILE}'.")
    print("\nScraping process finished.")


if __name__ == "__main__":
    main()