# Source: Hugging Face Space amit0987/selenium-screenshot-gradio (status: Running),
# revision 5ca8483, file size 4,027 bytes.

import requests
from lxml import html
from bs4 import BeautifulSoup
import json
import re

def fetch_html_tree_requests(url: str, timeout: float = 10) -> tuple:
    """Download a page with requests and parse it into an lxml tree.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Defaults to 10, the previously hard-coded value.

    Returns:
        A ``(tree, text)`` pair where ``tree`` is the result of
        ``lxml.html.fromstring`` on the response bytes and ``text`` is
        ``response.text``. Returns ``(None, None)`` when the request fails,
        the server answers with a 4xx/5xx status, or the body cannot be
        parsed (for example because it is empty).
    """
    # Imported locally so the block is self-contained; lxml is already a
    # dependency of this module.
    from lxml.etree import ParserError

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} with requests: {e}")
        return None, None

    try:
        # lxml raises ParserError ("Document is empty") for an empty or
        # whitespace-only body; treat that like any other fetch failure
        # instead of letting it propagate to callers.
        tree = html.fromstring(response.content)
    except ParserError as e:
        print(f"Error parsing HTML from {url}: {e}")
        return None, None

    return tree, response.text

def extract_tbody_html(tree: html.HtmlElement, xpath: str = "/html/body/div[3]/table/tbody") -> str:
    """Serialize the first node matched by ``xpath`` to an HTML string.

    Args:
        tree: Parsed lxml document to query.
        xpath: XPath expression to evaluate; defaults to an absolute path
            to a ``tbody`` element.

    Returns:
        The matched node rendered with ``html.tostring(..., encoding='unicode')``,
        or ``None`` when the expression matches nothing.
    """
    # NOTE(review): the default path requires a literal <tbody> in the served
    # markup; browsers add one implicitly but lxml does not -- confirm the
    # target pages actually contain it.
    matches = tree.xpath(xpath)
    if matches:
        first_match = matches[0]
        return html.tostring(first_match, encoding='unicode')
    return None

def _image_to_url(image):
    """Reduce a JSON-LD ``image`` value (str, dict or list) to one URL string."""
    if isinstance(image, str):
        return image or None
    if isinstance(image, dict):
        # ImageObject form: the address lives under "url" (or "contentUrl").
        return _image_to_url(image.get("url") or image.get("contentUrl"))
    if isinstance(image, list):
        for item in image:
            url = _image_to_url(item)
            if url:
                return url
    return None


def _find_image_url(node):
    """Search a decoded JSON-LD value (object, array or @graph) for an image URL."""
    if isinstance(node, list):
        for item in node:
            url = _find_image_url(item)
            if url:
                return url
        return None
    if isinstance(node, dict):
        url = _image_to_url(node.get("image"))
        if url:
            return url
        # Several entities may be bundled under "@graph"; look inside it too.
        return _find_image_url(node.get("@graph"))
    return None


def extract_thumbnail(tree: "html.HtmlElement") -> str:
    """Extract the thumbnail URL from JSON-LD script tags.

    Every ``<script type="application/ld+json">`` text node is decoded in
    document order; blocks that are not valid JSON are skipped. The first
    usable ``image`` value wins. Besides a plain string, ``image`` may be a
    list or an object with a ``url``/``contentUrl`` key, and the JSON-LD
    payload itself may be a top-level array or wrap its entities in
    ``@graph``.

    Args:
        tree: Parsed document; only its ``xpath`` method is used.

    Returns:
        The image URL as a string, or ``None`` when no script provides one.
    """
    scripts = tree.xpath("//script[@type='application/ld+json']/text()")
    for script in scripts:
        try:
            json_data = json.loads(script.strip())
        except json.JSONDecodeError:
            # Malformed JSON-LD is common in the wild; try the next block.
            continue
        url = _find_image_url(json_data)
        if url:
            return url
    return None

def extract_audio_url(html_text: str) -> str:
    """Find the MP3 URL passed to a ``new Audio(...)`` call in raw HTML.

    Args:
        html_text: Page source to search.

    Returns:
        The first quoted ``https://...mp3`` argument of ``new Audio(...)``,
        or ``None`` when the text contains no such call.
    """
    audio_call = re.compile(r'new Audio\(["\'](https://[^"\']+\.mp3)["\']\)')
    found = audio_call.search(html_text)
    if found is None:
        return None
    return found.group(1)

def tbody_to_json(html_tbody: str) -> dict:
    """Turn the rows of a ``tbody`` HTML fragment into a key/value dictionary.

    Only ``<tr class="tr">`` rows with at least two ``<td>`` cells are used.
    The first cell's text (minus trailing colons) becomes the key and the
    second cell's text the value. A row whose key is ``"Rating"`` is stored
    as a dict with ``stars``, ``out_of`` and ``value`` entries built from the
    cell's ``<span>`` elements; such a row is dropped if it has no spans.

    Args:
        html_tbody: HTML fragment to parse; a falsy value yields ``{}``.

    Returns:
        Mapping of row labels to their extracted values.
    """
    if not html_tbody:
        return {}

    parsed = {}
    rows = BeautifulSoup(html_tbody, "html.parser").find_all("tr", class_="tr")

    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        label_cell, content_cell = cells[0], cells[1]
        label = label_cell.get_text(strip=True).rstrip(":")

        if label != "Rating":
            parsed[label] = content_cell.get_text(" ", strip=True)
            continue

        star_spans = content_cell.find_all("span")
        if not star_spans:
            continue

        glyphs = ''.join(span.get_text(strip=True) for span in star_spans)
        # NOTE(review): "☆" is weighted as half a star here -- confirm that
        # this matches how the site renders ratings.
        filled = glyphs.count("★")
        halves = glyphs.count("☆")
        parsed[label] = {
            "stars": glyphs,
            "out_of": 5,
            "value": filled + 0.5 * halves,
        }

    return parsed

def extract_song_metadata(url: str) -> dict:
    """Fetch a song page and collect its metadata into one dictionary.

    Args:
        url: Address of the song page.

    Returns:
        A dict that always contains ``"URL"``. On a failed fetch it also
        carries an ``"error"`` message and nothing else. Otherwise it holds
        the table fields (or ``"tbody_data_present": False`` when no table
        body was found), ``"Thumbnail"`` when one was found, and
        ``"Play Online"`` (the audio URL or ``None``). If any extraction
        step raises, the message is stored under
        ``"error_extracting_metadata"`` and whatever was gathered so far is
        returned.
    """
    print(f"  Attempting to extract metadata from: {url}")
    tree, html_text = fetch_html_tree_requests(url)
    if tree is None:
        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}

    metadata = {"URL": url}

    # Best-effort boundary: one failing extractor must not crash the caller.
    try:
        table_markup = extract_tbody_html(tree)
        if not table_markup:
            metadata["tbody_data_present"] = False
        else:
            metadata.update(tbody_to_json(table_markup))

        thumb = extract_thumbnail(tree)
        if thumb:
            metadata["Thumbnail"] = thumb

        stream_url = extract_audio_url(html_text)
        # The key is always present; falsy results are normalised to None.
        metadata["Play Online"] = stream_url if stream_url else None

    except Exception as e:
        metadata["error_extracting_metadata"] = str(e)
        print(f"  Error extracting metadata for {url}: {e}")

    return metadata

# Standalone smoke test: running `metadata_extractor.py` directly scrapes one
# sample page and pretty-prints the result.
if __name__ == "__main__":
    sample_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    print(json.dumps(extract_song_metadata(sample_url), indent=4, ensure_ascii=False))