|
import json
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree
from lxml import html
|
|
|
def fetch_html_tree_requests(url: str) -> tuple:
    """Download ``url`` with requests and parse the body into an lxml tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(tree, html_text)`` pair: the root element produced by
        ``lxml.html.fromstring`` and the decoded response text. Returns
        ``(None, None)`` when the request fails, the server answers with an
        error status, or the body cannot be parsed as HTML.
    """
    # A desktop-browser User-Agent is sent instead of the requests default.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} with requests: {e}")
        return None, None

    try:
        # Parse the raw bytes rather than response.text so that lxml performs
        # its own charset detection.
        tree = html.fromstring(response.content)
    except etree.ParserError as e:
        # An empty or unparseable body is not a RequestException; without this
        # handler the error would propagate to callers that expect the
        # (None, None) failure signal.
        print(f"Error parsing HTML from {url}: {e}")
        return None, None

    return tree, response.text
|
|
|
def extract_tbody_html(tree: html.HtmlElement, xpath: str = "/html/body/div[3]/table/tbody") -> str:
    """Serialize the first node matched by ``xpath`` to an HTML string.

    Args:
        tree: Parsed document to query.
        xpath: XPath expression locating the table body.

    Returns:
        The matched node as a unicode HTML string, or ``None`` when the
        expression matches nothing.
    """
    # NOTE(review): the default path looks browser-copied; browsers may insert
    # a <tbody> that is absent from the served markup - confirm against the
    # raw page if this keeps returning None.
    matches = tree.xpath(xpath)
    return html.tostring(matches[0], encoding="unicode") if matches else None
|
|
|
def _jsonld_image_to_url(image):
    """Reduce a JSON-LD ``image`` value (string, object, or list) to one URL string or None."""
    if isinstance(image, str):
        return image or None
    if isinstance(image, dict):
        # ImageObject-style nodes keep the address under "url" / "contentUrl".
        for field in ("url", "contentUrl"):
            candidate = image.get(field)
            if isinstance(candidate, str) and candidate:
                return candidate
        return None
    if isinstance(image, list):
        for item in image:
            url = _jsonld_image_to_url(item)
            if url:
                return url
    return None


def _find_jsonld_image(node):
    """Return the first image URL in a decoded JSON-LD object, array, or ``@graph``; else None."""
    if isinstance(node, list):
        for item in node:
            url = _find_jsonld_image(item)
            if url:
                return url
        return None
    if not isinstance(node, dict):
        return None
    return _jsonld_image_to_url(node.get("image")) or _find_jsonld_image(node.get("@graph"))


def extract_thumbnail(tree: html.HtmlElement) -> str:
    """Extract the thumbnail URL from the page's JSON-LD script tags.

    Every ``<script type="application/ld+json">`` block is decoded in document
    order; blocks that are not valid JSON are skipped. The document may be a
    single object, an array of objects, or an object with an ``@graph`` list,
    and ``image`` may be a plain URL, an object with ``url``/``contentUrl``,
    or a list of either.

    Args:
        tree: Parsed document to search.

    Returns:
        The first image URL found as a string, or ``None`` when no script
        block yields one.
    """
    scripts = tree.xpath("//script[@type='application/ld+json']/text()")
    for script in scripts:
        try:
            json_data = json.loads(script.strip())
        except json.JSONDecodeError:
            # Malformed JSON-LD is common in the wild; try the next block.
            continue
        url = _find_jsonld_image(json_data)
        if url:
            return url
    return None
|
|
|
# Matches a JavaScript ``new Audio("https://....mp3")`` call with either quote
# style and optional whitespace inside the parentheses; group 1 is the URL.
_AUDIO_URL_RE = re.compile(r'new Audio\(\s*["\'](https://[^"\']+\.mp3)["\']\s*\)')


def extract_audio_url(html_text: str) -> str:
    """Extract the MP3 audio URL from raw HTML using a regex.

    Args:
        html_text: Raw page source. ``None`` or an empty string is accepted
            and treated as "nothing to search".

    Returns:
        The ``https://...mp3`` URL passed to the first ``new Audio(...)`` call
        found in the text, or ``None`` when there is no such call.
    """
    if not html_text:
        # Guard explicitly: re.search would raise TypeError on None.
        return None
    match = _AUDIO_URL_RE.search(html_text)
    return match.group(1) if match else None
|
|
|
def _rating_from_stars(stars_str):
    """Build the ``Rating`` entry from the concatenated star glyphs."""
    # Written as escapes so the glyphs survive any re-encoding of this file.
    # NOTE(review): both literals arrived mis-encoded. U+2605 is a confident
    # reconstruction; U+2606 is inferred - confirm which glyph the site uses
    # for the half-weighted star.
    full_star = "\u2605"  # BLACK STAR, counts as 1
    half_star = "\u2606"  # WHITE STAR, counts as 0.5
    return {
        "stars": stars_str,
        "out_of": 5,
        "value": stars_str.count(full_star) + 0.5 * stars_str.count(half_star),
    }


def tbody_to_json(html_tbody: str) -> dict:
    """Parse table-body HTML with BeautifulSoup and convert it to a dictionary.

    Each ``<tr class="tr">`` row with at least two ``<td>`` cells contributes
    one entry: the first cell's text (trailing ``:`` removed) is the key and
    the second cell's text is the value. A ``Rating`` row whose value cell
    contains ``<span>`` elements is stored as a dict with ``stars``,
    ``out_of`` and a numeric ``value``.

    Args:
        html_tbody: HTML markup of the table body; may be empty or ``None``.

    Returns:
        The extracted key/value pairs, or an empty dict for empty input.
    """
    if not html_tbody:
        return {}

    soup = BeautifulSoup(html_tbody, "html.parser")
    data = {}

    for tr in soup.find_all("tr", class_="tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        key = tds[0].get_text(strip=True).rstrip(":")
        value_cell = tds[1]

        if key == "Rating":
            stars = value_cell.find_all("span")
            if stars:
                stars_str = "".join(star.get_text(strip=True) for star in stars)
                data[key] = _rating_from_stars(stars_str)
                continue
            # A Rating row without spans falls through and is kept as text.

        data[key] = value_cell.get_text(" ", strip=True)

    return data
|
|
|
def extract_song_metadata(url: str) -> dict:
    """Fetch a song page and collect its metadata into one dictionary.

    Args:
        url: Address of the song page.

    Returns:
        A dict that always contains ``"URL"``. On a failed fetch it also has
        an ``"error"`` message and nothing else. Otherwise it holds the table
        fields (or ``"tbody_data_present": False`` when the table is missing),
        ``"Thumbnail"`` when one is found, and ``"Play Online"`` (the audio URL
        or ``None``). If any extraction step raises, the message is stored
        under ``"error_extracting_metadata"`` and the fields gathered so far
        are returned.
    """
    print(f" Attempting to extract metadata from: {url}")

    page_tree, raw_html = fetch_html_tree_requests(url)
    if page_tree is None:
        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}

    result = {"URL": url}

    try:
        table_markup = extract_tbody_html(page_tree)
        if not table_markup:
            result["tbody_data_present"] = False
        else:
            result.update(tbody_to_json(table_markup))

        thumb = extract_thumbnail(page_tree)
        if thumb:
            result["Thumbnail"] = thumb

        # The key is always written; a missing match is recorded as None.
        result["Play Online"] = extract_audio_url(raw_html) or None
    except Exception as exc:
        # Best-effort boundary: report the failure but keep what was gathered.
        result["error_extracting_metadata"] = str(exc)
        print(f" Error extracting metadata for {url}: {exc}")

    return result
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: scrape one song page and print the record as JSON.
    sample_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
    print(json.dumps(extract_song_metadata(sample_url), indent=4, ensure_ascii=False))