import requests
from lxml import html
from bs4 import BeautifulSoup
import json
import re
def fetch_html_tree_requests(url: str) -> tuple:
    """Fetch *url* with ``requests`` and parse the body with lxml.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(tree, html_text)`` pair: the parsed lxml tree built from the
        raw response bytes and the decoded response text. Returns
        ``(None, None)`` when the request fails (connection error, timeout,
        4xx/5xx status) or when the body cannot be parsed as HTML.
    """
    # Imported locally because the module's top-level imports only bring in
    # ``lxml.html``; ``ParserError`` is defined in ``lxml.etree``.
    from lxml import etree

    # A browser-like User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return html.fromstring(response.content), response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} with requests: {e}")
    except etree.ParserError as e:
        # lxml raises ParserError for an empty document; treat it like any
        # other failed fetch instead of letting it escape to the caller.
        print(f"Error parsing HTML from {url}: {e}")
    return None, None
def extract_tbody_html(tree: html.HtmlElement, xpath: str = "/html/body/div[3]/table/tbody") -> str:
    """Serialize the first node matched by *xpath* to an HTML string.

    Args:
        tree: Parsed lxml document to query.
        xpath: XPath expression locating the table body.

    Returns:
        The matched element rendered as a unicode HTML string, or ``None``
        when the expression matches nothing.
    """
    matches = tree.xpath(xpath)
    # Only the first match is used; any further matches are ignored.
    return html.tostring(matches[0], encoding="unicode") if matches else None
def extract_thumbnail(tree: html.HtmlElement) -> str:
    """Return the ``image`` value from the page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` block is scanned in
    document order. A block may hold a single object, an array of objects,
    or an object whose nodes are nested under ``"@graph"``; all three
    layouts are searched. Blocks that are not valid JSON are skipped.

    Args:
        tree: Parsed lxml document to search.

    Returns:
        The value stored under the first ``"image"`` key found, returned
        exactly as it appears in the JSON, or ``None`` when no block
        provides one.
    """
    for raw in tree.xpath("//script[@type='application/ld+json']/text()"):
        try:
            payload = json.loads(raw.strip())
        except json.JSONDecodeError:
            continue  # Malformed block: ignore it and keep scanning.

        # Normalise the top level to a list of candidate nodes.
        nodes = payload if isinstance(payload, list) else [payload]
        for node in nodes:
            if not isinstance(node, dict):
                continue
            if "image" in node:
                return node["image"]
            # Fall back to nodes grouped under "@graph", if present.
            graph = node.get("@graph")
            if isinstance(graph, list):
                for item in graph:
                    if isinstance(item, dict) and "image" in item:
                        return item["image"]
    return None
def extract_audio_url(html_text: str) -> str:
    """Find the MP3 URL passed to a ``new Audio(...)`` call in raw HTML.

    Args:
        html_text: Page source to search.

    Returns:
        The first quoted ``https://`` URL ending in ``.mp3`` that appears as
        the argument of ``new Audio(...)``, or ``None`` when there is none.
    """
    pattern = r'new Audio\(["\'](https://[^"\']+\.mp3)["\']\)'
    found = re.search(pattern, html_text)
    if found is None:
        return None
    return found.group(1)
def tbody_to_json(html_tbody: str) -> dict:
    """Turn the rows of a ``<tbody>`` fragment into a key/value dictionary.

    Only ``<tr class="tr">`` rows with at least two ``<td>`` cells are used.
    The first cell (minus any trailing colons) is the key; the second cell's
    text is the value. A ``"Rating"`` row whose value cell contains
    ``<span>`` elements is stored as a dict with the joined star glyphs
    (``"stars"``), the scale (``"out_of"``) and a numeric ``"value"``.

    Args:
        html_tbody: HTML markup of the table body; may be empty or ``None``.

    Returns:
        The extracted mapping, or an empty dict for empty input.
    """
    if not html_tbody:
        return {}

    parsed = {}
    rows = BeautifulSoup(html_tbody, "html.parser").find_all("tr", class_="tr")
    for row in rows:
        cells = row.find_all("td")
        if len(cells) >= 2:
            label = cells[0].get_text(strip=True).rstrip(":")
            content = cells[1]

            # Star spans are only looked up for the rating row.
            star_spans = content.find_all("span") if label == "Rating" else []
            if star_spans:
                glyphs = "".join(span.get_text(strip=True) for span in star_spans)
                parsed[label] = {
                    "stars": glyphs,
                    "out_of": 5,
                    # Each "★" counts as 1 and each "☆" as 0.5.
                    "value": glyphs.count("★") + 0.5 * glyphs.count("☆"),
                }
            else:
                # Every other row (and a rating row without spans) keeps
                # its plain text.
                parsed[label] = content.get_text(" ", strip=True)
    return parsed
def extract_song_metadata(url: str) -> dict:
    """Download a song page and collect its metadata into one dictionary.

    Args:
        url: Address of the song page.

    Returns:
        A dict that always contains ``"URL"``. On a failed fetch it also
        carries an ``"error"`` message and nothing else. Otherwise it holds
        the table fields (or ``"tbody_data_present": False`` when the table
        is missing), ``"Thumbnail"`` when one is found, and ``"Play Online"``
        (the audio URL or ``None``). If extraction raises, the message is
        stored under ``"error_extracting_metadata"`` and whatever was
        gathered up to that point is still returned.
    """
    print(f" Attempting to extract metadata from: {url}")

    tree, html_text = fetch_html_tree_requests(url)
    if tree is None:
        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}

    record = {"URL": url}
    try:
        table_markup = extract_tbody_html(tree)
        if not table_markup:
            record["tbody_data_present"] = False
        else:
            record.update(tbody_to_json(table_markup))

        image = extract_thumbnail(tree)
        if image:
            record["Thumbnail"] = image

        # The key is always written; a missing URL is recorded as None.
        record["Play Online"] = extract_audio_url(html_text) or None
    except Exception as exc:
        # Best-effort boundary: report the failure but keep partial results.
        record["error_extracting_metadata"] = str(exc)
        print(f" Error extracting metadata for {url}: {exc}")
    return record
# Standalone smoke test: runs only when this module is executed directly.
if __name__ == "__main__":
    # Fetch one sample page and pretty-print whatever was extracted.
    sample_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
    print(json.dumps(extract_song_metadata(sample_url), indent=4, ensure_ascii=False))