"""Scrape song metadata (details table, thumbnail, audio URL) from a song page."""

import json
import re
from typing import Optional, Tuple

import requests
from bs4 import BeautifulSoup
from lxml import etree, html

# Browser-like User-Agent sent with every page request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)

# Matches an inline `new Audio("https://....mp3")` call and captures the URL.
_AUDIO_URL_RE = re.compile(r'new Audio\(["\'](https://[^"\']+\.mp3)["\']\)')


def fetch_html_tree_requests(
    url: str,
) -> Tuple[Optional[html.HtmlElement], Optional[str]]:
    """Fetch *url* with requests and return ``(lxml tree, raw HTML text)``.

    Returns ``(None, None)`` when the request fails, the server answers with
    a 4xx/5xx status, or the response body cannot be parsed by lxml (for
    example an empty document). The failure is reported with ``print``.
    """
    headers = {"User-Agent": _USER_AGENT}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} with requests: {e}")
        return None, None

    try:
        tree = html.fromstring(response.content)
    except etree.LxmlError as e:
        # lxml raises on an empty/unparseable body; the caller invokes this
        # function outside its own try block, so report failure here
        # instead of letting the exception escape.
        print(f"Error parsing HTML from {url}: {e}")
        return None, None

    return tree, response.text


def extract_tbody_html(
    tree: html.HtmlElement,
    xpath: str = "/html/body/div[3]/table/tbody",
) -> Optional[str]:
    """Return the first node matched by *xpath* serialized as an HTML string.

    Returns ``None`` when the XPath matches nothing.
    """
    result = tree.xpath(xpath)
    if not result:
        return None
    return html.tostring(result[0], encoding="unicode")


def _image_url(image) -> Optional[str]:
    """Reduce a JSON-LD ``image`` value (string, object or list) to one URL."""
    if isinstance(image, str):
        return image or None
    if isinstance(image, dict):
        # Object form: the URL is looked up under the "url" key.
        return _image_url(image.get("url"))
    if isinstance(image, list):
        for item in image:
            found = _image_url(item)
            if found:
                return found
    return None


def extract_thumbnail(tree: html.HtmlElement) -> Optional[str]:
    """Return the thumbnail URL found in the page's JSON-LD script tags.

    Every ``<script type="application/ld+json">`` is tried in document order;
    scripts that are not valid JSON are skipped. Both a single top-level
    object and a top-level array of objects are inspected. The first
    ``image`` value that resolves to a non-empty URL string wins.

    Returns ``None`` when no script yields an image URL.
    """
    scripts = tree.xpath("//script[@type='application/ld+json']/text()")
    for script in scripts:
        try:
            payload = json.loads(script.strip())
        except json.JSONDecodeError:
            continue

        nodes = payload if isinstance(payload, list) else [payload]
        for node in nodes:
            if not isinstance(node, dict):
                continue
            url = _image_url(node.get("image"))
            if url:
                return url
    return None


def extract_audio_url(html_text: str) -> Optional[str]:
    """Return the MP3 URL passed to ``new Audio(...)`` in the raw HTML.

    Returns ``None`` when *html_text* is empty/``None`` or no match is found.
    """
    if not html_text:
        return None
    match = _AUDIO_URL_RE.search(html_text)
    return match.group(1) if match else None


def tbody_to_json(html_tbody: str) -> dict:
    """Convert the details table body into a ``{label: value}`` dictionary.

    Only ``<tr class="tr">`` rows with at least two cells are used. The first
    cell (minus a trailing colon) is the key and the second cell's text is
    the value. A ``"Rating"`` row containing ``<span>`` elements is expanded
    into a dict with the star string, the scale (5) and a numeric value.

    Returns an empty dict when *html_tbody* is empty or ``None``.
    """
    if not html_tbody:
        return {}

    soup = BeautifulSoup(html_tbody, "html.parser")
    data = {}

    for tr in soup.find_all("tr", class_="tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        key = tds[0].get_text(strip=True).rstrip(":")
        value_cell = tds[1]

        if key == "Rating":
            stars = value_cell.find_all("span")
            if stars:
                stars_str = "".join(star.get_text(strip=True) for star in stars)
                data[key] = {
                    "stars": stars_str,
                    "out_of": 5,
                    # NOTE(review): "☆" is weighted as half a star here; it
                    # usually denotes an empty star - confirm against the
                    # site's markup.
                    "value": stars_str.count("★") + 0.5 * stars_str.count("☆"),
                }
                continue
            # A Rating row without spans falls through to plain text below.

        data[key] = value_cell.get_text(" ", strip=True)

    return data


def extract_song_metadata(url: str) -> dict:
    """Fetch a song page and collect all metadata this module can extract.

    The result always contains ``"URL"``. On a failed fetch it additionally
    contains only ``"error"``. Otherwise it holds the table fields (or
    ``"tbody_data_present": False`` when the table is missing),
    ``"Thumbnail"`` when one is found, and ``"Play Online"`` (the audio URL
    or ``None``). Any exception raised while extracting is recorded under
    ``"error_extracting_metadata"`` rather than propagated.
    """
    print(f" Attempting to extract metadata from: {url}")
    tree, html_text = fetch_html_tree_requests(url)

    if tree is None:
        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}

    metadata = {"URL": url}

    # Best-effort boundary: one malformed page must not abort the caller, so
    # every extraction error is captured in the result instead of raised.
    try:
        tbody_html = extract_tbody_html(tree)
        if tbody_html:
            metadata.update(tbody_to_json(tbody_html))
        else:
            metadata["tbody_data_present"] = False

        thumbnail_url = extract_thumbnail(tree)
        if thumbnail_url:
            metadata["Thumbnail"] = thumbnail_url

        # The key is always present; its value is None when no audio is found.
        metadata["Play Online"] = extract_audio_url(html_text)
    except Exception as e:
        metadata["error_extracting_metadata"] = str(e)
        print(f" Error extracting metadata for {url}: {e}")

    return metadata


# This __name__ block is for testing `metadata_extractor.py` independently
if __name__ == "__main__":
    # Example usage for standalone testing
    test_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
    metadata = extract_song_metadata(test_url)
    print(json.dumps(metadata, indent=4, ensure_ascii=False))