import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def get_main_folder_name(url):
    """Returns the main folder name: the path segment that follows 'read-scan' in the url."""
    path = urlparse(url).path.strip("/")  # Extract and clean the path
    parts = path.split("/")  # Split the path by "/"
    # Ensure "read-scan" exists and take the next part as the main folder
    if "read-scan" in parts:
        index = parts.index("read-scan")  # Find "read-scan" position
        if index + 1 < len(parts):  # Ensure there's a next part
            return parts[index + 1]  # Main folder name
    return None  # Return None if not found

def get_base_url(url):
    """Extracts the base url up to 'chapter-' from the given url."""
    base = url.split("chapter-")[0]  # Get everything before 'chapter-'
    return base + "chapter-"  # Ensure 'chapter-' is included at the end

def get_image_links(url):
    """Fetches all image urls from a given chapter page."""
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch page {url}: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    images = soup.find_all("img", class_="wp-manga-chapter-img")
    return [img["src"] for img in images if "src" in img.attrs]

def get_chapter_name(url):
    """Extracts and formats the chapter name from the url."""
    parts = url.rstrip("/").split("/")
    chapter_number = parts[-1].replace("chapter-", "")
    return f"chapter-{int(chapter_number):03d}"  # Format as chapter-001, chapter-002, etc.

def download_images(image_links, folder_name, main_folder):
    """Downloads all images into the designated chapter folder."""
    chapter_folder = os.path.join(main_folder, folder_name)
    os.makedirs(chapter_folder, exist_ok=True)
    for idx, img_url in enumerate(image_links):
        try:
            # Use the same browser-like headers as the page requests
            response = requests.get(img_url, headers=HEADERS, stream=True)
            if response.status_code == 200:
                img_path = os.path.join(chapter_folder, f"{idx + 1}.jpg")
                with open(img_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                print(f"Downloaded: {img_path}")
            else:
                print(f"Failed to download {img_url}: HTTP {response.status_code}")
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")

def get_latest_chapter(base_url):
    """Fetches the latest available chapter number from the manga's main page."""
    main_page_url = base_url[:-len("/chapter-")]  # Drop the trailing "/chapter-" to get the manga's main page
    response = requests.get(main_page_url, headers=HEADERS)
    if response.status_code != 200:
        print("Failed to fetch the main manga page.")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a", href=True)
    chapter_numbers = []
    for link in links:
        if "chapter-" in link["href"]:
            try:
                chapter_num = int(link["href"].split("chapter-")[-1].rstrip("/"))
                chapter_numbers.append(chapter_num)
            except ValueError:
                continue  # Skip links whose chapter suffix is not a plain integer
    return max(chapter_numbers) if chapter_numbers else None

def temp_main(chapter_url, main_folder):
    """Downloads an entire manga chapter."""
    chapter_name = get_chapter_name(chapter_url)
    image_links = get_image_links(chapter_url)
    if image_links:
        download_images(image_links, chapter_name, main_folder)
        print(f"All images saved in folder: {chapter_name}")
    else:
        print(f"No images found for {chapter_name}.")

def main(url):
    # Main folder to store all chapters
    main_folder = get_main_folder_name(url)
    if main_folder is None:
        print("Could not determine the main folder name from the url.")
        return None
    base_url = get_base_url(url)
    os.makedirs(main_folder, exist_ok=True)
    latest_chapter = get_latest_chapter(base_url)
    if latest_chapter is None:
        print("Could not determine the latest chapter.")
        return None
    # Track consecutive empty chapters
    empty_chapter_count = 0
    max_empty_chapters = 5  # Stop after 5 consecutive empty chapters

    # for index in range(0, 10):  # Uncomment to test on the first few chapters only
    for index in range(0, latest_chapter + 1):
        chapter_url = f"{base_url}{index}/"
        print(f"Downloading {chapter_url}...")
        image_links = get_image_links(chapter_url)
        if not image_links:
            print(f"No images found for chapter-{index}.")
            empty_chapter_count += 1
            if empty_chapter_count >= max_empty_chapters:
                print("No images found in the last 5 chapters. Stopping download.")
                break
        else:
            empty_chapter_count = 0  # Reset if a chapter contains images
            temp_main(chapter_url, main_folder)
            print(f"Chapter {index} downloaded!")
    return main_folder

if __name__ == "__main__":
    url = "https://mangadistrict.com/read-scan/boarding-diary-uncensored-fan-edition/v1-high-quality-optimized-for-moderate-data-usage/chapter-1"
    # url = "hi"
    main(url)
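    # Optional sketch, not part of the original script: a url passed on the
    # command line could be used instead of the hard-coded one above, assuming
    # the file is saved as downloader.py, e.g.
    #   python downloader.py "https://mangadistrict.com/read-scan/<title>/<edition>/chapter-1"
    # The sys.argv handling below is illustrative only.
    # import sys
    # if len(sys.argv) > 1:
    #     main(sys.argv[1])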