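"""Downloads manga chapters from a mangadistrict.com "read-scan" url.

Given a chapter url, the script derives the manga's folder name and its base
chapter url, discovers the latest chapter number from the manga's main page,
and then downloads every chapter's images into per-chapter folders.
"""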
import os
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def get_main_folder_name(url):
    """Extracts the manga title segment that follows 'read-scan' in the url path."""
    path = urlparse(url).path.strip("/")  # Extract and clean the path
    parts = path.split("/")  # Split the path by "/"
    # Ensure "read-scan" exists and get the next part as the main folder
    if "read-scan" in parts:
        index = parts.index("read-scan")  # Find "read-scan" position
        if index + 1 < len(parts):  # Ensure there's a next part
            return parts[index + 1]  # Main folder name
    return None  # Return None if not found
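
# A quick sanity check of the path parsing (hypothetical title, same url
# structure as the one used under __main__):
#   get_main_folder_name("https://mangadistrict.com/read-scan/some-title/chapter-1")
#   -> "some-title"
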
def get_base_url(url):
    """Extracts the base url up to 'chapter-' from the given url."""
    base = url.split("chapter-")[0]  # Get everything before 'chapter-'
    return base + "chapter-"  # Ensure 'chapter-' is included at the end
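
# For example (hypothetical url):
#   get_base_url("https://example.com/read-scan/some-title/chapter-12")
#   -> "https://example.com/read-scan/some-title/chapter-"
# Note this assumes "chapter-" occurs only once in the url; a title that
# itself contains "chapter-" would be split at the wrong spot.
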
def get_image_links(url):
    """Fetches all image urls from a given chapter page."""
    response = requests.get(url, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch page {url}: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    images = soup.find_all("img", class_="wp-manga-chapter-img")
    # strip() guards against whitespace some page templates pad around src values
    return [img["src"].strip() for img in images if "src" in img.attrs]
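
# The "wp-manga-chapter-img" class matches the markup of the site this script
# targets; scraping a different reader site would likely need another selector.
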
def get_chapter_name(url):
    """Extracts and formats the chapter name from the url."""
    parts = url.rstrip("/").split("/")
    chapter_number = parts[-1].replace("chapter-", "")
    return f"chapter-{int(chapter_number):03d}"  # Format as chapter-001, chapter-002, etc.
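
# e.g. get_chapter_name("https://example.com/read-scan/some-title/chapter-7/")
# -> "chapter-007"; the zero-padding keeps chapter folders in natural order
# when sorted by name.
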
def download_images(image_links, folder_name, main_folder):
    """Downloads all images into the designated chapter folder."""
    chapter_folder = os.path.join(main_folder, folder_name)
    os.makedirs(chapter_folder, exist_ok=True)
    for idx, img_url in enumerate(image_links):
        try:
            # Send the same headers as the page requests; some hosts reject
            # image requests that lack a browser User-Agent.
            response = requests.get(img_url, headers=HEADERS, stream=True, timeout=30)
            if response.status_code == 200:
                img_path = os.path.join(chapter_folder, f"{idx + 1}.jpg")
                with open(img_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                print(f"Downloaded: {img_path}")
            else:
                print(f"Failed to download {img_url}: HTTP {response.status_code}")
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")
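
# Images are saved as 1.jpg, 2.jpg, ... in page order. The .jpg extension is
# assumed here regardless of the content type the server actually returns.
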
def get_latest_chapter(base_url):
    """Fetches the latest available chapter number from the manga's main page."""
    # Strip the trailing "/chapter-" to get the manga's main page
    response = requests.get(base_url[:-len("/chapter-")], headers=HEADERS, timeout=30)
    if response.status_code != 200:
        print("Failed to fetch the main manga page.")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a", href=True)
    chapter_numbers = []
    for link in links:
        if "chapter-" in link["href"]:
            try:
                chapter_num = int(link["href"].split("chapter-")[-1].rstrip("/"))
                chapter_numbers.append(chapter_num)
            except ValueError:
                continue  # Skip links whose chapter suffix is not a plain integer
    return max(chapter_numbers) if chapter_numbers else None
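
# The scan keeps the highest integer suffix found among all "chapter-" links
# on the page, so e.g. links ending in chapter-3/ and chapter-12/ yield 12.
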
def temp_main(chapter_url, main_folder, image_links=None):
    """Downloads an entire manga chapter.

    image_links can be supplied by the caller to avoid fetching the chapter
    page a second time; if omitted, the links are fetched here.
    """
    chapter_name = get_chapter_name(chapter_url)
    if image_links is None:
        image_links = get_image_links(chapter_url)
    if image_links:
        download_images(image_links, chapter_name, main_folder)
        print(f"All images saved in folder: {chapter_name}")
    else:
        print(f"No images found for {chapter_name}.")
def main(url):
    # Main folder to store all chapters
    main_folder = get_main_folder_name(url)
    if main_folder is None:
        print("Could not determine the main folder name from the url.")
        return None
    base_url = get_base_url(url)
    os.makedirs(main_folder, exist_ok=True)
    latest_chapter = get_latest_chapter(base_url)
    if latest_chapter is None:
        print("Could not determine the latest chapter.")
        return None
    # Track consecutive empty chapters
    empty_chapter_count = 0
    max_empty_chapters = 5  # Stop after 5 consecutive empty chapters
    for index in range(0, latest_chapter + 1):
        chapter_url = f"{base_url}{index}/"
        print(f"Downloading {chapter_url}...")
        image_links = get_image_links(chapter_url)
        if not image_links:
            print(f"No images found for chapter-{index}.")
            empty_chapter_count += 1
            if empty_chapter_count >= max_empty_chapters:
                print(f"No images found in the last {max_empty_chapters} chapters. Stopping download.")
                break
        else:
            empty_chapter_count = 0  # Reset if a chapter contains images
            # Reuse the links fetched above instead of requesting the page again
            temp_main(chapter_url, main_folder, image_links)
            print(f"Chapter {index} downloaded!")
    return main_folder
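
# Stopping only after several consecutive empty chapters tolerates gaps in the
# numbering (a missing chapter-0, say) without aborting the whole run.
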
if __name__ == "__main__":
    url = "https://mangadistrict.com/read-scan/boarding-diary-uncensored-fan-edition/v1-high-quality-optimized-for-moderate-data-usage/chapter-1"
    main(url)
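
# To reuse the downloader from another module (hypothetical file name and url;
# adjust both to your setup):
#
#   from manga_downloader import main  # assuming this file is manga_downloader.py
#   folder = main("https://mangadistrict.com/read-scan/some-title/chapter-1")
#   print(f"Chapters saved under {folder}/")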