import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def get_main_folder_name(url):
    """Returns the main folder name: the path segment that follows 'read-scan' in the url."""
    path = urlparse(url).path.strip("/")  # Extract and clean the path
    parts = path.split("/")  # Split the path by "/"
    # Ensure "read-scan" exists and take the next part as the main folder
    if "read-scan" in parts:
        index = parts.index("read-scan")  # Find "read-scan" position
        if index + 1 < len(parts):  # Ensure there's a next part
            return parts[index + 1]  # Main folder name
    return None  # Return None if not found

def get_base_url(url):
    """Extracts the base url up to 'chapter-' from the given url."""
    base = url.split("chapter-")[0]  # Get everything before 'chapter-'
    return base + "chapter-"  # Ensure 'chapter-' is included at the end

def get_image_links(url):
    """Fetches all image urls from a given chapter page."""
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch page {url}: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    images = soup.find_all("img", class_="wp-manga-chapter-img")
    return [img["src"] for img in images if "src" in img.attrs]

def get_chapter_name(url):
    """Extracts and formats the chapter name from the url."""
    parts = url.rstrip("/").split("/")
    chapter_number = parts[-1].replace("chapter-", "")
    return f"chapter-{int(chapter_number):03d}"  # Format as chapter-001, chapter-002, etc.

def download_images(image_links, folder_name, main_folder):
    """Downloads all images into the designated chapter folder."""
    chapter_folder = os.path.join(main_folder, folder_name)
    os.makedirs(chapter_folder, exist_ok=True)
    for idx, img_url in enumerate(image_links):
        try:
            # Use the same browser-like headers as the page requests
            response = requests.get(img_url, headers=HEADERS, stream=True)
            if response.status_code == 200:
                img_path = os.path.join(chapter_folder, f"{idx + 1}.jpg")
                with open(img_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                print(f"Downloaded: {img_path}")
            else:
                print(f"Failed to download {img_url}: HTTP {response.status_code}")
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")

def get_latest_chapter(base_url):
    """Fetches the latest available chapter number from the manga's main page."""
    main_page_url = base_url[:-len("/chapter-")]  # Drop the trailing "/chapter-" to get the manga's main page
    response = requests.get(main_page_url, headers=HEADERS)
    if response.status_code != 200:
        print("Failed to fetch the main manga page.")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a", href=True)
    chapter_numbers = []
    for link in links:
        if "chapter-" in link["href"]:
            try:
                chapter_num = int(link["href"].split("chapter-")[-1].rstrip("/"))
                chapter_numbers.append(chapter_num)
            except ValueError:
                continue  # Skip links whose chapter suffix is not a plain integer
    return max(chapter_numbers) if chapter_numbers else None

def temp_main(chapter_url, main_folder):
    """Downloads an entire manga chapter."""
    chapter_name = get_chapter_name(chapter_url)
    image_links = get_image_links(chapter_url)
    if image_links:
        download_images(image_links, chapter_name, main_folder)
        print(f"All images saved in folder: {chapter_name}")
    else:
        print(f"No images found for {chapter_name}.")

def main(url):
    # Main folder to store all chapters
    main_folder = get_main_folder_name(url)
    if main_folder is None:
        print("Could not determine the main folder name from the url.")
        return None
    base_url = get_base_url(url)
    os.makedirs(main_folder, exist_ok=True)
    latest_chapter = get_latest_chapter(base_url)
    if latest_chapter is None:
        print("Could not determine the latest chapter.")
        return None
    # Track consecutive empty chapters
    empty_chapter_count = 0
    max_empty_chapters = 5  # Stop after 5 consecutive empty chapters

    # for index in range(0, 10):  # Uncomment to test on the first few chapters only
    for index in range(0, latest_chapter + 1):
        chapter_url = f"{base_url}{index}/"
        print(f"Downloading {chapter_url}...")
        image_links = get_image_links(chapter_url)
        if not image_links:
            print(f"No images found for chapter-{index}.")
            empty_chapter_count += 1
            if empty_chapter_count >= max_empty_chapters:
                print("No images found in the last 5 chapters. Stopping download.")
                break
        else:
            empty_chapter_count = 0  # Reset if a chapter contains images
            temp_main(chapter_url, main_folder)
            print(f"Chapter {index} downloaded!")
    return main_folder

if __name__ == "__main__":
    url = "https://mangadistrict.com/read-scan/boarding-diary-uncensored-fan-edition/v1-high-quality-optimized-for-moderate-data-usage/chapter-1"
    # url = "hi"
    main(url)
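    # Optional sketch, not part of the original script: a url passed on the
    # command line could be used instead of the hard-coded one above, assuming
    # the file is saved as downloader.py, e.g.
    #   python downloader.py "https://mangadistrict.com/read-scan/<title>/<edition>/chapter-1"
    # The sys.argv handling below is illustrative only.
    # import sys
    # if len(sys.argv) > 1:
    #     main(sys.argv[1])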