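"""save-web-as-zip / app.py

Gradio app that crawls a website with Scrapy starting from a user-supplied URL
(up to 50 pages), saves each page as an HTML file, and serves the result as a
downloadable ZIP archive.
"""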
import hashlib
import os
import shutil
import tempfile
import zipfile

import gradio as gr
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse

# Fix for a gradio_client issue: get_type() assumes every schema is a dict,
# so insert a guard that returns early for non-dict schemas (e.g. booleans).
def patch_gradio_client():
    file_path = "/usr/local/lib/python3.10/site-packages/gradio_client/utils.py"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Check if the patch is already applied
        if any("if not isinstance(schema, dict):" in line for line in lines):
            return

        new_lines = []
        inserted = False
        for line in lines:
            if not inserted and "def get_type(schema):" in line:
                new_lines.append(line)
                new_lines.append("    if not isinstance(schema, dict):\n")
                new_lines.append("        return str(type(schema))\n")
                inserted = True
            else:
                new_lines.append(line)

        # Write the patched file
        with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding="utf-8") as tmp_file:
            tmp_file.writelines(new_lines)

        # Replace the original file
        shutil.move(tmp_file.name, file_path)
        print("Successfully patched gradio_client")
    except Exception as e:
        print(f"Could not patch gradio_client: {e}")

# Apply the patch
patch_gradio_client()
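
# After patching, the top of gradio_client's get_type() is assumed to look
# roughly like this (sketch only; the exact body depends on the installed
# gradio_client version):
#
#     def get_type(schema):
#         if not isinstance(schema, dict):
#             return str(type(schema))
#         ...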


class LinkSpider(scrapy.Spider):
    """Spider that saves each crawled page as an HTML file in output_dir."""

    name = "link_spider"

    def __init__(self, start_url, output_dir, max_pages=50, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [start_url]
        self.output_dir = output_dir
        self.visited_urls = set()
        self.max_pages = max_pages
        self.page_count = 0

    def parse(self, response):
        if response.url in self.visited_urls or self.page_count >= self.max_pages:
            return
        self.visited_urls.add(response.url)
        self.page_count += 1

        # Derive a flat filename from the URL path ("/" becomes "_")
        parsed_url = urlparse(response.url)
        path = parsed_url.path.strip("/")
        filename = "index.html" if not path else path.replace("/", "_") + ".html"

        # Ensure the filename contains only safe characters
        filename = "".join(c for c in filename if c.isalnum() or c in ('_', '.', '-'))
        if not filename.endswith(".html"):
            filename += ".html"

        file_path = os.path.join(self.output_dir, filename)
        try:
            with open(file_path, "wb") as f:
                f.write(response.body)
        except OSError:
            # If the filename is too long or invalid, fall back to an MD5 hash of the URL
            filename = hashlib.md5(response.url.encode()).hexdigest() + ".html"
            file_path = os.path.join(self.output_dir, filename)
            with open(file_path, "wb") as f:
                f.write(response.body)

        # Follow absolute links if we haven't reached the page limit
        if self.page_count < self.max_pages:
            for href in response.css('a::attr(href)').getall():
                if href.startswith(('http://', 'https://')):
                    yield response.follow(href, self.parse)
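
    # Note: parse() follows any absolute http(s) link, so the crawl can wander
    # off the starting site. A minimal sketch of one way to keep it on the start
    # domain (assuming Scrapy's default OffsiteMiddleware, which drops requests
    # outside allowed_domains) would be to set, in __init__:
    #
    #     self.allowed_domains = [urlparse(start_url).netloc]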


def crawl_and_zip(start_url):
    if not start_url.startswith(('http://', 'https://')):
        raise gr.Error("The URL must start with http:// or https://")

    temp_dir = tempfile.mkdtemp()
    output_dir = os.path.join(temp_dir, "pages")
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Note: CrawlerProcess starts a Twisted reactor, which cannot be restarted
        # in the same process, so this handler only works once per process run
        # (a workaround sketch follows this function).
        process = CrawlerProcess(settings={
            "LOG_ENABLED": False,
            "USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "ROBOTSTXT_OBEY": False,
            "DOWNLOAD_DELAY": 1,
        })
        process.crawl(LinkSpider, start_url=start_url, output_dir=output_dir, max_pages=50)
        process.start()

        # Create the ZIP file
        zip_path = os.path.join(temp_dir, "website_crawl.zip")
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for root, _, files in os.walk(output_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname=arcname)

        # Collect file info for the result message
        file_list = []
        for root, _, files in os.walk(output_dir):
            for file in files:
                rel_dir = os.path.relpath(root, output_dir)
                rel_file = os.path.join(rel_dir, file) if rel_dir != "." else file
                file_list.append(rel_file)

        file_count = len(file_list)
        file_structure = "\n".join(file_list)

        return zip_path, f"Crawl complete!\n\nNumber of files: {file_count}\n\nFile list:\n{file_structure}"
    except Exception as e:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error(f"An error occurred while crawling: {e}")
    finally:
        # Clean up the raw pages (the ZIP file in temp_dir is served before this)
        shutil.rmtree(output_dir, ignore_errors=True)
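
# Sketch (not wired in): because the Twisted reactor cannot be restarted, a
# second crawl in the same process will fail. A common workaround is to run
# each crawl in a child process so it gets a fresh reactor. Roughly, assuming
# a module-level helper named _run_crawl:
#
#     import multiprocessing
#
#     def _run_crawl(start_url, output_dir):
#         process = CrawlerProcess(settings={"LOG_ENABLED": False})
#         process.crawl(LinkSpider, start_url=start_url, output_dir=output_dir, max_pages=50)
#         process.start()
#
#     p = multiprocessing.Process(target=_run_crawl, args=(start_url, output_dir))
#     p.start()
#     p.join()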


# Gradio interface
with gr.Blocks(title="Web Crawler") as demo:
    gr.Markdown("## 🌐 Web Page Crawler")
    gr.Markdown("Crawls web pages starting from the specified URL and lets you download them as a ZIP file.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Start URL",
            placeholder="https://example.com",
            max_lines=1
        )
        run_button = gr.Button("Start crawl", variant="primary")

    with gr.Row():
        zip_output = gr.File(label="Download file")
        info_output = gr.Textbox(label="Crawl result", interactive=False)

    run_button.click(
        fn=crawl_and_zip,
        inputs=url_input,
        outputs=[zip_output, info_output],
    )


if __name__ == "__main__":
    demo.launch()