import os
import shutil
import tempfile
import zipfile
import gradio as gr
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse


# Fix for a gradio_client issue: get_type() breaks on non-dict schemas (e.g. bool)
def patch_gradio_client():
    file_path = "/usr/local/lib/python3.10/site-packages/gradio_client/utils.py"
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Check if the patch is already applied
        if any("if not isinstance(schema, dict):" in line for line in lines):
            return

        new_lines = []
        inserted = False
        for line in lines:
            if not inserted and "def get_type(schema):" in line:
                new_lines.append(line)
                new_lines.append("    if not isinstance(schema, dict):\n")
                new_lines.append("        return str(type(schema))\n")
                inserted = True
            else:
                new_lines.append(line)

        # Write the patched file
        with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding="utf-8") as tmp_file:
            tmp_file.writelines(new_lines)

        # Replace the original file
        shutil.move(tmp_file.name, file_path)
        print("Successfully patched gradio_client")
    except Exception as e:
        print(f"Could not patch gradio_client: {e}")


# Apply the patch
patch_gradio_client()


class LinkSpider(scrapy.Spider):
    name = "link_spider"

    def __init__(self, start_url, output_dir, max_pages=50, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [start_url]
        self.output_dir = output_dir
        self.visited_urls = set()
        self.max_pages = max_pages
        self.page_count = 0

    def parse(self, response):
        if response.url in self.visited_urls or self.page_count >= self.max_pages:
            return
        self.visited_urls.add(response.url)
        self.page_count += 1

        # Save the page body, deriving a filename from the URL path
        parsed_url = urlparse(response.url)
        path = parsed_url.path.strip("/")
        filename = "index.html" if not path else path.replace("/", "_") + ".html"

        # Ensure the filename is valid
        filename = "".join(c for c in filename if c.isalnum() or c in ('_', '.', '-'))
        if not filename.endswith(".html"):
            filename += ".html"

        file_path = os.path.join(self.output_dir, filename)
        try:
            with open(file_path, "wb") as f:
                f.write(response.body)
        except OSError:
            # If the filename is too long or invalid, fall back to a hash of the URL
            import hashlib
            filename = hashlib.md5(response.url.encode()).hexdigest() + ".html"
            file_path = os.path.join(self.output_dir, filename)
            with open(file_path, "wb") as f:
                f.write(response.body)

        # Follow links if we haven't reached the limit
        if self.page_count < self.max_pages:
            for href in response.css('a::attr(href)').getall():
                if href.startswith(('http://', 'https://')):
                    yield response.follow(href, self.parse)


def crawl_and_zip(start_url):
    if not start_url.startswith(('http://', 'https://')):
        raise gr.Error("The URL must start with http:// or https://")

    temp_dir = tempfile.mkdtemp()
    output_dir = os.path.join(temp_dir, "pages")
    os.makedirs(output_dir, exist_ok=True)

    try:
        process = CrawlerProcess(settings={
            "LOG_ENABLED": False,
            "USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "ROBOTSTXT_OBEY": False,
            "DOWNLOAD_DELAY": 1,
        })
        process.crawl(LinkSpider, start_url=start_url, output_dir=output_dir, max_pages=50)
        # NOTE: this blocks until the crawl finishes. The Twisted reactor can only be
        # started once per Python process, so a second crawl in the same session will
        # raise ReactorNotRestartable.
        process.start()

        # Create the ZIP file
        zip_path = os.path.join(temp_dir, "website_crawl.zip")
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for root, _, files in os.walk(output_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname=arcname)

        # Collect file info for the result message
        file_list = []
        for root, _, files in os.walk(output_dir):
            for file in files:
                rel_dir = os.path.relpath(root, output_dir)
                rel_file = os.path.join(rel_dir, file) if rel_dir != "." else file
                file_list.append(rel_file)
        file_count = len(file_list)
        file_structure = "\n".join(file_list)

        return zip_path, f"Crawl complete!\n\nFile count: {file_count}\n\nFile list:\n{file_structure}"
    except Exception as e:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error(f"An error occurred during the crawl: {str(e)}")
    finally:
        # Clean up the raw pages (the ZIP file is served before this runs)
        shutil.rmtree(output_dir, ignore_errors=True)


# Gradio interface
with gr.Blocks(title="Web Crawler") as demo:
    gr.Markdown("## 🌐 Web Page Crawler")
    gr.Markdown("Starts at the given URL, crawls web pages, and provides them as a downloadable ZIP file.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Start URL",
            placeholder="https://example.com",
            max_lines=1
        )
        run_button = gr.Button("Start crawl", variant="primary")

    with gr.Row():
        zip_output = gr.File(label="Download file")
        info_output = gr.Textbox(label="Crawl result", interactive=False)

    run_button.click(
        fn=crawl_and_zip,
        inputs=url_input,
        outputs=[zip_output, info_output],
    )

if __name__ == "__main__":
    demo.launch()