import requests
from bs4 import BeautifulSoup
import json
import os
import time

# ================================
# CONFIG
# ================================
BASE_URL = "https://thuvienphapluat.vn"
CATEGORY_URL = "https://thuvienphapluat.vn/van-ban/linh-vuc/giao-thong-van-tai?page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
SAVE_DIR = "data"
SAVE_PATH = os.path.join(SAVE_DIR, "legal_corpus.json")

# ================================
# Step 1: Get the list of links to law documents on a category page
# ================================
def get_law_links(page_url):
    res = requests.get(page_url, headers=HEADERS, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    law_links = []
    # Each document on the listing page is a <li> with an <a class="title"> link.
    for a in soup.select(".ul-list-doc li a.title"):
        href = a.get("href")
        if href and href.startswith("/van-ban/"):
            law_links.append(BASE_URL + href)
    return law_links

# ================================
# Step 2: Parse the details of a single law document
# ================================
def parse_law_detail(url):
    try:
        res = requests.get(url, headers=HEADERS, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Derive a law id from the URL slug, e.g. ".../ten-van-ban-123.aspx" -> "ten-van-ban-123"
        law_id = url.strip("/").split("/")[-1].split(".")[0].lower()

        # The document body alternates <h3> article headings ("Điều ...") and <p> clause paragraphs.
        content = soup.select("div.content-doc > h3, div.content-doc > p")
        articles = []
        current_article = None
        for tag in content:
            if tag.name == "h3" and "Điều" in tag.text:
                # New article heading: "Điều 5. ..." -> article_id "5"
                article_id = tag.text.split(".")[0].strip().replace("Điều ", "")
                current_article = {
                    "article_id": article_id,
                    "title": tag.text.strip(),
                    "text": ""
                }
                articles.append(current_article)
            elif tag.name == "p" and current_article:
                # Append clause paragraphs to the text of the current article.
                clause_text = tag.text.strip()
                if clause_text:
                    current_article["text"] = (current_article["text"] + " " + clause_text).strip()

        return {
            "law_id": law_id,
            "articles": articles
        }
    except Exception as e:
        print(f"[ERROR] Failed to crawl {url}: {e}")
        return None

# ================================
# Step 3: Crawl all category pages
# ================================
def crawl_all(pages=2):
    os.makedirs(SAVE_DIR, exist_ok=True)

    all_laws = []
    for page in range(1, pages + 1):
        print(f"\n🔎 Crawling category page {page}...")
        page_url = CATEGORY_URL.format(page)
        links = get_law_links(page_url)
        for link in links:
            print(f"📄 Crawling law: {link}")
            law_data = parse_law_detail(link)
            if law_data:
                all_laws.append(law_data)
            time.sleep(2)  # be polite to the server between requests

    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(all_laws, f, ensure_ascii=False, indent=2)

    print(f"\n✅ DONE. Saved {len(all_laws)} documents to: {SAVE_PATH}")

# ================================
# RUN
# ================================
if __name__ == "__main__":
    crawl_all(pages=3)  # adjust the number of category pages to crawl
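
# ================================
# OUTPUT FORMAT (illustrative sketch)
# ================================
# Based on the dictionaries built in parse_law_detail() above, legal_corpus.json
# holds a list of documents shaped roughly as follows. The values shown here are
# placeholders, not real crawled data:
#
# [
#   {
#     "law_id": "<url-slug-of-the-document>",
#     "articles": [
#       {
#         "article_id": "1",
#         "title": "Điều 1. <article heading as scraped>",
#         "text": "<concatenated clause paragraphs of the article>"
#       }
#     ]
#   }
# ]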