"""Crawler for legal documents in the traffic / transport category on thuvienphapluat.vn."""

import json
import os
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://thuvienphapluat.vn"
CATEGORY_URL = "https://thuvienphapluat.vn/van-ban/linh-vuc/giao-thong-van-tai?page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
SAVE_DIR = "data"
SAVE_PATH = os.path.join(SAVE_DIR, "legal_corpus.json")


def get_law_links(page_url):
    """Collect absolute URLs of the law documents listed on one category page."""
    res = requests.get(page_url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')
    law_links = []

    for a in soup.select(".ul-list-doc li a.title"):
        href = a.get('href')
        # Keep only links that point to legal documents on the same site.
        if href and href.startswith("/van-ban/"):
            full_url = BASE_URL + href
            law_links.append(full_url)

    return law_links


def parse_law_detail(url):
    """Fetch one law page and split its body into articles ("Điều ...")."""
    try:
        res = requests.get(url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Use the last URL segment (without its extension) as the law id.
        law_id = url.strip('/').split('/')[-1].split('.')[0].lower()
        content = soup.select("div.content-doc > h3, div.content-doc > p")

        articles = []
        current_article = None

        for tag in content:
            if tag.name == 'h3' and 'Điều' in tag.text:
                # Article heading, e.g. "Điều 5. ..." -> article_id "5".
                article_id = tag.text.split('.')[0].strip().replace("Điều ", "")
                current_article = {
                    "article_id": article_id,
                    "title": tag.text.strip(),
                    "text": ""
                }
                articles.append(current_article)
            elif tag.name == 'p' and current_article:
                # Paragraphs that follow a heading belong to the current article.
                clause_text = tag.text.strip()
                if clause_text:
                    current_article["text"] += " " + clause_text

        return {
            "law_id": law_id,
            "articles": articles
        }

    except Exception as e:
        print(f"[ERROR] Failed to crawl {url}: {e}")
        return None


def crawl_all(pages=2):
    """Crawl `pages` category pages, parse every law found, and save the corpus as JSON."""
    os.makedirs(SAVE_DIR, exist_ok=True)

    all_laws = []
    for page in range(1, pages + 1):
        print(f"\n🔎 Crawling category page {page}...")
        page_url = CATEGORY_URL.format(page)
        links = get_law_links(page_url)

        for link in links:
            print(f"📄 Crawling law: {link}")
            law_data = parse_law_detail(link)
            if law_data:
                all_laws.append(law_data)
            time.sleep(2)  # be polite to the server between requests

    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(all_laws, f, ensure_ascii=False, indent=2)

    print(f"\n✅ DONE. Saved {len(all_laws)} documents to: {SAVE_PATH}")


if __name__ == "__main__":
    crawl_all(pages=3)