import requests
from bs4 import BeautifulSoup
import json
import os
import time
# ================================
# CONFIG
# ================================
BASE_URL = "https://thuvienphapluat.vn"
CATEGORY_URL = "https://thuvienphapluat.vn/van-ban/linh-vuc/giao-thong-van-tai?page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
SAVE_DIR = "data"
SAVE_PATH = os.path.join(SAVE_DIR, "legal_corpus.json")
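# ================================
# Optional: shared HTTP session with retries (a sketch only; the functions
# below still call requests.get directly and do not use it). The Retry
# values here are illustrative assumptions, not settings from the original crawl.
# ================================
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(HEADERS)
session.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))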
# ================================
# Step 1: Get the list of law document links
# ================================
def get_law_links(page_url):
    """Collect absolute URLs of law documents listed on one category page."""
    res = requests.get(page_url, headers=HEADERS)
    soup = BeautifulSoup(res.text, 'html.parser')
    law_links = []
    # Each document link on the listing page sits under .ul-list-doc li a.title
    for a in soup.select(".ul-list-doc li a.title"):
        href = a.get('href')
        if href and href.startswith("/van-ban/"):
            full_url = BASE_URL + href
            law_links.append(full_url)
    return law_links
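# Example (hypothetical call): get_law_links(CATEGORY_URL.format(1)) returns
# absolute URLs of the form BASE_URL + "/van-ban/..." for each document
# listed on page 1 of the category.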
# ================================
# Step 2: Parse the details of one law document
# ================================
def parse_law_detail(url):
    """Parse one law document page into {law_id, articles}."""
    try:
        res = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Derive the law id from the last path segment of the URL
        law_id = url.strip('/').split('/')[-1].split('.')[0].lower()
        content = soup.select("div.content-doc > h3, div.content-doc > p")
        articles = []
        current_article = None
        for tag in content:
            # 'Điều' ("Article") headings start a new article
            if tag.name == 'h3' and 'Điều' in tag.text:
                article_id = tag.text.split('.')[0].strip().replace("Điều ", "")
                current_article = {
                    "article_id": article_id,
                    "title": tag.text.strip(),
                    "text": ""
                }
                articles.append(current_article)
            # Paragraphs following a heading are appended to the current article
            elif tag.name == 'p' and current_article:
                clause_text = tag.text.strip()
                if clause_text:
                    current_article["text"] += " " + clause_text
        return {
            "law_id": law_id,
            "articles": articles
        }
    except Exception as e:
        print(f"[ERROR] Failed to crawl {url}: {e}")
        return None
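# Shape of the dict parse_law_detail returns (values shown are placeholders,
# not real data from the site):
#   {"law_id": "<slug-from-url>",
#    "articles": [{"article_id": "1", "title": "Điều 1. ...", "text": "..."}]}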
# ================================
# Step 3: Crawl all documents across multiple category pages
# ================================
def crawl_all(pages=2):
    """Crawl `pages` category pages and save all parsed laws to SAVE_PATH."""
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    all_laws = []
    for page in range(1, pages + 1):
        print(f"\n🔎 Crawling category page {page}...")
        page_url = CATEGORY_URL.format(page)
        links = get_law_links(page_url)
        for link in links:
            print(f"📄 Crawling law: {link}")
            law_data = parse_law_detail(link)
            if law_data:
                all_laws.append(law_data)
            time.sleep(2)  # pause between requests to avoid hammering the server
    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(all_laws, f, ensure_ascii=False, indent=2)
    print(f"\n✅ DONE. Saved {len(all_laws)} documents to: {SAVE_PATH}")
# ================================
# RUN
# ================================
if __name__ == "__main__":
    crawl_all(pages=3)  # adjust the number of category pages to crawl