import requests
from bs4 import BeautifulSoup
import json
import os
import time
# ================================
# CONFIG
# ================================
BASE_URL = "https://thuvienphapluat.vn"
CATEGORY_URL = "https://thuvienphapluat.vn/van-ban/linh-vuc/giao-thong-van-tai?page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
SAVE_DIR = "data"
SAVE_PATH = os.path.join(SAVE_DIR, "legal_corpus.json")
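# ================================
# Optional: shared HTTP session with retries (a sketch only; the functions
# below still call requests.get directly and do not use it). The Retry
# values here are illustrative assumptions, not settings from the original crawl.
# ================================
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(HEADERS)
session.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))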
# ================================
# Step 1: Get the list of law document links
# ================================
def get_law_links(page_url):
    """Collect absolute URLs of law documents listed on one category page."""
    res = requests.get(page_url, headers=HEADERS)
    soup = BeautifulSoup(res.text, 'html.parser')
    law_links = []
    # Each document link on the listing page sits under .ul-list-doc li a.title
    for a in soup.select(".ul-list-doc li a.title"):
        href = a.get('href')
        if href and href.startswith("/van-ban/"):
            full_url = BASE_URL + href
            law_links.append(full_url)
    return law_links
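# Example (hypothetical call): get_law_links(CATEGORY_URL.format(1)) returns
# absolute URLs of the form BASE_URL + "/van-ban/..." for each document
# listed on page 1 of the category.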
# ================================
# Step 2: Parse the details of one law document
# ================================
def parse_law_detail(url):
    """Parse one law document page into {law_id, articles}."""
    try:
        res = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Derive the law id from the last path segment of the URL
        law_id = url.strip('/').split('/')[-1].split('.')[0].lower()
        content = soup.select("div.content-doc > h3, div.content-doc > p")
        articles = []
        current_article = None
        for tag in content:
            # 'Điều' ("Article") headings start a new article
            if tag.name == 'h3' and 'Điều' in tag.text:
                article_id = tag.text.split('.')[0].strip().replace("Điều ", "")
                current_article = {
                    "article_id": article_id,
                    "title": tag.text.strip(),
                    "text": ""
                }
                articles.append(current_article)
            # Paragraphs following a heading are appended to the current article
            elif tag.name == 'p' and current_article:
                clause_text = tag.text.strip()
                if clause_text:
                    current_article["text"] += " " + clause_text
        return {
            "law_id": law_id,
            "articles": articles
        }
    except Exception as e:
        print(f"[ERROR] Failed to crawl {url}: {e}")
        return None
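# Shape of the dict parse_law_detail returns (values shown are placeholders,
# not real data from the site):
#   {"law_id": "<slug-from-url>",
#    "articles": [{"article_id": "1", "title": "Điều 1. ...", "text": "..."}]}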
# ================================
# Step 3: Crawl all documents across multiple category pages
# ================================
def crawl_all(pages=2):
    """Crawl `pages` category pages and save all parsed laws to SAVE_PATH."""
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    all_laws = []
    for page in range(1, pages + 1):
        print(f"\n🔎 Crawling category page {page}...")
        page_url = CATEGORY_URL.format(page)
        links = get_law_links(page_url)
        for link in links:
            print(f"📄 Crawling law: {link}")
            law_data = parse_law_detail(link)
            if law_data:
                all_laws.append(law_data)
            time.sleep(2)  # pause between requests to avoid hammering the server
    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(all_laws, f, ensure_ascii=False, indent=2)
    print(f"\n✅ DONE. Saved {len(all_laws)} documents to: {SAVE_PATH}")
# ================================
# RUN
# ================================
if __name__ == "__main__":
    crawl_all(pages=3)  # adjust the number of category pages to crawl