import requests
from bs4 import BeautifulSoup
import json
import os
import time

# ================================
# CONFIG
# ================================
BASE_URL = "https://thuvienphapluat.vn"
CATEGORY_URL = "https://thuvienphapluat.vn/van-ban/linh-vuc/giao-thong-van-tai?page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
SAVE_DIR = "data"
SAVE_PATH = os.path.join(SAVE_DIR, "legal_corpus.json")

# ================================
# Step 1: Get the list of links to law documents on a category page
# ================================
def get_law_links(page_url):
    res = requests.get(page_url, headers=HEADERS, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    law_links = []
    # Each document on the listing page is a <li> with an <a class="title"> link.
    for a in soup.select(".ul-list-doc li a.title"):
        href = a.get("href")
        if href and href.startswith("/van-ban/"):
            law_links.append(BASE_URL + href)
    return law_links

# ================================
# Step 2: Parse the details of a single law document
# ================================
def parse_law_detail(url):
    try:
        res = requests.get(url, headers=HEADERS, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Derive a law id from the URL slug, e.g. ".../ten-van-ban-123.aspx" -> "ten-van-ban-123"
        law_id = url.strip("/").split("/")[-1].split(".")[0].lower()

        # The document body alternates <h3> article headings ("Điều ...") and <p> clause paragraphs.
        content = soup.select("div.content-doc > h3, div.content-doc > p")
        articles = []
        current_article = None
        for tag in content:
            if tag.name == "h3" and "Điều" in tag.text:
                # New article heading: "Điều 5. ..." -> article_id "5"
                article_id = tag.text.split(".")[0].strip().replace("Điều ", "")
                current_article = {
                    "article_id": article_id,
                    "title": tag.text.strip(),
                    "text": ""
                }
                articles.append(current_article)
            elif tag.name == "p" and current_article:
                # Append clause paragraphs to the text of the current article.
                clause_text = tag.text.strip()
                if clause_text:
                    current_article["text"] = (current_article["text"] + " " + clause_text).strip()

        return {
            "law_id": law_id,
            "articles": articles
        }
    except Exception as e:
        print(f"[ERROR] Failed to crawl {url}: {e}")
        return None

# ================================
# Step 3: Crawl all category pages
# ================================
def crawl_all(pages=2):
    os.makedirs(SAVE_DIR, exist_ok=True)

    all_laws = []
    for page in range(1, pages + 1):
        print(f"\n🔎 Crawling category page {page}...")
        page_url = CATEGORY_URL.format(page)
        links = get_law_links(page_url)
        for link in links:
            print(f"📄 Crawling law: {link}")
            law_data = parse_law_detail(link)
            if law_data:
                all_laws.append(law_data)
            time.sleep(2)  # be polite to the server between requests

    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(all_laws, f, ensure_ascii=False, indent=2)

    print(f"\n✅ DONE. Saved {len(all_laws)} documents to: {SAVE_PATH}")

# ================================
# RUN
# ================================
if __name__ == "__main__":
    crawl_all(pages=3)  # adjust the number of category pages to crawl
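
# ================================
# OUTPUT FORMAT (illustrative sketch)
# ================================
# Based on the dictionaries built in parse_law_detail() above, legal_corpus.json
# holds a list of documents shaped roughly as follows. The values shown here are
# placeholders, not real crawled data:
#
# [
#   {
#     "law_id": "<url-slug-of-the-document>",
#     "articles": [
#       {
#         "article_id": "1",
#         "title": "Điều 1. <article heading as scraped>",
#         "text": "<concatenated clause paragraphs of the article>"
#       }
#     ]
#   }
# ]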