File size: 2,917 Bytes
01661a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from typing import List
import os

class VehicleManualLoader:
    """Load a PDF manual through ``PyPDFLoader`` and run simple lookups on it.

    The documents returned by ``PyPDFLoader.load()`` are kept in
    ``self.documents`` and are treated as pages by the other methods:
    page numbers exposed to callers are 1-based positions in that list.
    """

    def __init__(self, file_path: str):
        """Remember the PDF path after checking that it points to a file.

        Args:
            file_path: Path of the PDF to load later with ``load_pdf()``.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing regular
                file (a missing path or a directory).
        """
        # isfile() rather than exists(): a directory would pass an exists()
        # check and only fail later, inside the PDF loader, with an error
        # that callers catching FileNotFoundError would not see.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {file_path}")

        self.file_path = file_path
        # Filled by load_pdf(); empty until then.
        self.documents = []

    def load_pdf(self) -> List[Document]:
        """Load the PDF, store the result, print a short preview, and return it.

        Returns:
            The list produced by ``PyPDFLoader(self.file_path).load()``;
            the same list is stored in ``self.documents``.
        """
        print(f"PDF ๋กœ๋”ฉ ์ค‘: {self.file_path}")

        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

        print(f"์ด {len(self.documents)} ํŽ˜์ด์ง€ ๋กœ๋“œ ์™„๋ฃŒ")

        self._preview_documents()
        return self.documents

    def _preview_documents(self, num_pages: int = 2):
        """Print metadata and the first 500 characters of the leading pages.

        Args:
            num_pages: How many documents from the start of
                ``self.documents`` to preview.
        """
        print("\n ๋ฌธ์„œ ๋ฏธ๋ฆฌ ๋ณด๊ธฐ: ")
        print("-" * 50)

        for i, doc in enumerate(self.documents[:num_pages]):
            print(f"\n[ํŽ˜์ด์ง€ {i+1}]")
            print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ: {doc.metadata}")

            content_preview = doc.page_content[:500]
            print(f"๋‚ด์šฉ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {content_preview} ...")
            print("-" * 50)

    def get_page(self, page_num: int) -> Document:
        """Return the document at a 1-based page number.

        Args:
            page_num: Page number, from 1 to ``len(self.documents)``.

        Returns:
            ``self.documents[page_num - 1]``.

        Raises:
            ValueError: If ``self.documents`` is empty, or if ``page_num``
                is outside the valid range.
        """
        if not self.documents:
            raise ValueError("๋จผ์ € load_pdf()๋ฅผ ์‹คํ–‰ํ•ด์ฃผ์„ธ์š”.")

        if page_num < 1 or page_num > len(self.documents):
            raise ValueError(f"ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ๋Š” 1~{len(self.documents)} ์‚ฌ์ด์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค")

        return self.documents[page_num - 1]

    def search_keyword(self, keyword: str) -> List[tuple]:
        """Find the sentences that contain ``keyword``, ignoring case.

        Page text is split on ``"."`` and each piece is treated as one
        sentence.

        Args:
            keyword: Text to look for. An empty or whitespace-only keyword
                yields no results instead of matching every sentence.

        Returns:
            A list of ``(page_number, sentence)`` tuples in document order,
            where ``page_number`` is 1-based and ``sentence`` is stripped of
            surrounding whitespace.

        Raises:
            ValueError: If ``self.documents`` is empty.
        """
        if not self.documents:
            raise ValueError("๋จผ์ € load_pdf()๋ฅผ ์‹คํ–‰ํ•ด์ฃผ์„ธ์š”.")

        # "" is a substring of every string, so a blank keyword would
        # otherwise return every sentence of the manual.
        if not keyword.strip():
            return []

        # Lower-case the keyword once instead of once per page and sentence.
        needle = keyword.lower()
        results = []

        for page_num, doc in enumerate(self.documents, start=1):
            # Cheap page-level check so pages without a hit are not split.
            if needle not in doc.page_content.lower():
                continue
            for sentence in doc.page_content.split("."):
                if needle in sentence.lower():
                    results.append((page_num, sentence.strip()))

        return results


if __name__ == "__main__":
    # Demo run: load one manual, print basic statistics, then show the first
    # few keyword hits.
    pdf_path = r"C:\Users\Admin\Desktop\vehicle-manual-rag\data\LX3_2026_ko_KR.pdf"

    try:
        loader = VehicleManualLoader(pdf_path)

        documents = loader.load_pdf()

        print(f"\n ๋ฌธ์„œ ํ†ต๊ณ„:")
        print(f"- ์ด ํŽ˜์ด์ง€ ์ˆ˜: {len(documents)}")

        total_chars = sum(len(doc.page_content) for doc in documents)
        # Guard the average: an empty result would otherwise raise
        # ZeroDivisionError.
        average_chars = total_chars // len(documents) if documents else 0
        print(f" - ์ด ๋ฌธ์ž ์ˆ˜: {total_chars}")
        print(f" - ํ‰๊ท  ํŽ˜์ด์ง€๋‹น ๋ฌธ์ž: {average_chars:,}")

        print("\n '์—”์ง„ ์˜ค์ผ' ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ: ")
        # search_keyword() raises ValueError when no documents are loaded,
        # which is not caught below, so skip the search for an empty result.
        results = loader.search_keyword("์—”์ง„ ์˜ค์ผ") if documents else []
        # Only the first three hits are shown, each cut to 100 characters.
        for page_num, sentence in results[:3]:
            print(f" - {page_num}ํŽ˜์ด์ง€: {sentence[:100]}")

    except FileNotFoundError as e:
        print(f"์—๋Ÿฌ: {e}")
        print("PDF ํŒŒ์ผ์„ 'data' ํด๋”์— ๋„ฃ์–ด์ฃผ์„ธ์š”")