File size: 2,917 Bytes
01661a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from typing import List
import os
class VehicleManualLoader:
    """Load a vehicle-manual PDF into documents and run simple lookups on them.

    The PDF is read with ``PyPDFLoader`` and the resulting list is cached in
    ``self.documents``. ``get_page`` and ``search_keyword`` treat each entry
    of that list as one page and require ``load_pdf`` to have been called.
    """

    # NOTE(review): the user-facing message strings in this class look
    # mis-encoded in the reviewed copy; they are kept exactly as found.

    def __init__(self, file_path: str):
        """Remember the PDF location after checking that it is a real file.

        Args:
            file_path: Path to the PDF on the local filesystem.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing regular
                file (a directory is rejected as well).
        """
        # isfile() instead of exists(): a directory satisfies exists() but is
        # not something that can be opened as a PDF.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค: {file_path}")
        self.file_path = file_path
        # Filled by load_pdf(); an empty list means "nothing loaded yet".
        self.documents: List["Document"] = []

    def load_pdf(self, *, preview: bool = True) -> List["Document"]:
        """Read the PDF and cache the resulting documents on the instance.

        Args:
            preview: When true (the default, which matches the previous
                behaviour) a short preview of the first pages is printed.
                Pass ``False`` to load without the preview output.

        Returns:
            The list returned by ``PyPDFLoader.load()``; the same list is
            stored in ``self.documents``.
        """
        print(f"PDF ๋ก๋ฉ ์ค: {self.file_path}")
        self.documents = PyPDFLoader(self.file_path).load()
        print(f"์ด {len(self.documents)} ํ์ด์ง ๋ก๋ ์๋ฃ")
        if preview:
            self._preview_documents()
        return self.documents

    def _preview_documents(self, num_pages: int = 2):
        """Print metadata and the first 500 characters of the leading pages.

        Args:
            num_pages: How many documents from the start of the list to show.
        """
        print("\n ๋ฌธ์ ๋ฏธ๋ฆฌ ๋ณด๊ธฐ: ")
        print("-" * 50)
        for page_no, doc in enumerate(self.documents[:num_pages], start=1):
            print(f"\n[ํ์ด์ง {page_no}]")
            print(f"๋ฉํ๋ฐ์ดํฐ: {doc.metadata}")
            content_preview = doc.page_content[:500]
            print(f"๋ด์ฉ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {content_preview} ...")
            # NOTE(review): indentation was lost in the reviewed copy; the
            # separator is assumed to be printed after every page - confirm.
            print("-" * 50)

    def get_page(self, page_num: int) -> "Document":
        """Return the document for a 1-based page number.

        Args:
            page_num: Page number, starting at 1.

        Returns:
            The entry of ``self.documents`` at index ``page_num - 1``.

        Raises:
            ValueError: If no documents are loaded, or ``page_num`` is outside
                ``1..len(self.documents)``.
        """
        if not self.documents:
            raise ValueError("๋จผ์ load_pdf()๋ฅผ ์คํํด์ฃผ์ธ์.")
        if not 1 <= page_num <= len(self.documents):
            raise ValueError(f"ํ์ด์ง ๋ฒํธ๋ 1~{len(self.documents)} ์ฌ์ด์ฌ์ผ ํฉ๋๋ค")
        return self.documents[page_num - 1]

    def search_keyword(self, keyword: str) -> List[tuple]:
        """Find the sentences that mention ``keyword``, ignoring case.

        Every page is split on "." and each fragment containing the keyword
        is reported. Because the split removes the dots, a keyword that itself
        contains "." cannot match any fragment.

        Args:
            keyword: Text to look for. An empty string yields no results
                rather than matching every fragment of every page.

        Returns:
            ``(page_number, sentence)`` tuples in page order; page numbers are
            1-based and sentences are stripped of surrounding whitespace.

        Raises:
            ValueError: If no documents are loaded.
        """
        if not self.documents:
            raise ValueError("๋จผ์ load_pdf()๋ฅผ ์คํํด์ฃผ์ธ์.")
        if not keyword:
            return []

        needle = keyword.lower()  # lower-cased once instead of per sentence
        results = []
        for page_no, doc in enumerate(self.documents, start=1):
            text = doc.page_content
            if needle not in text.lower():
                continue  # cheap page-level skip before splitting
            results.extend(
                (page_no, sentence.strip())
                for sentence in text.split(".")
                if needle in sentence.lower()
            )
        return results
if __name__ == "__main__":
    # Demo run: load one manual, print simple statistics and the first three
    # keyword hits. The path is a fixed location on the author's machine.
    pdf_path = r"C:\Users\Admin\Desktop\vehicle-manual-rag\data\LX3_2026_ko_KR.pdf"
    try:
        loader = VehicleManualLoader(pdf_path)
        documents = loader.load_pdf()

        print("\n ๋ฌธ์ ํต๊ณ:")
        print(f"- ์ด ํ์ด์ง ์: {len(documents)}")
        total_chars = sum(len(doc.page_content) for doc in documents)
        print(f" - ์ด ๋ฌธ์ ์: {total_chars}")
        # A PDF that yields no pages must not abort the report with
        # ZeroDivisionError; report an average of 0 instead.
        avg_chars = total_chars // len(documents) if documents else 0
        print(f" - ํ๊ท ํ์ด์ง๋น ๋ฌธ์: {avg_chars:,}")

        print("\n '์์ง ์ค์ผ' ํค์๋ ๊ฒ์ ๊ฒฐ๊ณผ: ")
        # search_keyword() raises ValueError on an empty document list, which
        # the handler below would not catch, so skip the search in that case.
        results = loader.search_keyword("์์ง ์ค์ผ") if documents else []
        for page_num, sentence in results[:3]:
            print(f" - {page_num}ํ์ด์ง: {sentence[:100]}")
    except FileNotFoundError as e:
        print(f"์๋ฌ: {e}")
        print("PDF ํ์ผ์ 'data' ํด๋์ ๋ฃ์ด์ฃผ์ธ์")