palisade / src /document_loader.py
Jina Camellia Yoo
upload full project structure
01661a1
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from typing import List
import os
class VehicleManualLoader:
    """Load a vehicle-manual PDF and offer simple page and keyword lookups.

    The documents returned by ``PyPDFLoader`` are kept in ``self.documents``
    in loader order (presumably one document per PDF page -- verify against
    the loader). The lookup helpers address them with 1-based page numbers.

    NOTE(review): the user-facing message strings in this class look
    mis-encoded (possibly UTF-8 text decoded with the wrong codec). They are
    left untouched here -- confirm the file's encoding before changing them.
    """

    def __init__(self, file_path: str):
        """Remember the PDF location and verify that the path exists.

        Args:
            file_path: Path of the PDF to be read later by ``load_pdf()``.

        Raises:
            FileNotFoundError: If nothing exists at ``file_path``.
        """
        self.file_path = file_path
        # Filled by load_pdf(); an empty list means "nothing loaded yet".
        self.documents = []

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"PDF ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {file_path}")

    def load_pdf(self) -> List[Document]:
        """Read the PDF, store the resulting documents, and print a preview.

        Returns:
            The list produced by ``PyPDFLoader.load()``; the same list is
            kept in ``self.documents``.
        """
        print(f"PDF ๋กœ๋”ฉ ์ค‘: {self.file_path}")

        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

        print(f"์ด {len(self.documents)} ํŽ˜์ด์ง€ ๋กœ๋“œ ์™„๋ฃŒ")
        self._preview_documents()
        return self.documents

    def _preview_documents(self, num_pages: int = 2):
        """Print metadata and the first 500 characters of the leading documents.

        Args:
            num_pages: How many documents from the start of
                ``self.documents`` to show.
        """
        print("\n ๋ฌธ์„œ ๋ฏธ๋ฆฌ ๋ณด๊ธฐ: ")
        print("-" * 50)

        for i, doc in enumerate(self.documents[:num_pages]):
            print(f"\n[ํŽ˜์ด์ง€ {i+1}]")
            print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ: {doc.metadata}")
            content_preview = doc.page_content[:500]
            print(f"๋‚ด์šฉ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {content_preview} ...")
            print("-" * 50)

    def get_page(self, page_num: int) -> Document:
        """Return the document for a 1-based page number.

        Args:
            page_num: Page number, counted from 1.

        Returns:
            The entry of ``self.documents`` at index ``page_num - 1``.

        Raises:
            ValueError: If no documents are loaded, or ``page_num`` is outside
                ``1..len(self.documents)``.
        """
        if not self.documents:
            raise ValueError("๋จผ์ € load_pdf()๋ฅผ ์‹คํ–‰ํ•ด์ฃผ์„ธ์š”.")

        if page_num < 1 or page_num > len(self.documents):
            raise ValueError(f"ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ๋Š” 1~{len(self.documents)} ์‚ฌ์ด์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค")

        return self.documents[page_num - 1]

    def search_keyword(self, keyword: str) -> List[tuple]:
        """Find the sentences that contain ``keyword``, ignoring case.

        Each document's text is split on "." and every fragment containing
        the keyword is reported. A keyword that itself contains "." can
        therefore never match a fragment.

        Args:
            keyword: Text to look for. An empty or whitespace-only keyword
                yields no results.

        Returns:
            A list of ``(page_number, sentence)`` tuples in document order,
            where ``page_number`` is 1-based and ``sentence`` is stripped of
            surrounding whitespace.

        Raises:
            ValueError: If no documents are loaded.
        """
        if not self.documents:
            raise ValueError("๋จผ์ € load_pdf()๋ฅผ ์‹คํ–‰ํ•ด์ฃผ์„ธ์š”.")

        # A blank keyword is a substring of every fragment, which would
        # return the whole manual (including empty fragments) as "matches".
        if not keyword.strip():
            return []

        # Lower-case the keyword once instead of per page and per sentence.
        needle = keyword.lower()

        results = []
        for page_number, doc in enumerate(self.documents, start=1):
            page_text = doc.page_content
            # Cheap page-level rejection before splitting into sentences.
            if needle not in page_text.lower():
                continue
            for sentence in page_text.split("."):
                if needle in sentence.lower():
                    results.append((page_number, sentence.strip()))
        return results
if __name__ == "__main__":
    # Demo run: load the manual, print size statistics, then show a few
    # keyword hits.
    pdf_path = r"C:\Users\Admin\Desktop\vehicle-manual-rag\data\LX3_2026_ko_KR.pdf"

    try:
        loader = VehicleManualLoader(pdf_path)
        documents = loader.load_pdf()

        print(f"\n ๋ฌธ์„œ ํ†ต๊ณ„:")
        print(f"- ์ด ํŽ˜์ด์ง€ ์ˆ˜: {len(documents)}")

        total_chars = sum(len(doc.page_content) for doc in documents)
        # A PDF that yields no pages would otherwise crash the demo with
        # ZeroDivisionError when computing the per-page average.
        avg_chars = total_chars // len(documents) if documents else 0
        print(f" - ์ด ๋ฌธ์ž ์ˆ˜: {total_chars}")
        print(f" - ํ‰๊ท  ํŽ˜์ด์ง€๋‹น ๋ฌธ์ž: {avg_chars:,}")

        print("\n '์—”์ง„ ์˜ค์ผ' ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ: ")
        # search_keyword() raises ValueError when nothing is loaded, so skip
        # the search for an empty document list.
        results = loader.search_keyword("์—”์ง„ ์˜ค์ผ") if documents else []
        # Only the first three hits are shown, each cut to 100 characters.
        for page_num, sentence in results[:3]:
            print(f" - {page_num}ํŽ˜์ด์ง€: {sentence[:100]}")

    except FileNotFoundError as e:
        print(f"์—๋Ÿฌ: {e}")
        print("PDF ํŒŒ์ผ์„ 'data' ํด๋”์— ๋„ฃ์–ด์ฃผ์„ธ์š”")