import os
import sys
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
|
|
|
class VehicleManualLoader:
    """Load a vehicle-manual PDF and offer simple page and keyword lookups.

    Pages are read with ``PyPDFLoader`` and kept in ``self.documents``; the
    class treats each loaded document as one page, numbered from 1.
    """

    # Message raised by the lookup methods while ``self.documents`` is empty.
    # NOTE(review): the non-ASCII literals in this class look mis-encoded
    # (mojibake); they are reproduced unchanged -- confirm the file encoding.
    _NOT_LOADED_MESSAGE = "๋จผ์ load_pdf()๋ฅผ ์คํํด์ฃผ์ธ์."

    def __init__(self, file_path: str):
        """Remember the PDF location and verify that it points to a file.

        Args:
            file_path: Path to the PDF manual on the local filesystem.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist or is not a
                regular file (for example, a directory).
        """
        self.file_path = file_path
        self.documents: List["Document"] = []

        # isfile() instead of exists(): a directory passes exists() even
        # though it can never be a PDF.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค: {file_path}")

    def load_pdf(self) -> List["Document"]:
        """Load the PDF, print a short preview, and return the loaded pages.

        Returns:
            The list produced by ``PyPDFLoader(...).load()``; the same list is
            stored on ``self.documents``.
        """
        print(f"PDF ๋ก๋ฉ ์ค: {self.file_path}")

        self.documents = PyPDFLoader(self.file_path).load()

        print(f"์ด {len(self.documents)} ํ์ด์ง ๋ก๋ ์๋ฃ")

        self._preview_documents()
        return self.documents

    def _preview_documents(self, num_pages: int = 2):
        """Print metadata and the first 500 characters of the leading pages.

        Args:
            num_pages: How many pages from the start of ``self.documents`` to
                show.
        """
        separator = "-" * 50

        print("\n ๋ฌธ์ ๋ฏธ๋ฆฌ ๋ณด๊ธฐ: ")
        print(separator)

        for page_number, doc in enumerate(self.documents[:num_pages], start=1):
            print(f"\n[ํ์ด์ง {page_number}]")
            print(f"๋ฉํ๋ฐ์ดํฐ: {doc.metadata}")

            content_preview = doc.page_content[:500]
            print(f"๋ด์ฉ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {content_preview} ...")
            print(separator)

    def _require_loaded(self) -> None:
        """Raise ``ValueError`` when ``self.documents`` is empty."""
        if not self.documents:
            raise ValueError(self._NOT_LOADED_MESSAGE)

    def get_page(self, page_num: int) -> "Document":
        """Return a single page by its 1-based page number.

        Args:
            page_num: Page number, where 1 is the first loaded page.

        Returns:
            The document stored at position ``page_num - 1``.

        Raises:
            ValueError: If no pages are loaded, or ``page_num`` is outside
                ``1..len(self.documents)``.
        """
        self._require_loaded()

        page_count = len(self.documents)
        if not 1 <= page_num <= page_count:
            raise ValueError(f"ํ์ด์ง ๋ฒํธ๋ 1~{page_count} ์ฌ์ด์ฌ์ผ ํฉ๋๋ค")

        return self.documents[page_num - 1]

    def search_keyword(self, keyword: str) -> List[tuple]:
        """Find the sentences that contain ``keyword`` (case-insensitive).

        Page text is split on ``"."`` and every fragment containing the
        keyword is reported.

        Args:
            keyword: Text to look for. An empty or whitespace-only keyword
                yields no results.

        Returns:
            A list of ``(page_number, sentence)`` tuples in page order, with
            1-based page numbers and whitespace-stripped sentences.

        Raises:
            ValueError: If no pages are loaded.
        """
        self._require_loaded()

        # A blank keyword is a substring of every sentence; treat it as
        # "nothing to search for" instead of returning the whole manual.
        if not keyword.strip():
            return []

        # Lower-case the keyword once rather than per page and per sentence.
        needle = keyword.lower()
        results = []

        for page_number, doc in enumerate(self.documents, start=1):
            text = doc.page_content
            # Cheap page-level check so pages without a hit are not split.
            if needle not in text.lower():
                continue
            for sentence in text.split("."):
                if needle in sentence.lower():
                    results.append((page_number, sentence.strip()))

        return results
|
|
|
|
|
def main() -> None:
    """Load a manual PDF, print page statistics, and run a sample search.

    The PDF path is taken from the first command-line argument when one is
    given; otherwise the built-in default path below is used.
    """
    default_pdf_path = r"C:\Users\Admin\Desktop\vehicle-manual-rag\data\LX3_2026_ko_KR.pdf"
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path

    try:
        loader = VehicleManualLoader(pdf_path)
        documents = loader.load_pdf()

        print("\n ๋ฌธ์ ํต๊ณ:")
        page_count = len(documents)
        print(f"- ์ด ํ์ด์ง ์: {page_count}")

        total_chars = sum(len(doc.page_content) for doc in documents)
        print(f" - ์ด ๋ฌธ์ ์: {total_chars}")

        # Guard the average: a PDF that yields no pages would otherwise
        # raise ZeroDivisionError here.
        average_chars = total_chars // page_count if page_count else 0
        print(f" - ํ๊ท ํ์ด์ง๋น ๋ฌธ์: {average_chars:,}")

        print("\n '์์ง ์ค์ผ' ํค์๋ ๊ฒ์ ๊ฒฐ๊ณผ: ")
        # search_keyword() raises ValueError when nothing is loaded, so skip
        # the search for an empty document list.
        # NOTE(review): the keyword literal looks mis-encoded (mojibake) and
        # may therefore never match the PDF text -- confirm the file encoding.
        results = loader.search_keyword("์์ง ์ค์ผ") if documents else []

        # Show at most three hits, each truncated to 100 characters.
        for page_num, sentence in results[:3]:
            print(f" - {page_num}ํ์ด์ง: {sentence[:100]}")

    except FileNotFoundError as e:
        print(f"์๋ฌ: {e}")
        print("PDF ํ์ผ์ 'data' ํด๋์ ๋ฃ์ด์ฃผ์ธ์")


if __name__ == "__main__":
    main()