from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from typing import List, Tuple
import os
import time

from tqdm import tqdm


class VehicleManualEmbeddings:
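    """Build, save, load, and query a FAISS vector store of vehicle-manual chunks.

    Wraps a multilingual sentence-transformers model via LangChain's
    HuggingFaceEmbeddings so the index can be created once, persisted to disk,
    and reloaded for later searches.
    """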

    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        print(f"Loading embedding model: {model_name}")
        print("(The first run may take a while because the model has to be downloaded)")

        # Normalized embeddings make FAISS's L2 distance track cosine similarity.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        self.vector_store = None
        self.index_path = "data/faiss_index"

        print("Embedding model loaded")

    def create_vector_store(self, chunks: List[Document], save: bool = True) -> FAISS:
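        """Embed the chunks in batches and build a FAISS index over them.

        The first batch seeds the index via FAISS.from_documents; the rest are
        appended with add_documents so tqdm can report progress on large manuals.
        """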
        print(f"Converting {len(chunks)} chunks to vectors...")
        print("(roughly 2-5 minutes for ~6,000 chunks)")

        start_time = time.time()

        # Embed in batches so memory use stays bounded and progress is visible.
        batch_size = 100

        # Seed the index with the first batch...
        self.vector_store = FAISS.from_documents(
            documents=chunks[:batch_size],
            embedding=self.embeddings
        )

        # ...then append the remaining batches.
        for i in tqdm(range(batch_size, len(chunks), batch_size), desc="Vectorizing"):
            batch_chunks = chunks[i:i + batch_size]
            if batch_chunks:
                self.vector_store.add_documents(batch_chunks)

        elapsed_time = time.time() - start_time
        print(f"Vectorization complete! (elapsed: {elapsed_time:.1f}s)")

        if save:
            self.save_index()

        self._print_statistics()

        return self.vector_store

    def save_index(self):
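        """Persist the FAISS index (vectors plus document store) to self.index_path."""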
        if not self.vector_store:
            raise ValueError("Run create_vector_store() first")

        print(f"Saving index to: {self.index_path}")

        os.makedirs(os.path.dirname(self.index_path), exist_ok=True)

        self.vector_store.save_local(self.index_path)

        print("Index saved")

    def load_index(self) -> FAISS:
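        """Load a previously saved FAISS index from self.index_path."""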
        if not os.path.exists(self.index_path):
            raise FileNotFoundError(f"Index not found: {self.index_path}")

        print(f"Loading index from: {self.index_path}")
        self.vector_store = FAISS.load_local(
            self.index_path,
            self.embeddings,
            # Needed because FAISS metadata is pickled; only load indexes you created.
            allow_dangerous_deserialization=True
        )
        print("Index loaded")

        return self.vector_store

    def similarity_search(self, query: str, k: int = 3) -> List[Document]:
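        """Return the k chunks whose embeddings are closest to the query."""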
        if not self.vector_store:
            raise ValueError("Run create_vector_store() or load_index() first")

        results = self.vector_store.similarity_search(query, k=k)

        return results

    def similarity_search_with_score(self, query: str, k: int = 3) -> List[Tuple[Document, float]]:
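        """Like similarity_search, but also return each chunk's score.

        With LangChain's default FAISS setup the score is an L2 distance,
        so lower values mean a closer match.
        """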
        if not self.vector_store:
            raise ValueError("Run create_vector_store() or load_index() first")

        results = self.vector_store.similarity_search_with_score(query, k=k)

        return results

    def _print_statistics(self):
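        """Print basic statistics about the underlying FAISS index."""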
        if not self.vector_store:
            return

        print("\nVector store statistics:")
        print(f"  - Stored vectors: {self.vector_store.index.ntotal:,}")
        print(f"  - Vector dimension: {self.vector_store.index.d}")
        print(f"  - Index type: {type(self.vector_store.index).__name__}")


if __name__ == "__main__":
    from document_loader import VehicleManualLoader
    from text_splitter import VehicleManualTextSplitter

    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(current_dir)
    pdf_path = os.path.join(project_root, "data", "LX3_2026_ko_KR.pdf")
    index_path = os.path.join(project_root, "data", "faiss_index")

    print("=" * 60)
    print("Vehicle manual embedding and vector search test")
    print("=" * 60)

    embedder = VehicleManualEmbeddings()
    # Point the embedder at the absolute path computed above; otherwise the
    # existence check below and save/load would use different locations.
    embedder.index_path = index_path

    if os.path.exists(index_path):
        print("\nExisting index found! Loading it...")
        vector_store = embedder.load_index()
    else:
        print("\nCreating a new index...")

        print("\n1. Loading PDF...")
        loader = VehicleManualLoader(pdf_path)
        documents = loader.load_pdf()

        print("\n2. Splitting text...")
        splitter = VehicleManualTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = splitter.split_documents(documents)

        print("\n3. Starting vectorization...")
        vector_store = embedder.create_vector_store(chunks, save=True)

    print("\n4. Search test")
    print("-" * 50)

    test_queries = [
        "What is the engine oil change interval?",
        "What is the correct tire pressure?",
        "How do I replace the wiper blades?",
        "What should I do when a warning light comes on?",
        "Brake pad inspection",
    ]

    for query in test_queries[:3]:
        print(f"\nQuery: {query}")

        results = embedder.similarity_search_with_score(query, k=2)

        for i, (doc, score) in enumerate(results):
            # score is an L2 distance, so smaller values indicate closer matches.
            print(f"\n  [{i + 1}] distance: {score:.3f} (lower = more similar)")
            print(f"  Page: {doc.metadata.get('page', 'N/A')}")
            print(f"  Section: {doc.metadata.get('section', 'N/A')}")
            print(f"  Content: {doc.page_content[:150]}...")

    print("\n" + "=" * 60)
    print("All tests complete!")
    print("=" * 60)