# palisade/src/embeddings.py
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from typing import List, Tuple
import os
from tqdm import tqdm
import time


class VehicleManualEmbeddings:
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        print(f"Loading embedding model: {model_name}")
        print("(the first run may take a while because the model has to be downloaded)")
        # HuggingFaceEmbeddings integrates cleanly with LangChain
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},  # works without a GPU
            encode_kwargs={'normalize_embeddings': True}  # unit-length vectors for cosine similarity
        )
        self.vector_store = None
        self.index_path = "data/faiss_index"  # where the index is persisted
        print("Embedding model loaded")

    def create_vector_store(self, chunks: List[Document], save: bool = True) -> FAISS:
        if not chunks:
            raise ValueError("No chunks to index")
        print(f"Embedding {len(chunks)} chunks...")
        print("(roughly 2-5 minutes for ~6,000 chunks on CPU)")
        start_time = time.time()
        # batch the work so progress is visible and memory use stays bounded
        batch_size = 100
        self.vector_store = FAISS.from_documents(
            documents=chunks[:batch_size],  # initialize the index with the first batch
            embedding=self.embeddings
        )
        # add the remaining batches
        for i in tqdm(range(batch_size, len(chunks), batch_size), desc="Embedding"):
            self.vector_store.add_documents(chunks[i:i + batch_size])
        elapsed_time = time.time() - start_time
        print(f"Embedding finished (took {elapsed_time:.1f}s)")
        # persist the index
        if save:
            self.save_index()
        # report index statistics
        self._print_statistics()
        return self.vector_store
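
    # An alternative construction (a sketch, not used in this module): embed
    # every chunk in one call and build the store from precomputed vectors via
    # LangChain's FAISS.from_embeddings(). `chunks` and `emb` are assumed here.
    #
    #   texts = [c.page_content for c in chunks]
    #   vectors = emb.embeddings.embed_documents(texts)
    #   store = FAISS.from_embeddings(
    #       text_embeddings=list(zip(texts, vectors)),
    #       embedding=emb.embeddings,
    #       metadatas=[c.metadata for c in chunks],
    #   )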

    def save_index(self):
        if not self.vector_store:
            raise ValueError("Run create_vector_store() first")
        print(f"Saving index to: {self.index_path}")
        # make sure the target directory exists
        os.makedirs(os.path.dirname(self.index_path), exist_ok=True)
        # persist the FAISS index
        self.vector_store.save_local(self.index_path)
        print("Index saved")

    def load_index(self) -> FAISS:
        if not os.path.exists(self.index_path):
            raise FileNotFoundError(f"Index not found: {self.index_path}")
        print(f"Loading index from: {self.index_path}")
        self.vector_store = FAISS.load_local(
            self.index_path,
            self.embeddings,
            allow_dangerous_deserialization=True  # acceptable: we created this file ourselves
        )
        print("Index loaded")
        return self.vector_store

    def similarity_search(self, query: str, k: int = 3) -> List[Document]:
        if not self.vector_store:
            raise ValueError("Run create_vector_store() or load_index() first")
        # run the vector search
        return self.vector_store.similarity_search(query, k=k)

    def similarity_search_with_score(self, query: str, k: int = 3) -> List[Tuple[Document, float]]:
        if not self.vector_store:
            raise ValueError("Run create_vector_store() or load_index() first")
        return self.vector_store.similarity_search_with_score(query, k=k)
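
    # Note on scores (assuming the default index built by from_documents
    # above): the LangChain FAISS wrapper uses a flat L2 index, so the score
    # returned by similarity_search_with_score is a distance and LOWER means
    # more similar. FAISS reports the squared Euclidean distance, and for
    # unit-normalized vectors that relates to cosine similarity by
    # cos = 1 - score / 2, so a cosine score can be recovered:
    #
    #   def distance_to_cosine(score: float) -> float:
    #       return 1.0 - score / 2.0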

    def _print_statistics(self):
        if not self.vector_store:
            return
        # report basic FAISS index info
        print("\nVector store statistics:")
        print(f"  - vectors stored: {self.vector_store.index.ntotal:,}")
        print(f"  - vector dimension: {self.vector_store.index.d}")
        print(f"  - index type: {type(self.vector_store.index).__name__}")


# Smoke test
if __name__ == "__main__":
    from document_loader import VehicleManualLoader
    from text_splitter import VehicleManualTextSplitter

    # resolve paths relative to the project root
    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(current_dir)
    pdf_path = os.path.join(project_root, "data", "LX3_2026_ko_KR.pdf")
    index_path = os.path.join(project_root, "data", "faiss_index")

    print("=" * 60)
    print("Vehicle manual embedding and vector search test")
    print("=" * 60)

    # initialize the embedding system
    embedder = VehicleManualEmbeddings()
    embedder.index_path = index_path  # keep the class in sync with the absolute path checked below

    # load an existing index if present, otherwise build one from scratch
    if os.path.exists(index_path):
        print("\nFound an existing index, loading it...")
        vector_store = embedder.load_index()
    else:
        print("\nBuilding a new index...")
        # 1. load the PDF
        print("\n1. Loading PDF...")
        loader = VehicleManualLoader(pdf_path)
        documents = loader.load_pdf()
        # 2. split the text
        print("\n2. Splitting text...")
        splitter = VehicleManualTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = splitter.split_documents(documents)
        # 3. embed and build the index
        print("\n3. Embedding...")
        vector_store = embedder.create_vector_store(chunks, save=True)

    # 4. search test
    print("\n4. Search test")
    print("-" * 50)
    # queries stay in Korean: the manual itself is Korean and the model is multilingual
    test_queries = [
        "์—”์ง„ ์˜ค์ผ ๊ต์ฒด ์ฃผ๊ธฐ๋Š”?",              # engine oil change interval?
        "ํƒ€์ด์–ด ๊ณต๊ธฐ์••์€ ์–ผ๋งˆ๊ฐ€ ์ ์ •ํ•œ๊ฐ€์š”?",  # what is the proper tire pressure?
        "์™€์ดํผ ๊ต์ฒด ๋ฐฉ๋ฒ•",                    # how to replace the wipers
        "๊ฒฝ๊ณ ๋“ฑ์ด ์ผœ์กŒ์„ ๋•Œ ๋Œ€์ฒ˜๋ฒ•",           # what to do when a warning light comes on
        "๋ธŒ๋ ˆ์ดํฌ ํŒจ๋“œ ์ ๊ฒ€"                   # brake pad inspection
    ]
    for query in test_queries[:3]:  # test only the first three
        print(f"\nQuery: {query}")
        # search with similarity scores attached
        results = embedder.similarity_search_with_score(query, k=2)
        for i, (doc, score) in enumerate(results):
            print(f"\n  [{i + 1}] distance: {score:.3f} (lower = more similar)")
            print(f"      page: {doc.metadata.get('page', 'N/A')}")
            print(f"      section: {doc.metadata.get('section', 'N/A')}")
            print(f"      content: {doc.page_content[:150]}...")

    print("\n" + "=" * 60)
    print("All tests finished!")
    print("=" * 60)