import logging
import os
import sys

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Build a FAISS vector index from the PDF files found in ``data_dir``.
#
# Pipeline: load PDFs -> split into overlapping chunks -> initialise the
# embedding model -> build the FAISS index -> save it to ``save_path``.
# Each stage logs its outcome. On failure the traceback is logged and the
# process exits with status 1 so that shells and CI jobs can detect the error
# (a bare ``exit()`` would report success, and the ``exit`` helper is only
# installed by the ``site`` module, so it is not always available).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

data_dir = "data"
save_path = "ipc_vector_db"

# isdir rather than exists: a regular file named "data" must be rejected too.
if not os.path.isdir(data_dir):
    logging.error(f"Directory '{data_dir}' does not exist. Please create it and add PDF files.")
    sys.exit(1)

# --- 1. Load every *.pdf directly inside data_dir (the glob is not recursive).
try:
    loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
except Exception as e:
    logging.exception("Error loading documents: %s", e)
    sys.exit(1)
else:
    logging.info(f"Loaded {len(documents)} documents from the '{data_dir}' directory.")

# Fail early with a clear message instead of letting the indexing step fail
# later on an empty input, after the embedding model has already been loaded.
if not documents:
    logging.error(f"No PDF documents were found in '{data_dir}'. Add at least one PDF file and retry.")
    sys.exit(1)

# --- 2. Split the documents into chunks of up to 1024 characters, with a
# 200-character overlap between neighbouring chunks.
try:
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
except Exception as e:
    logging.exception("Error splitting documents: %s", e)
    sys.exit(1)
else:
    logging.info(f"Split the documents into {len(texts)} text chunks.")

# NOTE(review): presumably this happens when the PDFs have no extractable text
# layer (e.g. scanned images) - confirm against the actual input files.
if not texts:
    logging.error("No text chunks were produced from the loaded documents; nothing to index.")
    sys.exit(1)

# --- 3. Initialise the embedding model.
# trust_remote_code=True allows code shipped with the model repository to run;
# "revision" pins the model to one specific commit of that repository.
try:
    embeddings = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1",
        model_kwargs={"trust_remote_code": True, "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7"},
    )
except Exception as e:
    logging.exception("Error initializing embeddings model: %s", e)
    sys.exit(1)
else:
    logging.info("Initialized HuggingFace embeddings model successfully.")

# --- 4. Embed all chunks and build the FAISS index.
try:
    faiss_db = FAISS.from_documents(texts, embeddings)
except Exception as e:
    logging.exception("Error creating FAISS vector database: %s", e)
    sys.exit(1)
else:
    logging.info("Created FAISS vector database from text embeddings.")

# --- 5. Persist the index to disk.
try:
    faiss_db.save_local(save_path)
except Exception as e:
    logging.exception("Error saving FAISS vector database: %s", e)
    sys.exit(1)
else:
    logging.info(f"FAISS vector database has been saved locally at '{save_path}'.")