# knowledge_base.py
"""Build and reload a Chroma vector store from a folder of PDF files.

The pipeline is: extract text from each PDF with PyMuPDF, split the text
into overlapping chunks, embed the chunks with a HuggingFace
sentence-transformers model, and store them in a Chroma collection kept
under ``CHROMA_DIR``.
"""
import os

import fitz  # PyMuPDF
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Directory passed to Chroma as ``persist_directory``.
CHROMA_DIR = "chroma"
# Embedding model name; the same name is used for building and for loading
# so that stored vectors and query vectors come from the same model.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def _extract_pdf_text(path: str) -> str:
    """Return the text of every page in the PDF at *path*, joined by newlines."""
    # The context manager closes the document even if text extraction raises;
    # previously the handle was left open for every file processed.
    with fitz.open(path) as pdf:
        return "\n".join(page.get_text() for page in pdf)


def load_and_chunk_pdfs(folder_path: str, chunk_size: int = 1000,
                        chunk_overlap: int = 200) -> list:
    """Read all PDFs directly inside *folder_path* and split them into chunks.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; subdirectories are not walked.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to the same splitter.

    Returns:
        The list produced by ``splitter.split_documents``. Each source
        document carries ``{"source": <filename>}`` as metadata.

    Raises:
        OSError: If *folder_path* cannot be listed (propagated from
            ``os.listdir``).
    """
    documents = []
    # Sorted so the resulting chunk order does not depend on the arbitrary
    # order in which the filesystem returns directory entries.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so "REPORT.PDF" is picked up as well.
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)
        # Skip directories (or other non-files) that merely have a .pdf name.
        if not os.path.isfile(path):
            continue
        documents.append(
            Document(
                page_content=_extract_pdf_text(path),
                metadata={"source": filename},
            )
        )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


def create_vectorstore(chunks: list) -> Chroma:
    """Embed *chunks* into a new Chroma store under ``CHROMA_DIR``.

    Args:
        chunks: Documents to index, e.g. the output of
            ``load_and_chunk_pdfs``.

    Returns:
        The ``Chroma`` instance returned by ``Chroma.from_documents``, after
        ``persist()`` has been called on it.
    """
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DIR)
    # NOTE(review): newer Chroma/LangChain releases reportedly persist
    # automatically and deprecate this call - confirm against the pinned version.
    db.persist()
    return db


def load_vectorstore() -> Chroma:
    """Open the Chroma store located at ``CHROMA_DIR``.

    Returns:
        A ``Chroma`` instance bound to ``CHROMA_DIR`` and using the
        ``MODEL_NAME`` embedding function.
    """
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    return Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)