# eat2fit / knowledge_base.py
# Committer: DurgaDeepak
# Last commit: "Rename agent.py to knowledge_base.py" (3e6f99d, verified)
# File size: 1.25 kB
# knowledge_base.py
import os
import fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
# Directory passed to Chroma as `persist_directory`, both when the store is
# created and when it is loaded again.
CHROMA_DIR = "chroma"
# Model name passed to HuggingFaceEmbeddings; the same value is used for
# building the store and for loading it.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def load_and_chunk_pdfs(folder_path):
    """Extract the text of every PDF in a folder and split it into chunks.

    Args:
        folder_path: Directory to scan (non-recursively) for entries whose
            names end in ``.pdf``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        (``chunk_size=1000``, ``chunk_overlap=200``) applied to one
        ``Document`` per PDF. Each per-PDF document is created with
        ``{"source": <filename>}`` metadata.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    documents = []
    # Sort so the chunk order does not depend on the arbitrary order in
    # which os.listdir returns directory entries.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)
        # The context manager closes the PDF handle even if text extraction
        # raises; previously the opened document was never closed.
        with fitz.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        documents.append(Document(page_content=text, metadata={"source": filename}))

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)
def create_vectorstore(chunks):
    """Build a Chroma store from ``chunks`` under CHROMA_DIR and persist it.

    Args:
        chunks: Documents to embed with the MODEL_NAME HuggingFace model.

    Returns:
        The Chroma instance created by ``Chroma.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=CHROMA_DIR,
    )
    # Ask Chroma to persist the collection before handing the store back.
    store.persist()
    return store
def load_vectorstore():
    """Open the Chroma store at CHROMA_DIR with the MODEL_NAME embeddings.

    Returns:
        A Chroma instance constructed with ``persist_directory=CHROMA_DIR``.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    store = Chroma(embedding_function=embedder, persist_directory=CHROMA_DIR)
    return store