Spaces:
Sleeping
Sleeping
# knowledge_base.py
import os | |
import fitz # PyMuPDF | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.docstore.document import Document | |
# Directory handed to Chroma as ``persist_directory`` when building or
# reopening the vector store.
CHROMA_DIR = "chroma"
# Model name handed to ``HuggingFaceEmbeddings`` for computing embeddings.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def load_and_chunk_pdfs(folder_path):
    """Extract text from every PDF in *folder_path* and split it into chunks.

    Each PDF becomes one ``Document`` whose ``page_content`` is the text of
    all its pages joined with newlines and whose ``metadata["source"]`` is
    the file name. The documents are then split with
    ``RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)``.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; sub-directories are not walked.

    Returns:
        The list of chunks produced by the splitter, or an empty list when
        the folder contains no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, it does not exist).
    """
    documents = []
    # Sort so the chunk order does not depend on filesystem listing order.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as "REPORT.PDF" are kept.
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)
        # The context manager closes the PyMuPDF handle once the text has
        # been extracted, instead of leaking one open file per PDF.
        with fitz.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        documents.append(Document(page_content=text, metadata={"source": filename}))

    if not documents:
        # Nothing to split, so skip building the splitter.
        return []

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)
def create_vectorstore(chunks):
    """Build a Chroma store from *chunks*, persist it, and return it.

    Embeddings come from ``HuggingFaceEmbeddings`` configured with
    ``MODEL_NAME``; the store uses ``CHROMA_DIR`` as its persist directory.

    Args:
        chunks: Documents to index, e.g. the output of
            ``load_and_chunk_pdfs``.

    Returns:
        The ``Chroma`` instance created by ``Chroma.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    store = Chroma.from_documents(
        chunks,
        embedder,
        persist_directory=CHROMA_DIR,
    )
    # Explicit persist call, kept from the original implementation.
    store.persist()
    return store
def load_vectorstore():
    """Return a ``Chroma`` store bound to the ``CHROMA_DIR`` persist directory.

    The store is given a ``HuggingFaceEmbeddings`` instance configured with
    ``MODEL_NAME`` as its embedding function, the same model name that
    ``create_vectorstore`` uses.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    store = Chroma(embedding_function=embedder, persist_directory=CHROMA_DIR)
    return store