Spaces:

hugging2021
/

open-webui-rag-system

Runtime error

App Files Files Community

open-webui-rag-system / concat_vector_store_정리된.py

hugging2021

Update concat_vector_store_정리된.py

1d98a7e verified 7 days ago

raw

history blame contribute delete

2.05 kB

	import os
	import glob
	from langchain.schema.document import Document
	from e5_embeddings import E5Embeddings
	from langchain_community.vectorstores import FAISS
	from document_processor import load_pdf_with_pymupdf, split_documents

	# Path configuration
	# FOLDER is the directory passed to embed_cleaned_pdfs by the script entry
	# point; that function looks for files matching "cleaned*.pdf" inside it.
	FOLDER = "cleaned_pdfs" # Folder containing the cleaned PDFs
	# VECTOR_STORE_PATH is both the default location load_vector_store reads from
	# and the location embed_cleaned_pdfs saves to, so the store is updated in place.
	VECTOR_STORE_PATH = "vector_db"

	# 1. Load the embedding model
	def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
	    """Build the E5Embeddings object used to embed documents.

	    Args:
	        model_name: Model identifier forwarded to ``E5Embeddings``.
	        device: Device string placed in ``model_kwargs``. When a CUDA device
	            is requested but PyTorch reports that CUDA is unavailable, the
	            device is replaced by ``"cpu"`` so the script still runs on
	            CPU-only hosts.

	    Returns:
	        An ``E5Embeddings`` instance created with
	        ``encode_kwargs={"normalize_embeddings": True}``.
	    """
	    # NOTE(review): the default model is a MiniLM checkpoint while the wrapper
	    # is named E5Embeddings -- confirm this pairing is intended.
	    if isinstance(device, str) and device.startswith("cuda"):
	        try:
	            import torch  # local import: this file does not import torch itself
	        except ImportError:
	            torch = None  # cannot probe CUDA; pass the requested device through
	        if torch is not None and not torch.cuda.is_available():
	            print(f"CUDA is not available; falling back to CPU (requested: {device})")
	            device = "cpu"

	    return E5Embeddings(
	        model_name=model_name,
	        model_kwargs={"device": device},
	        encode_kwargs={"normalize_embeddings": True},
	    )

	# 2. Load existing vector store
	def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
	    """Open the FAISS vector store saved at ``load_path``.

	    Args:
	        embeddings: Embedding object handed to ``FAISS.load_local``.
	        load_path: Location of the saved store; defaults to VECTOR_STORE_PATH.

	    Returns:
	        The object returned by ``FAISS.load_local``.

	    Raises:
	        FileNotFoundError: If ``load_path`` does not exist.
	    """
	    if os.path.exists(load_path):
	        # NOTE(security): allow_dangerous_deserialization=True opts in to a
	        # load path the library itself labels dangerous; only point this at
	        # store files you created or otherwise trust.
	        return FAISS.load_local(
	            load_path,
	            embeddings,
	            allow_dangerous_deserialization=True,
	        )
	    raise FileNotFoundError(f"Cannot find vector store: {load_path}")

	# 3. Embed only the cleaned PDFs
	def embed_cleaned_pdfs(folder, vectorstore, embeddings):
	    """Embed every ``cleaned*.pdf`` in ``folder`` and append it to ``vectorstore``.

	    Each PDF with non-blank text becomes one Document whose ``source``
	    metadata is the file path. The documents are split into chunks
	    (chunk_size=300, chunk_overlap=50), the chunks are added to the store,
	    and the store is saved to VECTOR_STORE_PATH. When there is nothing to
	    add, the store is neither modified nor re-saved.

	    Args:
	        folder: Directory searched (non-recursively) for ``cleaned*.pdf``.
	        vectorstore: Store providing ``index.ntotal``, ``add_documents`` and
	            ``save_local``.
	        embeddings: Not used by this function; kept so existing callers
	            keep working.

	    Returns:
	        None.
	    """
	    pattern = os.path.join(folder, "cleaned*.pdf")
	    # glob returns files in filesystem-dependent order; sort so repeated runs
	    # ingest documents in the same order.
	    pdf_files = sorted(glob.glob(pattern))
	    print(f"Number of target PDFs: {len(pdf_files)}")

	    new_documents = []
	    for pdf_path in pdf_files:
	        print(f"Processing: {pdf_path}")
	        text = load_pdf_with_pymupdf(pdf_path)
	        # Skip PDFs whose extracted text is missing, empty, or only whitespace.
	        if text and text.strip():
	            new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))

	    print(f"Number of documents: {len(new_documents)}")
	    if not new_documents:
	        # Nothing was loaded: leave the store untouched instead of passing an
	        # empty batch to the splitter and the index.
	        print("No documents to add; vector store left unchanged.")
	        return

	    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
	    print(f"Number of chunks: {len(chunks)}")
	    if not chunks:
	        print("No chunks to add; vector store left unchanged.")
	        return

	    print(f"Vector count before addition: {vectorstore.index.ntotal}")
	    vectorstore.add_documents(chunks)
	    print(f"Vector count after addition: {vectorstore.index.ntotal}")

	    # NOTE(review): this always saves to the module-level VECTOR_STORE_PATH,
	    # even if the store was loaded from a different path -- confirm intended.
	    vectorstore.save_local(VECTOR_STORE_PATH)
	    print(f"Save completed: {VECTOR_STORE_PATH}")

	# Execution
	# Script entry point: build the embedding model, open the existing store with
	# it, then add the cleaned PDFs found under FOLDER to that store.
	if __name__ == "__main__":
	    model = get_embeddings()
	    embed_cleaned_pdfs(FOLDER, load_vector_store(model), model)