import os
import argparse
import logging
import re
import time
from collections import defaultdict

from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# PyMuPDF library
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
    print("✅ PyMuPDF library available")
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
# --------------------------------
# Log Output
# --------------------------------
def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")
# --------------------------------
# Text Cleaning Functions
# --------------------------------
def clean_text(text):
    # Keep Hangul (syllables, jamo, compatibility jamo), word characters,
    # whitespace, and common punctuation; drop everything else.
    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)

def apply_corrections(text):
    # Map frequently observed extraction artifacts and mojibake sequences to
    # their intended text. Order matters: longer sequences are replaced
    # before their substrings (e.g. 'º©' before '©', '’' before 'â€').
    corrections = {
        'º©': 'info', 'Ì': 'of', '½': 'operation', 'Ã': '', '©': '',
        '’': "'", '“': '"', 'â€': '"'
    }
    for k, v in corrections.items():
        text = text.replace(k, v)
    return text
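# A quick illustration using the mappings above (the input is a hypothetical
# garbled string of the kind produced by bad PDF extraction):
#
#   apply_corrections("user º© Ì ½")  ->  "user info of operation"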
# --------------------------------
# Load the embedding model
# --------------------------------
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )
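# A minimal sketch of using the embeddings on their own (the torch
# availability check is an addition here, not part of the script; the
# 384-dimension output size is specific to all-MiniLM-L6-v2):
#
#   import torch
#   embeddings = get_embeddings(device="cuda" if torch.cuda.is_available() else "cpu")
#   vector = embeddings.embed_query("example query")  # list of 384 floats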
def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=16):
    if not documents:
        raise ValueError("No documents found. Check if documents are loaded correctly.")
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    # Split into batches
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
    print(f"Processing {len(batches)} batches with size {batch_size}")
    print(f"Initializing vector store with batch 1/{len(batches)}")

    # Use from_documents instead of from_texts (to prevent length issues)
    first_docs = [
        Document(page_content=text, metadata=meta)
        for text, meta in zip(batches[0], metadata_batches[0])
    ]
    vectorstore = FAISS.from_documents(first_docs, embeddings)

    # Add remaining batches
    for i in tqdm(range(1, len(batches)), desc="Processing batches"):
        try:
            docs_batch = [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(batches[i], metadata_batches[i])
            ]
            vectorstore.add_documents(docs_batch)
            # Checkpoint every 10 batches so a crash does not lose all progress
            if i % 10 == 0:
                temp_save_path = f"{save_path}_temp"
                os.makedirs(os.path.dirname(temp_save_path) or '.', exist_ok=True)
                vectorstore.save_local(temp_save_path)
                print(f"Temporary vector store saved to {temp_save_path} after batch {i}")
        except Exception as e:
            # Persist whatever was indexed so far, then re-raise
            print(f"Error processing batch {i}: {e}")
            error_save_path = f"{save_path}_error_at_batch_{i}"
            os.makedirs(os.path.dirname(error_save_path) or '.', exist_ok=True)
            vectorstore.save_local(error_save_path)
            print(f"Partial vector store saved to {error_save_path}")
            raise

    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    vectorstore.save_local(save_path)
    print(f"Vector store saved to {save_path}")
    return vectorstore
def load_vector_store(embeddings, load_path="vector_db"):
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
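# A minimal query sketch, assuming a store built by this script already
# exists on disk; the query string is a placeholder, and similarity_search
# is the standard LangChain FAISS retrieval call:
#
#   embeddings = get_embeddings(device="cpu")
#   vectorstore = load_vector_store(embeddings, "vector_db")
#   for doc in vectorstore.similarity_search("What is the refund policy?", k=3):
#       print(doc.metadata.get("source"), doc.page_content[:100])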
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Builds a vector store")
    parser.add_argument("--folder", type=str, default="dataset", help="Path to the folder containing the documents")
    parser.add_argument("--save_path", type=str, default="vector_db", help="Path to save the vector store")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
    parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Name of the embedding model")
    parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use ('cuda' or 'cpu')")
    args = parser.parse_args()

    # Import the document processing module (local helper, not shown here)
    from document_processor import load_documents, split_documents

    # Load and split documents
    documents = load_documents(args.folder)
    chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)

    # Load the embedding model
    embeddings = get_embeddings(model_name=args.model_name, device=args.device)

    # Build the vector store
    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
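# Example invocation (the script filename and paths are illustrative; assumes
# a dataset/ folder of documents that document_processor can read):
#
#   python build_vector_store.py --folder dataset --save_path vector_db --device cpu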