import os
import argparse
import logging
import re
import time
from collections import defaultdict

from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# PyMuPDF library
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
    print("✅ PyMuPDF library available")
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
# --------------------------------
# Log Output
# --------------------------------
def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")
# --------------------------------
# Text Cleaning Functions
# --------------------------------
def clean_text(text):
    # Keep Hangul (syllables, jamo, compatibility jamo), word characters,
    # whitespace, and common punctuation; drop everything else.
    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)

def apply_corrections(text):
    # Map frequently observed extraction artifacts and mojibake sequences to
    # their intended text. Order matters: longer sequences are replaced
    # before their substrings (e.g. 'º©' before '©', '’' before 'â€').
    corrections = {
        'º©': 'info', 'Ì': 'of', '½': 'operation', 'Ã': '', '©': '',
        '’': "'", '“': '"', 'â€': '"'
    }
    for k, v in corrections.items():
        text = text.replace(k, v)
    return text
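# A quick illustration using the mappings above (the input is a hypothetical
# garbled string of the kind produced by bad PDF extraction):
#
#   apply_corrections("user º© Ì ½")  ->  "user info of operation"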
# --------------------------------
# Load the embedding model
# --------------------------------
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )
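# A minimal sketch of using the embeddings on their own (the torch
# availability check is an addition here, not part of the script; the
# 384-dimension output size is specific to all-MiniLM-L6-v2):
#
#   import torch
#   embeddings = get_embeddings(device="cuda" if torch.cuda.is_available() else "cpu")
#   vector = embeddings.embed_query("example query")  # list of 384 floats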
def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=16):
    if not documents:
        raise ValueError("No documents found. Check if documents are loaded correctly.")
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    # Split into batches
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
    print(f"Processing {len(batches)} batches with size {batch_size}")
    print(f"Initializing vector store with batch 1/{len(batches)}")

    # Use from_documents instead of from_texts (to prevent length issues)
    first_docs = [
        Document(page_content=text, metadata=meta)
        for text, meta in zip(batches[0], metadata_batches[0])
    ]
    vectorstore = FAISS.from_documents(first_docs, embeddings)

    # Add remaining batches
    for i in tqdm(range(1, len(batches)), desc="Processing batches"):
        try:
            docs_batch = [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(batches[i], metadata_batches[i])
            ]
            vectorstore.add_documents(docs_batch)
            # Checkpoint every 10 batches so a crash does not lose all progress
            if i % 10 == 0:
                temp_save_path = f"{save_path}_temp"
                os.makedirs(os.path.dirname(temp_save_path) or '.', exist_ok=True)
                vectorstore.save_local(temp_save_path)
                print(f"Temporary vector store saved to {temp_save_path} after batch {i}")
        except Exception as e:
            # Persist whatever was indexed so far, then re-raise
            print(f"Error processing batch {i}: {e}")
            error_save_path = f"{save_path}_error_at_batch_{i}"
            os.makedirs(os.path.dirname(error_save_path) or '.', exist_ok=True)
            vectorstore.save_local(error_save_path)
            print(f"Partial vector store saved to {error_save_path}")
            raise

    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    vectorstore.save_local(save_path)
    print(f"Vector store saved to {save_path}")
    return vectorstore
def load_vector_store(embeddings, load_path="vector_db"):
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
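# A minimal query sketch, assuming a store built by this script already
# exists on disk; the query string is a placeholder, and similarity_search
# is the standard LangChain FAISS retrieval call:
#
#   embeddings = get_embeddings(device="cpu")
#   vectorstore = load_vector_store(embeddings, "vector_db")
#   for doc in vectorstore.similarity_search("What is the refund policy?", k=3):
#       print(doc.metadata.get("source"), doc.page_content[:100])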
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Builds a vector store")
    parser.add_argument("--folder", type=str, default="dataset", help="Path to the folder containing the documents")
    parser.add_argument("--save_path", type=str, default="vector_db", help="Path to save the vector store")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
    parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Name of the embedding model")
    parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use ('cuda' or 'cpu')")
    args = parser.parse_args()

    # Import the document processing module (local helper, not shown here)
    from document_processor import load_documents, split_documents

    # Load and split documents
    documents = load_documents(args.folder)
    chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)

    # Load the embedding model
    embeddings = get_embeddings(model_name=args.model_name, device=args.device)

    # Build the vector store
    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
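# Example invocation (the script filename and paths are illustrative; assumes
# a dataset/ folder of documents that document_processor can read):
#
#   python build_vector_store.py --folder dataset --save_path vector_db --device cpu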