import os import json import shutil import pickle import time from pathlib import Path from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_core.documents import Document from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import Qdrant from langchain_community.retrievers import BM25Retriever from rank_bm25 import BM25Okapi import numpy as np from langchain.retrievers import EnsembleRetriever from langchain.retrievers.contextual_compression import ContextualCompressionRetriever from langchain_cohere import CohereRerank from dotenv import load_dotenv # Load environment variables load_dotenv() def clean_directory(directory_path): """Clean a directory by removing all files and subdirectories""" path = Path(directory_path) if path.exists(): print(f"Cleaning directory: {directory_path}") shutil.rmtree(path) # Wait a moment to ensure OS releases the directory handles time.sleep(1) path.mkdir(parents=True, exist_ok=True) print(f"Created clean directory: {directory_path}") def load_preprocessed_chunks(): """Load the preprocessed chunks from JSON file""" print("Loading preprocessed chunks...") # Path to the saved JSON file chunks_file = "all_chunks_95percentile.json" if not os.path.exists(chunks_file): raise FileNotFoundError(f"Chunks file not found: {chunks_file}") with open(chunks_file, "r", encoding="utf-8") as f: chunks_data = json.load(f) # Convert to Document objects documents = [ Document( page_content=chunk['page_content'], metadata=chunk['metadata'] ) for chunk in chunks_data ] print(f"Loaded {len(documents)} preprocessed chunks.") return documents def create_vectorstore_and_retrievers(documents): """Create vectorstore and retrievers using the latest chunking strategy.""" try: # Initialize embedding model print("Loading embedding model...") embedding_model = HuggingFaceEmbeddings( model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec" ) # Create Qdrant vectorstore print("Creating Qdrant vectorstore...") qdrant_vectorstore = Qdrant.from_documents( documents, embedding_model, location=":memory:", collection_name="kohavi_ab_testing_pdf_collection", ) # Create BM25 retriever print("Creating BM25 retriever...") texts = [doc.page_content for doc in documents] tokenized_corpus = [text.split() for text in texts] bm25 = BM25Okapi(tokenized_corpus) bm25_retriever = BM25Retriever.from_texts(texts, metadatas=[doc.metadata for doc in documents]) bm25_retriever.k = 10 # Set top-k results print(f"Successfully created vectorstore with {len(documents)} documents") print(f"BM25 retriever created with {len(texts)} texts") return qdrant_vectorstore, bm25_retriever, embedding_model except Exception as e: print(f"Error creating vectorstore and retrievers: {e}") raise def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents): """Save all processed data files needed for the app""" print("Saving processed data...") # Create processed data directory processed_data_dir = Path("data/processed_data") clean_directory(processed_data_dir) # Save documents as chunks print("Saving document chunks...") with open(processed_data_dir / "chunks.pkl", "wb") as f: pickle.dump(documents, f) # Save BM25 retriever print("Saving BM25 retriever...") with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f: pickle.dump(bm25_retriever, f) # Save embedding model info (we'll reinitialize it in the app) print("Saving embedding model info...") embedding_info = { "model_name": "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec" } with open(processed_data_dir / "embedding_info.json", "w") as f: json.dump(embedding_info, f) # Save vector data for Qdrant - we need to extract vectors and metadata print("Saving Qdrant vector data...") # Get all vectors and their metadata from Qdrant vectors_data = [] for doc in documents: # We'll need to re-embed in the app since we can't easily serialize Qdrant's in-memory store vectors_data.append({ "text": doc.page_content, "metadata": doc.metadata }) with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f: json.dump(vectors_data, f, ensure_ascii=False, indent=2) print("All processed data saved successfully!") def create_processed_data(): """Create all processed data files needed for the RAG system""" # Ensure the processed_data directory exists processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data") processed_data_dir.mkdir(parents=True, exist_ok=True) # Load the improved chunks from the Jupyter notebook chunks_source_path = Path("all_chunks_95percentile.json") if not chunks_source_path.exists(): raise FileNotFoundError(f"Source chunks file not found: {chunks_source_path}") print("Loading improved chunks from Jupyter notebook...") with open(chunks_source_path, 'r') as f: chunk_data = json.load(f) # Convert to Document objects documents = [] for chunk in chunk_data: doc = Document( page_content=chunk['page_content'], metadata=chunk['metadata'] ) documents.append(doc) print(f"Loaded {len(documents)} chunks") # Save documents as pickle chunks_path = processed_data_dir / "chunks.pkl" with open(chunks_path, "wb") as f: pickle.dump(documents, f) print(f"Saved chunks to {chunks_path}") # Create BM25 retriever print("Creating BM25 retriever...") texts = [doc.page_content for doc in documents] tokenized_texts = [text.split() for text in texts] bm25 = BM25Okapi(tokenized_texts) # Create BM25 retriever object from langchain_community.retrievers import BM25Retriever bm25_retriever = BM25Retriever.from_texts(texts, metadatas=[doc.metadata for doc in documents]) # Save BM25 retriever bm25_path = processed_data_dir / "bm25_retriever.pkl" with open(bm25_path, "wb") as f: pickle.dump(bm25_retriever, f) print(f"Saved BM25 retriever to {bm25_path}") # Initialize embedding model print("Initializing embedding model...") model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec" embedding_model = HuggingFaceEmbeddings(model_name=model_name) # Save embedding model info embedding_info = {"model_name": model_name} embedding_info_path = processed_data_dir / "embedding_info.json" with open(embedding_info_path, "w") as f: json.dump(embedding_info, f) print(f"Saved embedding info to {embedding_info_path}") # Pre-compute embeddings for all documents print("Pre-computing embeddings (this may take a while)...") embedded_docs = [] # Process in batches to avoid memory issues batch_size = 50 for i in range(0, len(documents), batch_size): batch = documents[i:i+batch_size] # Extract text texts = [doc.page_content for doc in batch] # Get embeddings embeddings = embedding_model.embed_documents(texts) # Store with metadata for j, doc in enumerate(batch): embedded_docs.append({ "id": i + j, "text": doc.page_content, "metadata": doc.metadata, "embedding": embeddings[j] }) # Print progress print(f"Embedded {min(i+batch_size, len(documents))}/{len(documents)} chunks") # Save the embedded docs for fast loading embedded_docs_path = processed_data_dir / "embedded_docs.pkl" with open(embedded_docs_path, "wb") as f: pickle.dump(embedded_docs, f) print(f"Saved embedded docs to {embedded_docs_path}") print(f"Processing complete! All files saved to {processed_data_dir}") print(f"Files created:") print(f" - chunks.pkl ({len(documents)} documents)") print(f" - bm25_retriever.pkl") print(f" - embedding_info.json") print(f" - embedded_docs.pkl ({len(embedded_docs)} embedded documents)") if __name__ == "__main__": create_processed_data()