"""process_data.py - build and persist the processed data artifacts for the AB_AI_v3 Streamlit app."""
import os
import json
import shutil
import pickle
import time
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from dotenv import load_dotenv

# Load environment variables (e.g. API keys) from a local .env file
load_dotenv()

def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories."""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Wait a moment to ensure the OS releases the directory handles
        time.sleep(1)
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

def load_preprocessed_chunks():
    """Load the preprocessed chunks from the JSON file."""
    print("Loading preprocessed chunks...")
    # Path to the saved JSON file
    chunks_file = "all_chunks_95percentile.json"
    if not os.path.exists(chunks_file):
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")
    with open(chunks_file, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
    # Convert each raw chunk into a LangChain Document
    documents = [
        Document(
            page_content=chunk["page_content"],
            metadata=chunk["metadata"],
        )
        for chunk in chunks_data
    ]
    print(f"Loaded {len(documents)} preprocessed chunks.")
    return documents
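
# The loader above assumes all_chunks_95percentile.json is a list of objects
# shaped like the following (inferred from the keys it reads; the concrete
# metadata fields are illustrative, not guaranteed by this script):
#
#   [
#       {
#           "page_content": "Chunk text...",
#           "metadata": {"source": "kohavi_ab_testing.pdf", "page": 12}
#       },
#       ...
#   ]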

def create_vectorstore_and_retrievers(documents):
    """Create the vectorstore and retrievers using the latest chunking strategy."""
    try:
        # Initialize the embedding model
        print("Loading embedding model...")
        embedding_model = HuggingFaceEmbeddings(
            model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )

        # Create an in-memory Qdrant vectorstore
        print("Creating Qdrant vectorstore...")
        qdrant_vectorstore = Qdrant.from_documents(
            documents,
            embedding_model,
            location=":memory:",
            collection_name="kohavi_ab_testing_pdf_collection",
        )

        # Create the BM25 retriever (BM25Retriever tokenizes the texts internally)
        print("Creating BM25 retriever...")
        texts = [doc.page_content for doc in documents]
        bm25_retriever = BM25Retriever.from_texts(
            texts, metadatas=[doc.metadata for doc in documents]
        )
        bm25_retriever.k = 10  # Return the top 10 results

        print(f"Successfully created vectorstore with {len(documents)} documents")
        print(f"BM25 retriever created with {len(texts)} texts")
        return qdrant_vectorstore, bm25_retriever, embedding_model
    except Exception as e:
        print(f"Error creating vectorstore and retrievers: {e}")
        raise
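
# The EnsembleRetriever, ContextualCompressionRetriever, and CohereRerank
# imports above suggest a hybrid retrieval pipeline on the app side. Below is
# a minimal sketch of how the two retrievers built here could be combined;
# the function name, ensemble weights, k values, and Cohere model name are
# illustrative assumptions, and CohereRerank expects COHERE_API_KEY in the
# environment.
def build_hybrid_retriever_sketch(qdrant_vectorstore, bm25_retriever):
    # Blend sparse (BM25) and dense (Qdrant) retrieval with equal weights
    ensemble = EnsembleRetriever(
        retrievers=[
            bm25_retriever,
            qdrant_vectorstore.as_retriever(search_kwargs={"k": 10}),
        ],
        weights=[0.5, 0.5],
    )
    # Rerank the blended candidates with Cohere before returning them
    reranker = CohereRerank(model="rerank-english-v3.0", top_n=5)
    return ContextualCompressionRetriever(
        base_compressor=reranker, base_retriever=ensemble
    )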

def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents):
    """Save all processed data files needed for the app."""
    print("Saving processed data...")

    # Create the processed data directory
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Save the documents as chunks
    print("Saving document chunks...")
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(documents, f)

    # Save the BM25 retriever
    print("Saving BM25 retriever...")
    with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f:
        pickle.dump(bm25_retriever, f)

    # Save the embedding model info (the app reinitializes the model from it)
    print("Saving embedding model info...")
    embedding_info = {
        "model_name": "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    }
    with open(processed_data_dir / "embedding_info.json", "w") as f:
        json.dump(embedding_info, f)

    # Save the raw text and metadata for Qdrant. The in-memory Qdrant store
    # cannot easily be serialized, so the app re-embeds these in its own process.
    print("Saving Qdrant vector data...")
    vectors_data = [
        {"text": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ]
    with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f:
        json.dump(vectors_data, f, ensure_ascii=False, indent=2)

    print("All processed data saved successfully!")

def create_processed_data():
    """Create all processed data files needed for the RAG system."""
    # Ensure the processed_data directory exists
    processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data")
    processed_data_dir.mkdir(parents=True, exist_ok=True)

    # Load the improved chunks produced by the Jupyter notebook
    chunks_source_path = Path("all_chunks_95percentile.json")
    if not chunks_source_path.exists():
        raise FileNotFoundError(f"Source chunks file not found: {chunks_source_path}")

    print("Loading improved chunks from the Jupyter notebook...")
    with open(chunks_source_path, "r", encoding="utf-8") as f:
        chunk_data = json.load(f)

    # Convert the raw chunks to LangChain Document objects
    documents = [
        Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
        for chunk in chunk_data
    ]
    print(f"Loaded {len(documents)} chunks")

    # Save the documents as a pickle
    chunks_path = processed_data_dir / "chunks.pkl"
    with open(chunks_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"Saved chunks to {chunks_path}")
    # Create the BM25 retriever (BM25Retriever tokenizes the texts internally)
    print("Creating BM25 retriever...")
    texts = [doc.page_content for doc in documents]
    bm25_retriever = BM25Retriever.from_texts(
        texts, metadatas=[doc.metadata for doc in documents]
    )

    # Save the BM25 retriever
    bm25_path = processed_data_dir / "bm25_retriever.pkl"
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_retriever, f)
    print(f"Saved BM25 retriever to {bm25_path}")
    # Initialize the embedding model
    print("Initializing embedding model...")
    model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Save the embedding model info so the app can reinitialize the same model
    embedding_info = {"model_name": model_name}
    embedding_info_path = processed_data_dir / "embedding_info.json"
    with open(embedding_info_path, "w") as f:
        json.dump(embedding_info, f)
    print(f"Saved embedding info to {embedding_info_path}")
    # Pre-compute embeddings for all documents
    print("Pre-computing embeddings (this may take a while)...")
    embedded_docs = []
    # Process in batches to avoid memory issues
    batch_size = 50
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        batch_texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.embed_documents(batch_texts)
        # Store each embedding alongside its text and metadata
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j],
            })
        # Print progress
        print(f"Embedded {min(i + batch_size, len(documents))}/{len(documents)} chunks")

    # Save the embedded docs for fast loading in the app
    embedded_docs_path = processed_data_dir / "embedded_docs.pkl"
    with open(embedded_docs_path, "wb") as f:
        pickle.dump(embedded_docs, f)
    print(f"Saved embedded docs to {embedded_docs_path}")

    print(f"Processing complete! All files saved to {processed_data_dir}")
    print("Files created:")
    print(f"  - chunks.pkl ({len(documents)} documents)")
    print("  - bm25_retriever.pkl")
    print("  - embedding_info.json")
    print(f"  - embedded_docs.pkl ({len(embedded_docs)} embedded documents)")

if __name__ == "__main__":
    create_processed_data()