"""process_data.py - build and persist the processed data artifacts for the AB_AI_v3 Streamlit app."""
import os
import json
import shutil
import pickle
import time
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from dotenv import load_dotenv

# Load environment variables (e.g. API keys) from a local .env file
load_dotenv()

def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories."""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Wait a moment to ensure the OS releases the directory handles
        time.sleep(1)
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

def load_preprocessed_chunks():
    """Load the preprocessed chunks from the JSON file."""
    print("Loading preprocessed chunks...")
    # Path to the saved JSON file
    chunks_file = "all_chunks_95percentile.json"
    if not os.path.exists(chunks_file):
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")
    with open(chunks_file, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
    # Convert each raw chunk into a LangChain Document
    documents = [
        Document(
            page_content=chunk["page_content"],
            metadata=chunk["metadata"],
        )
        for chunk in chunks_data
    ]
    print(f"Loaded {len(documents)} preprocessed chunks.")
    return documents
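
# The loader above assumes all_chunks_95percentile.json is a list of objects
# shaped like the following (inferred from the keys it reads; the concrete
# metadata fields are illustrative, not guaranteed by this script):
#
#   [
#       {
#           "page_content": "Chunk text...",
#           "metadata": {"source": "kohavi_ab_testing.pdf", "page": 12}
#       },
#       ...
#   ]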

def create_vectorstore_and_retrievers(documents):
    """Create the vectorstore and retrievers using the latest chunking strategy."""
    try:
        # Initialize the embedding model
        print("Loading embedding model...")
        embedding_model = HuggingFaceEmbeddings(
            model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )

        # Create an in-memory Qdrant vectorstore
        print("Creating Qdrant vectorstore...")
        qdrant_vectorstore = Qdrant.from_documents(
            documents,
            embedding_model,
            location=":memory:",
            collection_name="kohavi_ab_testing_pdf_collection",
        )

        # Create the BM25 retriever (BM25Retriever tokenizes the texts internally)
        print("Creating BM25 retriever...")
        texts = [doc.page_content for doc in documents]
        bm25_retriever = BM25Retriever.from_texts(
            texts, metadatas=[doc.metadata for doc in documents]
        )
        bm25_retriever.k = 10  # Return the top 10 results

        print(f"Successfully created vectorstore with {len(documents)} documents")
        print(f"BM25 retriever created with {len(texts)} texts")
        return qdrant_vectorstore, bm25_retriever, embedding_model
    except Exception as e:
        print(f"Error creating vectorstore and retrievers: {e}")
        raise
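
# The EnsembleRetriever, ContextualCompressionRetriever, and CohereRerank
# imports above suggest a hybrid retrieval pipeline on the app side. Below is
# a minimal sketch of how the two retrievers built here could be combined;
# the function name, ensemble weights, k values, and Cohere model name are
# illustrative assumptions, and CohereRerank expects COHERE_API_KEY in the
# environment.
def build_hybrid_retriever_sketch(qdrant_vectorstore, bm25_retriever):
    # Blend sparse (BM25) and dense (Qdrant) retrieval with equal weights
    ensemble = EnsembleRetriever(
        retrievers=[
            bm25_retriever,
            qdrant_vectorstore.as_retriever(search_kwargs={"k": 10}),
        ],
        weights=[0.5, 0.5],
    )
    # Rerank the blended candidates with Cohere before returning them
    reranker = CohereRerank(model="rerank-english-v3.0", top_n=5)
    return ContextualCompressionRetriever(
        base_compressor=reranker, base_retriever=ensemble
    )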

def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents):
    """Save all processed data files needed for the app."""
    print("Saving processed data...")

    # Create the processed data directory
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Save the documents as chunks
    print("Saving document chunks...")
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(documents, f)

    # Save the BM25 retriever
    print("Saving BM25 retriever...")
    with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f:
        pickle.dump(bm25_retriever, f)

    # Save the embedding model info (the app reinitializes the model from it)
    print("Saving embedding model info...")
    embedding_info = {
        "model_name": "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    }
    with open(processed_data_dir / "embedding_info.json", "w") as f:
        json.dump(embedding_info, f)

    # Save the raw text and metadata for Qdrant. The in-memory Qdrant store
    # cannot easily be serialized, so the app re-embeds these in its own process.
    print("Saving Qdrant vector data...")
    vectors_data = [
        {"text": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ]
    with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f:
        json.dump(vectors_data, f, ensure_ascii=False, indent=2)

    print("All processed data saved successfully!")

def create_processed_data():
    """Create all processed data files needed for the RAG system."""
    # Ensure the processed_data directory exists
    processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data")
    processed_data_dir.mkdir(parents=True, exist_ok=True)

    # Load the improved chunks produced by the Jupyter notebook
    chunks_source_path = Path("all_chunks_95percentile.json")
    if not chunks_source_path.exists():
        raise FileNotFoundError(f"Source chunks file not found: {chunks_source_path}")

    print("Loading improved chunks from the Jupyter notebook...")
    with open(chunks_source_path, "r", encoding="utf-8") as f:
        chunk_data = json.load(f)

    # Convert the raw chunks to LangChain Document objects
    documents = [
        Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
        for chunk in chunk_data
    ]
    print(f"Loaded {len(documents)} chunks")

    # Save the documents as a pickle
    chunks_path = processed_data_dir / "chunks.pkl"
    with open(chunks_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"Saved chunks to {chunks_path}")
    # Create the BM25 retriever (BM25Retriever tokenizes the texts internally)
    print("Creating BM25 retriever...")
    texts = [doc.page_content for doc in documents]
    bm25_retriever = BM25Retriever.from_texts(
        texts, metadatas=[doc.metadata for doc in documents]
    )

    # Save the BM25 retriever
    bm25_path = processed_data_dir / "bm25_retriever.pkl"
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_retriever, f)
    print(f"Saved BM25 retriever to {bm25_path}")
    # Initialize the embedding model
    print("Initializing embedding model...")
    model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Save the embedding model info so the app can reinitialize the same model
    embedding_info = {"model_name": model_name}
    embedding_info_path = processed_data_dir / "embedding_info.json"
    with open(embedding_info_path, "w") as f:
        json.dump(embedding_info, f)
    print(f"Saved embedding info to {embedding_info_path}")
    # Pre-compute embeddings for all documents
    print("Pre-computing embeddings (this may take a while)...")
    embedded_docs = []
    # Process in batches to avoid memory issues
    batch_size = 50
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        batch_texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.embed_documents(batch_texts)
        # Store each embedding alongside its text and metadata
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j],
            })
        # Print progress
        print(f"Embedded {min(i + batch_size, len(documents))}/{len(documents)} chunks")

    # Save the embedded docs for fast loading in the app
    embedded_docs_path = processed_data_dir / "embedded_docs.pkl"
    with open(embedded_docs_path, "wb") as f:
        pickle.dump(embedded_docs, f)
    print(f"Saved embedded docs to {embedded_docs_path}")

    print(f"Processing complete! All files saved to {processed_data_dir}")
    print("Files created:")
    print(f"  - chunks.pkl ({len(documents)} documents)")
    print("  - bm25_retriever.pkl")
    print("  - embedding_info.json")
    print(f"  - embedded_docs.pkl ({len(embedded_docs)} embedded documents)")

if __name__ == "__main__":
    create_processed_data()