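"""Build and persist retrieval artifacts for an A/B-testing RAG app.

Loads preprocessed document chunks from all_chunks_95percentile.json,
builds a BM25 retriever, pre-computes dense embeddings with a fine-tuned
Arctic embedding model, and saves everything (chunks.pkl, bm25_retriever.pkl,
embedding_info.json, embedded_docs.pkl) so the app can load it quickly at
startup.
"""
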
import os
import json
import shutil
import pickle
import time
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories"""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
    
    # Wait a moment to ensure OS releases the directory handles
    time.sleep(1)
    
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

def load_preprocessed_chunks():
    """Load the preprocessed chunks from JSON file"""
    print("Loading preprocessed chunks...")
    
    # Path to the saved JSON file
    chunks_file = "all_chunks_95percentile.json"
    if not os.path.exists(chunks_file):
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")
    
    with open(chunks_file, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
    
    # Convert to Document objects
    documents = [
        Document(
            page_content=chunk['page_content'], 
            metadata=chunk['metadata']
        ) 
        for chunk in chunks_data
    ]
    
    print(f"Loaded {len(documents)} preprocessed chunks.")
    return documents

def create_vectorstore_and_retrievers(documents):
    """Create the Qdrant vectorstore and BM25 retriever from the given chunks."""
    
    try:
        # Initialize embedding model
        print("Loading embedding model...")
        embedding_model = HuggingFaceEmbeddings(
            model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )
        
        # Create Qdrant vectorstore
        print("Creating Qdrant vectorstore...")
        qdrant_vectorstore = Qdrant.from_documents(
            documents,
            embedding_model,
            location=":memory:",
            collection_name="kohavi_ab_testing_pdf_collection",
        )
        
        # Create BM25 retriever (BM25Retriever builds its own BM25 index internally)
        print("Creating BM25 retriever...")
        texts = [doc.page_content for doc in documents]
        bm25_retriever = BM25Retriever.from_texts(
            texts, metadatas=[doc.metadata for doc in documents]
        )
        bm25_retriever.k = 10  # Return the top 10 results
        
        print(f"Successfully created vectorstore with {len(documents)} documents")
        print(f"BM25 retriever created with {len(texts)} texts")
        
        return qdrant_vectorstore, bm25_retriever, embedding_model
        
    except Exception as e:
        print(f"Error creating vectorstore and retrievers: {e}")
        raise
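
# The retriever imports above (EnsembleRetriever, ContextualCompressionRetriever,
# CohereRerank) are unused in this script; the sketch below shows one plausible
# way a downstream app could combine the pieces returned above. The weights,
# rerank model, and top_n are illustrative assumptions, not values from this repo.
def build_hybrid_retriever(qdrant_vectorstore, bm25_retriever):
    """Sketch: hybrid (BM25 + dense) retrieval with Cohere reranking."""
    dense_retriever = qdrant_vectorstore.as_retriever(search_kwargs={"k": 10})
    # Blend sparse and dense results; equal weighting is an assumption
    ensemble = EnsembleRetriever(
        retrievers=[bm25_retriever, dense_retriever],
        weights=[0.5, 0.5],
    )
    # Rerank the blended candidates (requires COHERE_API_KEY in the environment)
    compressor = CohereRerank(model="rerank-english-v3.0", top_n=5)
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=ensemble,
    )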

def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents):
    """Save all processed data files needed for the app"""
    print("Saving processed data...")
    
    # Create processed data directory
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)
    
    # Save documents as chunks
    print("Saving document chunks...")
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(documents, f)
    
    # Save BM25 retriever
    print("Saving BM25 retriever...")
    with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f:
        pickle.dump(bm25_retriever, f)
    
    # Save the embedding model name (the app reinitializes the model itself)
    print("Saving embedding model info...")
    embedding_info = {"model_name": embedding_model.model_name}
    with open(processed_data_dir / "embedding_info.json", "w") as f:
        json.dump(embedding_info, f)
    
    # Save the raw text and metadata so the app can rebuild the Qdrant store.
    # The in-memory Qdrant collection cannot easily be serialized, so the app
    # re-embeds these texts at startup instead.
    print("Saving Qdrant vector data...")
    vectors_data = [
        {"text": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ]
    
    with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f:
        json.dump(vectors_data, f, ensure_ascii=False, indent=2)
    
    print("All processed data saved successfully!")
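
# Sketch (an assumption about the downstream app, not code from this repo):
# how the app could rebuild the in-memory Qdrant store from vector_data.json
# by re-embedding the saved texts at startup.
def rebuild_qdrant_from_vector_data(path="data/processed_data/vector_data.json"):
    with open(path, "r", encoding="utf-8") as f:
        vectors_data = json.load(f)
    docs = [
        Document(page_content=item["text"], metadata=item["metadata"])
        for item in vectors_data
    ]
    # Reuse the saved model name rather than hardcoding it again
    with open(Path(path).parent / "embedding_info.json", "r") as f:
        model_name = json.load(f)["model_name"]
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return Qdrant.from_documents(
        docs,
        embedding_model,
        location=":memory:",
        collection_name="kohavi_ab_testing_pdf_collection",
    )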

def create_processed_data():
    """Create all processed data files needed for the RAG system"""
    
    # Ensure the processed_data directory exists
    processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data")
    processed_data_dir.mkdir(parents=True, exist_ok=True)
    
    # Load the preprocessed chunks exported from the Jupyter notebook
    documents = load_preprocessed_chunks()
    
    # Save documents as pickle
    chunks_path = processed_data_dir / "chunks.pkl"
    with open(chunks_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"Saved chunks to {chunks_path}")
    
    # Create BM25 retriever (BM25Retriever builds its own BM25 index internally)
    print("Creating BM25 retriever...")
    texts = [doc.page_content for doc in documents]
    bm25_retriever = BM25Retriever.from_texts(
        texts, metadatas=[doc.metadata for doc in documents]
    )
    
    # Save BM25 retriever
    bm25_path = processed_data_dir / "bm25_retriever.pkl"
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_retriever, f)
    print(f"Saved BM25 retriever to {bm25_path}")
    
    # Initialize embedding model
    print("Initializing embedding model...")
    model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    
    # Save embedding model info
    embedding_info = {"model_name": model_name}
    embedding_info_path = processed_data_dir / "embedding_info.json"
    with open(embedding_info_path, "w") as f:
        json.dump(embedding_info, f)
    print(f"Saved embedding info to {embedding_info_path}")
    
    # Pre-compute embeddings for all documents
    print("Pre-computing embeddings (this may take a while)...")
    embedded_docs = []
    
    # Process in batches to avoid memory issues
    batch_size = 50
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i+batch_size]
        
        # Extract text
        texts = [doc.page_content for doc in batch]
        
        # Get embeddings
        embeddings = embedding_model.embed_documents(texts)
        
        # Store with metadata
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j]
            })
        
        # Print progress
        print(f"Embedded {min(i+batch_size, len(documents))}/{len(documents)} chunks")
    
    # Save the embedded docs for fast loading
    embedded_docs_path = processed_data_dir / "embedded_docs.pkl"
    with open(embedded_docs_path, "wb") as f:
        pickle.dump(embedded_docs, f)
    print(f"Saved embedded docs to {embedded_docs_path}")
    
    print(f"Processing complete! All files saved to {processed_data_dir}")
    print("Files created:")
    print(f"  - chunks.pkl ({len(documents)} documents)")
    print("  - bm25_retriever.pkl")
    print("  - embedding_info.json")
    print(f"  - embedded_docs.pkl ({len(embedded_docs)} embedded documents)")
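
# How the app might consume the files written above (a minimal loading
# sketch; the actual app code is not part of this script):
def load_processed_data(processed_data_dir="AB_AI_RAG_Agent/data/processed_data"):
    processed_data_dir = Path(processed_data_dir)
    with open(processed_data_dir / "chunks.pkl", "rb") as f:
        documents = pickle.load(f)
    with open(processed_data_dir / "bm25_retriever.pkl", "rb") as f:
        bm25_retriever = pickle.load(f)
    with open(processed_data_dir / "embedding_info.json", "r") as f:
        embedding_info = json.load(f)
    with open(processed_data_dir / "embedded_docs.pkl", "rb") as f:
        embedded_docs = pickle.load(f)
    return documents, bm25_retriever, embedding_info, embedded_docs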

if __name__ == "__main__":
    create_processed_data()