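"""Build and persist retrieval artifacts for an A/B-testing RAG app.

Loads preprocessed document chunks from all_chunks_95percentile.json,
builds a BM25 retriever, pre-computes dense embeddings with a fine-tuned
Arctic embedding model, and saves everything (chunks.pkl, bm25_retriever.pkl,
embedding_info.json, embedded_docs.pkl) so the app can load it quickly at
startup.
"""
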
import os
import json
import shutil
import pickle
import time
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories"""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
    
    # Wait a moment to ensure OS releases the directory handles
    time.sleep(1)
    
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

def load_preprocessed_chunks():
    """Load the preprocessed chunks from JSON file"""
    print("Loading preprocessed chunks...")
    
    # Path to the saved JSON file
    chunks_file = "all_chunks_95percentile.json"
    if not os.path.exists(chunks_file):
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")
    
    with open(chunks_file, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
    
    # Convert to Document objects
    documents = [
        Document(
            page_content=chunk['page_content'], 
            metadata=chunk['metadata']
        ) 
        for chunk in chunks_data
    ]
    
    print(f"Loaded {len(documents)} preprocessed chunks.")
    return documents

def create_vectorstore_and_retrievers(documents):
    """Create the Qdrant vectorstore and BM25 retriever from the given chunks."""
    
    try:
        # Initialize embedding model
        print("Loading embedding model...")
        embedding_model = HuggingFaceEmbeddings(
            model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )
        
        # Create Qdrant vectorstore
        print("Creating Qdrant vectorstore...")
        qdrant_vectorstore = Qdrant.from_documents(
            documents,
            embedding_model,
            location=":memory:",
            collection_name="kohavi_ab_testing_pdf_collection",
        )
        
        # Create BM25 retriever (BM25Retriever builds its own BM25 index internally)
        print("Creating BM25 retriever...")
        texts = [doc.page_content for doc in documents]
        bm25_retriever = BM25Retriever.from_texts(
            texts, metadatas=[doc.metadata for doc in documents]
        )
        bm25_retriever.k = 10  # Return the top 10 results
        
        print(f"Successfully created vectorstore with {len(documents)} documents")
        print(f"BM25 retriever created with {len(texts)} texts")
        
        return qdrant_vectorstore, bm25_retriever, embedding_model
        
    except Exception as e:
        print(f"Error creating vectorstore and retrievers: {e}")
        raise
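
# The retriever imports above (EnsembleRetriever, ContextualCompressionRetriever,
# CohereRerank) are unused in this script; the sketch below shows one plausible
# way a downstream app could combine the pieces returned above. The weights,
# rerank model, and top_n are illustrative assumptions, not values from this repo.
def build_hybrid_retriever(qdrant_vectorstore, bm25_retriever):
    """Sketch: hybrid (BM25 + dense) retrieval with Cohere reranking."""
    dense_retriever = qdrant_vectorstore.as_retriever(search_kwargs={"k": 10})
    # Blend sparse and dense results; equal weighting is an assumption
    ensemble = EnsembleRetriever(
        retrievers=[bm25_retriever, dense_retriever],
        weights=[0.5, 0.5],
    )
    # Rerank the blended candidates (requires COHERE_API_KEY in the environment)
    compressor = CohereRerank(model="rerank-english-v3.0", top_n=5)
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=ensemble,
    )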

def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents):
    """Save all processed data files needed for the app"""
    print("Saving processed data...")
    
    # Create processed data directory
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)
    
    # Save documents as chunks
    print("Saving document chunks...")
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(documents, f)
    
    # Save BM25 retriever
    print("Saving BM25 retriever...")
    with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f:
        pickle.dump(bm25_retriever, f)
    
    # Save the embedding model name (the app reinitializes the model itself)
    print("Saving embedding model info...")
    embedding_info = {"model_name": embedding_model.model_name}
    with open(processed_data_dir / "embedding_info.json", "w") as f:
        json.dump(embedding_info, f)
    
    # Save the raw text and metadata so the app can rebuild the Qdrant store.
    # The in-memory Qdrant collection cannot easily be serialized, so the app
    # re-embeds these texts at startup instead.
    print("Saving Qdrant vector data...")
    vectors_data = [
        {"text": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ]
    
    with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f:
        json.dump(vectors_data, f, ensure_ascii=False, indent=2)
    
    print("All processed data saved successfully!")
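
# Sketch (an assumption about the downstream app, not code from this repo):
# how the app could rebuild the in-memory Qdrant store from vector_data.json
# by re-embedding the saved texts at startup.
def rebuild_qdrant_from_vector_data(path="data/processed_data/vector_data.json"):
    with open(path, "r", encoding="utf-8") as f:
        vectors_data = json.load(f)
    docs = [
        Document(page_content=item["text"], metadata=item["metadata"])
        for item in vectors_data
    ]
    # Reuse the saved model name rather than hardcoding it again
    with open(Path(path).parent / "embedding_info.json", "r") as f:
        model_name = json.load(f)["model_name"]
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return Qdrant.from_documents(
        docs,
        embedding_model,
        location=":memory:",
        collection_name="kohavi_ab_testing_pdf_collection",
    )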

def create_processed_data():
    """Create all processed data files needed for the RAG system"""
    
    # Ensure the processed_data directory exists
    processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data")
    processed_data_dir.mkdir(parents=True, exist_ok=True)
    
    # Load the preprocessed chunks exported from the Jupyter notebook
    documents = load_preprocessed_chunks()
    
    # Save documents as pickle
    chunks_path = processed_data_dir / "chunks.pkl"
    with open(chunks_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"Saved chunks to {chunks_path}")
    
    # Create BM25 retriever (BM25Retriever builds its own BM25 index internally)
    print("Creating BM25 retriever...")
    texts = [doc.page_content for doc in documents]
    bm25_retriever = BM25Retriever.from_texts(
        texts, metadatas=[doc.metadata for doc in documents]
    )
    
    # Save BM25 retriever
    bm25_path = processed_data_dir / "bm25_retriever.pkl"
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_retriever, f)
    print(f"Saved BM25 retriever to {bm25_path}")
    
    # Initialize embedding model
    print("Initializing embedding model...")
    model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    
    # Save embedding model info
    embedding_info = {"model_name": model_name}
    embedding_info_path = processed_data_dir / "embedding_info.json"
    with open(embedding_info_path, "w") as f:
        json.dump(embedding_info, f)
    print(f"Saved embedding info to {embedding_info_path}")
    
    # Pre-compute embeddings for all documents
    print("Pre-computing embeddings (this may take a while)...")
    embedded_docs = []
    
    # Process in batches to avoid memory issues
    batch_size = 50
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i+batch_size]
        
        # Extract text
        texts = [doc.page_content for doc in batch]
        
        # Get embeddings
        embeddings = embedding_model.embed_documents(texts)
        
        # Store with metadata
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j]
            })
        
        # Print progress
        print(f"Embedded {min(i+batch_size, len(documents))}/{len(documents)} chunks")
    
    # Save the embedded docs for fast loading
    embedded_docs_path = processed_data_dir / "embedded_docs.pkl"
    with open(embedded_docs_path, "wb") as f:
        pickle.dump(embedded_docs, f)
    print(f"Saved embedded docs to {embedded_docs_path}")
    
    print(f"Processing complete! All files saved to {processed_data_dir}")
    print("Files created:")
    print(f"  - chunks.pkl ({len(documents)} documents)")
    print("  - bm25_retriever.pkl")
    print("  - embedding_info.json")
    print(f"  - embedded_docs.pkl ({len(embedded_docs)} embedded documents)")
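
# How the app might consume the files written above (a minimal loading
# sketch; the actual app code is not part of this script):
def load_processed_data(processed_data_dir="AB_AI_RAG_Agent/data/processed_data"):
    processed_data_dir = Path(processed_data_dir)
    with open(processed_data_dir / "chunks.pkl", "rb") as f:
        documents = pickle.load(f)
    with open(processed_data_dir / "bm25_retriever.pkl", "rb") as f:
        bm25_retriever = pickle.load(f)
    with open(processed_data_dir / "embedding_info.json", "r") as f:
        embedding_info = json.load(f)
    with open(processed_data_dir / "embedded_docs.pkl", "rb") as f:
        embedded_docs = pickle.load(f)
    return documents, bm25_retriever, embedding_info, embedded_docs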

if __name__ == "__main__":
    create_processed_data()