"""Build the processed data artifacts (chunks, BM25 retriever, pre-computed
embeddings) needed by the RAG app."""

import json
import os
import pickle
import shutil
import time
from pathlib import Path

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Qdrant

# Load environment variables (e.g. API keys) from a local .env file, if present.
load_dotenv()
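
# Artifacts written by this script (see save_processed_data / create_processed_data):
#   - chunks.pkl          : pickled list of langchain Document chunks
#   - bm25_retriever.pkl  : pickled BM25Retriever built over the chunk texts
#   - embedding_info.json : name of the HuggingFace embedding model used
#   - embedded_docs.pkl   : pre-computed embedding vectors for every chunk
#   - vector_data.json    : chunk text and metadata only (no embeddings)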


def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories"""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Brief pause so the OS finishes releasing the removed tree before it is recreated.
        time.sleep(1)

    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")


def load_preprocessed_chunks():
    """Load the preprocessed chunks from JSON file"""
    print("Loading preprocessed chunks...")

    chunks_file = "all_chunks_95percentile.json"
    if not os.path.exists(chunks_file):
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")

    with open(chunks_file, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    documents = [
        Document(
            page_content=chunk['page_content'],
            metadata=chunk['metadata']
        )
        for chunk in chunks_data
    ]

    print(f"Loaded {len(documents)} preprocessed chunks.")
    return documents
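
# For reference, all_chunks_95percentile.json is expected to be a JSON array of
# objects carrying the two fields read above; the metadata keys shown here are
# illustrative only:
# [
#   {"page_content": "Controlled experiments ...",
#    "metadata": {"source": "kohavi_ab_testing.pdf", "page": 12}},
#   ...
# ]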


def create_vectorstore_and_retrievers(documents):
    """Create vectorstore and retrievers using the latest chunking strategy."""
    try:
        print("Loading embedding model...")
        embedding_model = HuggingFaceEmbeddings(
            model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )

        print("Creating Qdrant vectorstore...")
        qdrant_vectorstore = Qdrant.from_documents(
            documents,
            embedding_model,
            location=":memory:",
            collection_name="kohavi_ab_testing_pdf_collection",
        )

        print("Creating BM25 retriever...")
        # BM25Retriever tokenizes and indexes the texts itself, so no separate
        # BM25Okapi index needs to be built here.
        texts = [doc.page_content for doc in documents]
        bm25_retriever = BM25Retriever.from_texts(
            texts, metadatas=[doc.metadata for doc in documents]
        )
        bm25_retriever.k = 10

        print(f"Successfully created vectorstore with {len(documents)} documents")
        print(f"BM25 retriever created with {len(texts)} texts")

        return qdrant_vectorstore, bm25_retriever, embedding_model

    except Exception as e:
        print(f"Error creating vectorstore and retrievers: {e}")
        raise
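
# Downstream, the dense and sparse retrievers returned above can be fused and
# reranked; exactly how the app wires this up is not shown here, so the snippet
# below is only an illustrative, untested sketch (weights, k and the Cohere
# model name are assumptions, and CohereRerank needs COHERE_API_KEY in the env):
#
#   from langchain.retrievers import EnsembleRetriever
#   from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
#   from langchain_cohere import CohereRerank
#
#   ensemble = EnsembleRetriever(
#       retrievers=[bm25_retriever, qdrant_vectorstore.as_retriever(search_kwargs={"k": 10})],
#       weights=[0.5, 0.5],
#   )
#   reranked = ContextualCompressionRetriever(
#       base_compressor=CohereRerank(model="rerank-english-v3.0"),
#       base_retriever=ensemble,
#   )
#   docs = reranked.invoke("What is an A/A test?")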


def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents):
    """Save all processed data files needed for the app"""
    print("Saving processed data...")

    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    print("Saving document chunks...")
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(documents, f)

    print("Saving BM25 retriever...")
    with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f:
        pickle.dump(bm25_retriever, f)

    print("Saving embedding model info...")
    embedding_info = {
        "model_name": "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    }
    with open(processed_data_dir / "embedding_info.json", "w") as f:
        json.dump(embedding_info, f)

    print("Saving Qdrant vector data...")
    # Only chunk text and metadata are serialized here; the in-memory Qdrant store
    # and the embedding model themselves are not written to disk.
    vectors_data = []
    for doc in documents:
        vectors_data.append({
            "text": doc.page_content,
            "metadata": doc.metadata
        })

    with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f:
        json.dump(vectors_data, f, ensure_ascii=False, indent=2)

    print("All processed data saved successfully!")
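
# The app can presumably restore these artifacts with plain pickle/json; an
# illustrative sketch (paths relative to wherever the app runs):
#
#   with open("data/processed_data/chunks.pkl", "rb") as f:
#       documents = pickle.load(f)
#   with open("data/processed_data/bm25_retriever.pkl", "rb") as f:
#       bm25_retriever = pickle.load(f)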


def create_processed_data():
    """Create all processed data files needed for the RAG system"""

    processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data")
    processed_data_dir.mkdir(parents=True, exist_ok=True)

    chunks_source_path = Path("all_chunks_95percentile.json")

    if not chunks_source_path.exists():
        raise FileNotFoundError(f"Source chunks file not found: {chunks_source_path}")

    print("Loading improved chunks from Jupyter notebook...")
    with open(chunks_source_path, "r", encoding="utf-8") as f:
        chunk_data = json.load(f)

    documents = []
    for chunk in chunk_data:
        doc = Document(
            page_content=chunk['page_content'],
            metadata=chunk['metadata']
        )
        documents.append(doc)

    print(f"Loaded {len(documents)} chunks")

    chunks_path = processed_data_dir / "chunks.pkl"
    with open(chunks_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"Saved chunks to {chunks_path}")

    print("Creating BM25 retriever...")
    # BM25Retriever (already imported at the top of the file) builds its own index
    # from the raw texts, so no manual tokenization step is needed.
    texts = [doc.page_content for doc in documents]
    bm25_retriever = BM25Retriever.from_texts(
        texts, metadatas=[doc.metadata for doc in documents]
    )

    bm25_path = processed_data_dir / "bm25_retriever.pkl"
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_retriever, f)
    print(f"Saved BM25 retriever to {bm25_path}")

    print("Initializing embedding model...")
    model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Record which model produced the embeddings so the app can load the same one.
    embedding_info = {"model_name": model_name}
    embedding_info_path = processed_data_dir / "embedding_info.json"
    with open(embedding_info_path, "w") as f:
        json.dump(embedding_info, f)
    print(f"Saved embedding info to {embedding_info_path}")

    print("Pre-computing embeddings (this may take a while)...")
    embedded_docs = []

    # Embed in batches to keep memory use bounded and to report progress.
    batch_size = 50
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        batch_texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.embed_documents(batch_texts)

        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j]
            })

        print(f"Embedded {min(i + batch_size, len(documents))}/{len(documents)} chunks")

    embedded_docs_path = processed_data_dir / "embedded_docs.pkl"
    with open(embedded_docs_path, "wb") as f:
        pickle.dump(embedded_docs, f)
    print(f"Saved embedded docs to {embedded_docs_path}")

    print(f"Processing complete! All files saved to {processed_data_dir}")
    print("Files created:")
    print(f" - chunks.pkl ({len(documents)} documents)")
    print(" - bm25_retriever.pkl")
    print(" - embedding_info.json")
    print(f" - embedded_docs.pkl ({len(embedded_docs)} embedded documents)")
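
# How embedded_docs.pkl is consumed is not shown in this script; one plausible,
# untested pattern is to upsert the pre-computed vectors into a Qdrant collection
# without re-embedding (the collection name is reused from above, everything else
# here is an assumption):
#
#   from qdrant_client import QdrantClient
#   from qdrant_client.models import Distance, PointStruct, VectorParams
#
#   with open("AB_AI_RAG_Agent/data/processed_data/embedded_docs.pkl", "rb") as f:
#       embedded_docs = pickle.load(f)
#
#   client = QdrantClient(":memory:")
#   dim = len(embedded_docs[0]["embedding"])
#   client.create_collection(
#       "kohavi_ab_testing_pdf_collection",
#       vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
#   )
#   client.upsert(
#       "kohavi_ab_testing_pdf_collection",
#       points=[
#           PointStruct(id=d["id"], vector=d["embedding"],
#                       payload={"text": d["text"], **d["metadata"]})
#           for d in embedded_docs
#       ],
#   )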


if __name__ == "__main__":
    create_processed_data()
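
# Run this script from the project root so the relative paths above resolve
# (the filename is illustrative): python build_processed_data.py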