"""Extract questions from a PDF and index them into a Qdrant collection."""

from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv

from .questions_agent import workflow

load_dotenv()

# Chunking parameters used when splitting the raw PDF text.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)


# Step 1: Parse the PDF and Extract Questions
class PDFProcessor:
    """Load a PDF and run its text through the question-extraction workflow."""

    def __init__(self, file_path):
        """Load the PDF at ``file_path`` eagerly with ``PyMuPDFLoader``."""
        self.file_path = file_path
        self.text = ""
        self.docs = PyMuPDFLoader(self.file_path).load()

    def extract_text(self):
        """Return the concatenated ``page_content`` of every loaded document.

        The result is also stored on ``self.text``. The text is rebuilt from
        scratch on each call, so calling this method repeatedly (or calling it
        before ``extract_questions``) never duplicates the content.
        """
        self.text = "".join(doc.page_content for doc in self.docs)
        return self.text

    def extract_questions(self):
        """Split the PDF text into chunks and collect the workflow's questions.

        Each chunk is sent to ``workflow.batch`` as
        ``{"context": chunk, "previous_questions": []}``; the
        ``"previous_questions"`` entry of every result is merged into one flat
        list, in result order.

        Returns:
            A list with all questions gathered from the batch results (empty
            when the PDF yields no text chunks).
        """
        chunks = text_splitter.split_text(self.extract_text())
        if not chunks:
            return []

        # NOTE(review): every batch input shares this single thread_id. If
        # `workflow` is compiled with a checkpointer, the runs would share one
        # thread's state -- confirm this is intended.
        config = {"configurable": {"thread_id": 1}}
        question_sets = workflow.batch(
            config=config,
            inputs=[
                {"context": chunk, "previous_questions": []} for chunk in chunks
            ],
        )

        questions = []
        for item in question_sets:
            questions.extend(item.get("previous_questions", []))
        return questions


# Step 2: Split Questions and Prepare for Vector Database
class QuestionIngestor:
    """Turn a list of question strings into splitter-produced documents."""

    def __init__(self, questions):
        """Store the question texts to be split later."""
        self.questions = questions

    def split_questions(self, chunk_size=500, chunk_overlap=20):
        """Split the stored questions into documents.

        Args:
            chunk_size: Maximum chunk size passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.

        Returns:
            The documents created by
            ``RecursiveCharacterTextSplitter.create_documents``.
        """
        # A text splitter is used so that long questions are broken up.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return splitter.create_documents(self.questions)


# Step 3: Setup Qdrant Vector Store and Index Data
class QdrantSetup:
    """Create the Qdrant collection for questions and index documents into it."""

    # Shared by setup_qdrant() and index_questions() so they cannot drift apart.
    COLLECTION_NAME = "questions"
    # Vector dimension configured for the collection.
    VECTOR_SIZE = 1536

    def __init__(self, questions, host="localhost", port=6333):
        """Connect to Qdrant and prepare the embedding model.

        Args:
            questions: Items later passed to the vector store's
                ``add_documents`` (presumably the documents returned by
                ``QuestionIngestor.split_questions`` -- verify against caller).
            host: Qdrant server host.
            port: Qdrant server port.
        """
        self.questions = questions
        self.qdrant_client = QdrantClient(host, port=port)
        # NOTE(review): this uses the library-default embedding model rather
        # than the module-level `embeddings` (text-embedding-3-small); confirm
        # which model is intended.
        self.embedding = OpenAIEmbeddings()

    def setup_qdrant(self):
        """(Re)create the questions collection with cosine-distance vectors."""
        # NOTE(review): newer qdrant-client releases appear to deprecate
        # `recreate_collection` -- confirm against the installed version.
        self.qdrant_client.recreate_collection(
            collection_name=self.COLLECTION_NAME,
            vectors_config=VectorParams(
                size=self.VECTOR_SIZE, distance=Distance.COSINE
            ),
        )

    def index_questions(self):
        """Add the stored questions to the collection through the vector store."""
        qdrant_vectorstore = Qdrant(
            client=self.qdrant_client,
            collection_name=self.COLLECTION_NAME,
            embedding=self.embedding,
        )
        qdrant_vectorstore.add_documents(self.questions)


# TODO: use Parent Document Embedding for categorization