File size: 2,752 Bytes
8b1e853
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv
from .questions_agent import workflow
load_dotenv()

# Chunking parameters handed to the module-level text splitter below.
# The splitter measures length with its length function (characters by
# default in LangChain's RecursiveCharacterTextSplitter).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

# Shared OpenAI embedding client pinned to the "text-embedding-3-small" model.
# NOTE(review): nothing in this file reads `embeddings`; presumably it is
# imported by other modules -- confirm before removing.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Splitter used by PDFProcessor.extract_questions to cut the PDF text into
# overlapping chunks before they are sent to the question workflow.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Step 1: Parse the PDF and Extract Questions
class PDFProcessor:
    """Load a PDF and derive questions from its text.

    Attributes:
        file_path: Path passed to ``PyMuPDFLoader``.
        text: Concatenated page text; empty until ``extract_text`` is called.
        docs: Documents returned by ``PyMuPDFLoader(file_path).load()``.
    """

    def __init__(self, file_path):
        """Store *file_path* and eagerly load the PDF's documents."""
        self.file_path = file_path
        self.text = ""
        self.docs = PyMuPDFLoader(self.file_path).load()

    def extract_text(self):
        """Return the ``page_content`` of every loaded document, concatenated.

        The text is rebuilt from ``self.docs`` on each call and stored in
        ``self.text``, so calling this method repeatedly (directly or via
        ``extract_questions``) never duplicates content.
        """
        self.text = "".join(doc.page_content for doc in self.docs)
        return self.text

    def extract_questions(self):
        """Split the PDF text into chunks and collect generated questions.

        Each chunk is sent to ``workflow.batch`` as
        ``{"context": chunk, "previous_questions": []}``; the
        ``"previous_questions"`` entry of every result is gathered into one
        flat list.

        Returns:
            A list with the questions from all chunks, in result order.
            Empty when the PDF yields no text chunks.
        """
        chunks = text_splitter.split_text(self.extract_text())
        if not chunks:
            # Nothing to ask the workflow about; skip the batch call.
            return []

        # NOTE(review): every batch item shares thread_id 1. If the workflow
        # is compiled with a checkpointer, confirm that sharing one thread
        # across chunks is intended.
        config = {"configurable": {"thread_id": 1}}
        question_sets = workflow.batch(
            config=config,
            inputs=[
                {"context": chunk, "previous_questions": []}
                for chunk in chunks
            ],
        )

        questions = []
        for item in question_sets:
            questions.extend(item.get("previous_questions", []))
        return questions


# Step 2: Split Questions and Prepare for Vector Database
class QuestionIngestor:
    """Turn a list of question strings into splitter-produced documents."""

    def __init__(self, questions):
        """Keep a reference to the *questions* to be ingested."""
        self.questions = questions

    def split_questions(self):
        """Return documents created from the stored questions.

        A dedicated splitter (chunk size 500, overlap 20) is used so that
        overly long questions are broken into several documents.
        """
        return RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=20,
        ).create_documents(self.questions)

# Step 3: Setup Qdrant Vector Store and Index Data
class QdrantSetup:
    """Create the Qdrant ``questions`` collection and index documents into it.

    Attributes:
        questions: Documents handed to ``add_documents`` by
            ``index_questions``.
        qdrant_client: Client connected to ``localhost:6333``.
        embedding: ``OpenAIEmbeddings`` instance used to embed the documents.
    """

    # Collection shared by setup_qdrant and index_questions.
    COLLECTION_NAME = "questions"
    # Vector dimensionality configured for the collection.
    VECTOR_SIZE = 1536

    def __init__(self, questions):
        """Store *questions* and create the Qdrant client and embedder."""
        self.questions = questions
        self.qdrant_client = QdrantClient("localhost", port=6333)
        # NOTE(review): this uses the library's default embedding model,
        # whereas the module-level `embeddings` object is pinned to
        # "text-embedding-3-small" -- confirm which one queries will use.
        self.embedding = OpenAIEmbeddings()

    def setup_qdrant(self):
        """(Re)create the questions collection with cosine distance.

        Uses ``recreate_collection``, so a previously existing collection of
        the same name is replaced.
        """
        self.qdrant_client.recreate_collection(
            collection_name=self.COLLECTION_NAME,
            vectors_config=VectorParams(
                size=self.VECTOR_SIZE,
                distance=Distance.COSINE,
            ),
        )

    def index_questions(self):
        """Embed ``self.questions`` and add them to the questions collection."""
        # The langchain_community Qdrant wrapper names this parameter
        # `embeddings`; passing `embedding=` raises TypeError.
        qdrant_vectorstore = Qdrant(
            client=self.qdrant_client,
            collection_name=self.COLLECTION_NAME,
            embeddings=self.embedding,
        )
        qdrant_vectorstore.add_documents(self.questions)
# TODO: Use parent-document embeddings for categorization.