Spaces:
Sleeping
Sleeping
from langchain_openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import PyMuPDFLoader | |
from langchain_community.vectorstores import Qdrant | |
from qdrant_client import QdrantClient | |
from qdrant_client.http.models import Distance, VectorParams | |
from dotenv import load_dotenv | |
from .questions_agent import workflow | |
# Read variables from a .env file into the environment first, so the
# clients constructed below can pick up their credentials from it.
load_dotenv()

# Splitter settings used for the raw PDF text.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

# Module-level embedding model and text splitter shared by this module.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# Step 1: Parse the PDF and extract questions
class PDFProcessor:
    """Load a PDF and derive questions from its text.

    The file is loaded with ``PyMuPDFLoader`` when the instance is created and
    the resulting documents are kept in ``self.docs``.
    """

    def __init__(self, file_path):
        """Load the PDF at *file_path*.

        Args:
            file_path: Path handed to ``PyMuPDFLoader``.
        """
        self.file_path = file_path
        self.text = ""
        self.docs = PyMuPDFLoader(self.file_path).load()

    def extract_text(self):
        """Return the concatenated ``page_content`` of all loaded documents.

        The result is also stored in ``self.text``. The text is rebuilt from
        ``self.docs`` on every call, so repeated calls (including the one made
        by ``extract_questions``) return the same string instead of appending
        another copy of the document to the previous result.
        """
        self.text = "".join(doc.page_content for doc in self.docs)
        return self.text

    def extract_questions(self):
        """Run the questions workflow over every text chunk.

        The extracted text is split with the module-level ``text_splitter``
        and each chunk is sent to ``workflow.batch`` as one input.

        Returns:
            A flat list made of the ``"previous_questions"`` entries of every
            workflow result, in the order the results are returned.
        """
        chunks = text_splitter.split_text(self.extract_text())
        # NOTE(review): every batch input shares thread_id 1 -- confirm the
        # workflow tolerates a single thread id across all chunks.
        config = {"configurable": {"thread_id": 1}}
        inputs = [{"context": chunk, "previous_questions": []} for chunk in chunks]
        question_sets = workflow.batch(config=config, inputs=inputs)

        questions = []
        for item in question_sets:
            # Results without the key contribute nothing.
            questions.extend(item.get("previous_questions", []))
        return questions
# Step 2: Split questions and prepare them for the vector database
class QuestionIngestor:
    """Convert extracted questions into documents ready for indexing."""

    def __init__(self, questions):
        """Keep *questions* for later splitting.

        Args:
            questions: Texts later handed to ``create_documents``
                (presumably a list of question strings -- confirm with caller).
        """
        self.questions = questions

    def split_questions(self):
        """Return documents created from ``self.questions``.

        A ``RecursiveCharacterTextSplitter`` with chunk size 500 and overlap
        20 is used so that long questions are broken into smaller pieces.
        """
        return RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=20,
        ).create_documents(self.questions)
# Step 3: Set up the Qdrant vector store and index the data
class QdrantSetup:
    """Create a Qdrant collection and index question documents into it."""

    def __init__(
        self,
        questions,
        *,
        host="localhost",
        port=6333,
        collection_name="questions",
        vector_size=1536,
    ):
        """Connect to Qdrant and remember what to index.

        Args:
            questions: Documents passed to ``add_documents`` by
                ``index_questions``.
            host: Host of the Qdrant server.
            port: Port of the Qdrant server.
            collection_name: Collection used by both ``setup_qdrant`` and
                ``index_questions``.
            vector_size: Vector size configured for the collection; it has to
                match the dimensionality of the vectors ``self.embedding``
                produces.
        """
        self.questions = questions
        self.collection_name = collection_name
        self.vector_size = vector_size
        self.qdrant_client = QdrantClient(host, port=port)
        # NOTE(review): no model argument is passed here, so the library's
        # default embedding model is used, unlike the module-level
        # ``embeddings`` object -- confirm this difference is intended.
        self.embedding = OpenAIEmbeddings()

    def setup_qdrant(self):
        """(Re)create the collection with cosine distance.

        NOTE: ``recreate_collection`` replaces an existing collection of the
        same name, so previously indexed points are dropped (see the
        qdrant-client documentation).
        """
        self.qdrant_client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(
                size=self.vector_size,
                distance=Distance.COSINE,
            ),
        )

    def index_questions(self):
        """Embed ``self.questions`` and add them to the collection.

        Does nothing when there are no questions to index.
        """
        if not self.questions:
            return

        qdrant_vectorstore = Qdrant(
            client=self.qdrant_client,
            collection_name=self.collection_name,
            embedding=self.embedding,
        )
        qdrant_vectorstore.add_documents(self.questions)
# TODO: use parent-document embeddings for categorization