# Source: aie4-final/backend/app/upload_pdf/ingest_documents.py
# Repository page residue: uploaded by richlai, commit "add files" (8b1e853).
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv
from .questions_agent import workflow
# Load variables from a .env file before the clients below are constructed
# (presumably this is where the OpenAI API key comes from -- verify).
load_dotenv()

# Chunking parameters used by the module-level splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

# Splitter applied to the raw PDF text before question generation.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Module-level embedding client.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Step 1: Parse the PDF and Extract Questions
class PDFProcessor:
    """Load a PDF and derive questions from its text.

    The file is read with ``PyMuPDFLoader`` at construction time; questions
    are produced by running the ``workflow`` imported from
    ``.questions_agent`` over chunks of the extracted text.
    """

    def __init__(self, file_path):
        """Load ``file_path`` immediately and keep the resulting documents.

        Args:
            file_path: Path of the PDF handed to ``PyMuPDFLoader``.
        """
        self.file_path = file_path
        self.text = ""
        self.docs = PyMuPDFLoader(self.file_path).load()

    def extract_text(self):
        """Return the concatenated ``page_content`` of all loaded documents.

        The text is rebuilt from scratch on every call, so calling this
        method repeatedly (``extract_questions`` calls it too) never
        duplicates content in ``self.text``.
        """
        self.text = "".join(doc.page_content for doc in self.docs)
        return self.text

    def extract_questions(self):
        """Run the question workflow over each text chunk.

        Returns:
            A flat list built from the ``"previous_questions"`` entry of
            every workflow result; empty when the document has no text.
        """
        chunks = text_splitter.split_text(self.extract_text())
        if not chunks:
            # Nothing to ask about; skip invoking the workflow entirely.
            return []

        # NOTE(review): every batch input shares thread_id 1 -- if the
        # workflow is compiled with a checkpointer, confirm that sharing one
        # thread across chunks is intended.
        config = {"configurable": {"thread_id": 1}}
        question_sets = workflow.batch(
            config=config,
            inputs=[{"context": chunk, "previous_questions": []} for chunk in chunks],
        )

        questions = []
        for item in question_sets:
            questions.extend(item.get("previous_questions", []))
        return questions
# Step 2: Split Questions and Prepare for Vector Database
class QuestionIngestor:
    """Convert a list of question strings into documents for indexing."""

    def __init__(self, questions):
        """Store the questions to be split.

        Args:
            questions: Iterable of question texts passed to
                ``create_documents`` (presumably plain strings -- verify
                against the caller).
        """
        self.questions = questions

    def split_questions(self, *, chunk_size=500, chunk_overlap=20):
        """Split the stored questions into documents.

        Long questions are broken up by a ``RecursiveCharacterTextSplitter``.

        Args:
            chunk_size: Maximum chunk size given to the splitter.
            chunk_overlap: Overlap between consecutive chunks.

        Returns:
            The documents produced by ``splitter.create_documents``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return splitter.create_documents(self.questions)
# Step 3: Setup Qdrant Vector Store and Index Data
class QdrantSetup:
    """Create the Qdrant collection for questions and index documents into it."""

    # Shared by setup_qdrant() and index_questions() so both always target
    # the same collection.
    COLLECTION_NAME = "questions"
    # Must equal the output dimensionality of the embedding model in use
    # (assumed to be 1536 for the default OpenAIEmbeddings model -- confirm
    # if the model is ever changed).
    VECTOR_SIZE = 1536

    def __init__(self, questions):
        """Connect to Qdrant on localhost:6333 and create the embedding client.

        Args:
            questions: Documents later passed to ``add_documents``
                (presumably the output of a text splitter -- verify against
                the caller).
        """
        self.questions = questions
        self.qdrant_client = QdrantClient("localhost", port=6333)
        self.embedding = OpenAIEmbeddings()

    def setup_qdrant(self):
        """(Re)create the questions collection with cosine distance."""
        # NOTE(review): recreate_collection is expected to replace any
        # existing collection of the same name -- confirm that discarding
        # earlier data on every run is intended.
        self.qdrant_client.recreate_collection(
            collection_name=self.COLLECTION_NAME,
            vectors_config=VectorParams(size=self.VECTOR_SIZE, distance=Distance.COSINE),
        )

    def index_questions(self):
        """Embed the stored documents and add them to the collection."""
        # The langchain_community Qdrant constructor names this parameter
        # `embeddings` (plural); `embedding=` is rejected as an unexpected
        # keyword argument.
        qdrant_vectorstore = Qdrant(
            client=self.qdrant_client,
            collection_name=self.COLLECTION_NAME,
            embeddings=self.embedding,
        )
        qdrant_vectorstore.add_documents(self.questions)
# USE Parent Document Embedding for categorization