# AidMateLLM/RAG/RAG_Retrival.py
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdmk
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import os


class RAG_Retrival:
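    """Retrieval side of the AidMate RAG pipeline: reads .txt/.pdf sources,
    chunks and embeds them, stores the chunks through an injected database
    wrapper, and fetches relevant chunks for a user query.

    Interfaces assumed from usage in this file (defined elsewhere in the
    project): `embedder.embed(text)` returns an embedding vector, and the
    `db` wrapper exposes `make_data_frame(df, name)`,
    `get_relevant_docs(vector)`, and `close()`.
    """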

    def __init__(self, db, model, embedder):
        self.conn = db
        self.model = model
        self.embedder = embedder

    def read_data(self, path_data):
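        """Walk `path_data` recursively and return the concatenated text of
        every .txt and .pdf file found; other file types are skipped."""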
        # Count total files first for tqdm's total
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        all_text = ""
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        all_text += self.load_text_file(full_path)
                    elif full_path.endswith(".pdf"):
                        all_text += self.load_pdf(full_path)
                    # Advance the bar for every file, including skipped types,
                    # so it lines up with total_files.
                    pbar.update(1)
        return all_text

    def load_text_file(self, path):
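        """Return the full contents of a plain-text file as a string."""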
text = ""
with open(path, 'r') as file:
for line in file:
text += line
return text

    def load_pdf(self, pdf_path):
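        """Load a single PDF with PyPDFLoader and return its pages joined
        into one string."""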
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        text = "\n".join(doc.page_content for doc in pages)
        return text

    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
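        """Split `text` into overlapping chunks and tag each resulting
        document with a sequential `doc_id`."""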
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs

    def prepare_text_df(self, docs):
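        """Embed each chunk and return a dataframe with `page_content` and
        `embeddings` columns, ready to ingest into the database."""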
        # Get the page_content from the documents and create a new list
        content_list = [doc.page_content for doc in docs]
        # Embed one page_content at a time
        print("Making embeddings...")
        embeddings = [self.embedder.embed(content) for content in tqdmk(content_list)]
        print("Finished embeddings.")
        # Create a dataframe to ingest into the database
        dataframe = pd.DataFrame({
            'page_content': content_list,
            'embeddings': embeddings,
        })
        return dataframe

    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
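        """End-to-end ingestion: read the corpus, split it into chunks,
        embed them, and upload the result to the database."""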
        all_texts = self.read_data(path)
        docs = self.text_splitter(all_texts, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe

    def upload_file(self, embedding_df, name='first_aid'):
        """Upload the embedding dataframe and close the database connection
        so the write is committed."""
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()

    def query_relevant(self, user_query):
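        """Embed `user_query` and return the most relevant stored documents."""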
        embedded_query = self.embedder.embed(user_query)
        result = self.conn.get_relevant_docs(embedded_query)
        return result
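

if __name__ == "__main__":
    # Minimal usage sketch. `VectorDB` and `Embedder` (and their import paths)
    # are hypothetical stand-ins for the project's database and embedding
    # wrappers, which are not defined in this file; any objects exposing the
    # methods listed in the class docstring would work. `model` is stored but
    # not used by this class, so None is passed here.
    from RAG.VectorDB import VectorDB   # hypothetical import path
    from RAG.Embedder import Embedder   # hypothetical import path

    db = VectorDB("first_aid.duckdb")
    embedder = Embedder()
    rag = RAG_Retrival(db=db, model=None, embedder=embedder)

    # Ingest a folder of .txt/.pdf documents: read -> chunk -> embed -> upload.
    rag.make_data_frame("data/first_aid_docs", chunk_size=1000, chunk_overlap=100)

    # Retrieve chunks relevant to a query. Note that upload_file() closes the
    # connection after ingestion, so querying needs a fresh connection.
    db = VectorDB("first_aid.duckdb")
    rag = RAG_Retrival(db=db, model=None, embedder=embedder)
    print(rag.query_relevant("How do I treat a minor burn?"))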