import os

import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class RAG_Retrieval:
    def __init__(self, db, model, embedder):
        self.conn = db
        self.model = model
        self.embedder = embedder

    def read_data(self, path_data):
        """Walk a directory tree and concatenate the text of all .txt and .pdf files."""
        # Count total files first so tqdm can show an accurate total.
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        all_text = ""
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        all_text += self.load_text_file(full_path)
                    elif full_path.endswith(".pdf"):
                        all_text += self.load_pdf(full_path)
                    pbar.update(1)
        return all_text

    def load_text_file(self, path):
        with open(path, "r") as file:
            return file.read()

    def load_pdf(self, pdf_path):
        # Load a single PDF and join its pages into one string.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        return "\n".join(doc.page_content for doc in pages)

    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Split raw text into overlapping chunks and tag each with a doc_id."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs

    def prepare_text_df(self, docs):
        """Embed each chunk and collect chunks plus embeddings into a DataFrame."""
        content_list = [doc.page_content for doc in docs]
        # Embed one chunk at a time.
        print("Making embeddings...")
        embeddings = [self.embedder.embed(content) for content in tqdm_notebook(content_list)]
        print("Finished embeddings.")
        return pd.DataFrame({"page_content": content_list, "embeddings": embeddings})

    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Full ingestion pipeline: read files, chunk, embed, and upload to the database."""
        all_text = self.read_data(path)
        docs = self.text_splitter(all_text, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe

    def upload_file(self, embedding_df, name="first_aid"):
        """Upload the data and close the database so it is committed.

        Note that the connection is closed afterwards, so a fresh connection
        is needed before calling query_relevant.
        """
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()

    def query_relevant(self, user_query):
        """Embed the user query and fetch the most relevant stored chunks."""
        embedded_query = self.embedder.embed(user_query)
        return self.conn.get_relevant_docs(embedded_query)
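
# RAG_Retrieval only assumes that `db` exposes make_data_frame(df, name),
# get_relevant_docs(embedded_query), and close(); those method names come from
# the calls above. The VectorDB below is a minimal sketch of such a wrapper
# backed by DuckDB, not the original project's implementation.
import duckdb
import numpy as np


class VectorDB:
    """Hypothetical DuckDB-backed store matching the interface RAG_Retrieval expects."""

    def __init__(self, path=":memory:", table="first_aid"):
        self.conn = duckdb.connect(path)
        self.table = table

    def make_data_frame(self, embedding_df, name):
        # Register the pandas DataFrame as a view and materialize it as a table.
        self.table = name
        self.conn.register("embedding_view", embedding_df)
        self.conn.execute(f'CREATE OR REPLACE TABLE "{name}" AS SELECT * FROM embedding_view')

    def get_relevant_docs(self, embedded_query, top_k=3):
        # Brute-force cosine similarity in Python; adequate for small corpora.
        rows = self.conn.execute(
            f'SELECT page_content, embeddings FROM "{self.table}"'
        ).fetchall()
        q = np.asarray(embedded_query, dtype=float)

        def cosine(v):
            v = np.asarray(v, dtype=float)
            return float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-10))

        ranked = sorted(rows, key=lambda row: cosine(row[1]), reverse=True)
        return [content for content, _ in ranked[:top_k]]

    def close(self):
        self.conn.close()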
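
# Hypothetical end-to-end usage of the two classes above. The embedder is
# assumed to expose embed(text) -> list[float]; the sentence-transformers
# wrapper below is one possible implementation, not the original project's.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    class MiniLMEmbedder:
        """Thin wrapper giving SentenceTransformer the embed() interface used above."""

        def __init__(self, name="all-MiniLM-L6-v2"):
            self.model = SentenceTransformer(name)

        def embed(self, text):
            return self.model.encode(text).tolist()

    embedder = MiniLMEmbedder()

    # Ingest: read .txt/.pdf files, chunk, embed, upload, then close the DB.
    rag = RAG_Retrieval(db=VectorDB("first_aid.duckdb"), model=None, embedder=embedder)
    rag.make_data_frame("data/first_aid/")

    # Query on a fresh connection, since upload_file closed the previous one.
    rag = RAG_Retrieval(db=VectorDB("first_aid.duckdb"), model=None, embedder=embedder)
    print(rag.query_relevant("How should a minor burn be treated?"))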