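"""Retrieval layer for a simple RAG pipeline.

Walks a folder of .txt and .pdf files, splits the combined text into
overlapping chunks, embeds each chunk, and stores the result in a vector
table that can be queried for the passages most relevant to a user query.
"""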
import os

import duckdb
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdmk

from langchain import PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
class RAG_Retrival:
    def __init__(self, db, model, embedder):
        self.conn = db
        self.model = model
        self.embedder = embedder
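    # Collaborators are duck-typed. The interfaces below are inferred from
    # the calls made in this class (an assumption, not a contract defined
    # in this file):
    #   db       -> make_data_frame(df, name), get_relevant_docs(vector), close()
    #   embedder -> embed(text) returning an embedding vector
    #   model    -> stored for downstream use; not called by this class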
    def read_data(self, path_data):
        # Count total files first so tqdm can show an accurate total
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        all_text = ""
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        all_text += self.load_text_file(full_path)
                    elif full_path.endswith(".pdf"):
                        all_text += self.load_pdf(full_path)
                    pbar.update(1)
        return all_text
    def load_text_file(self, path):
        # Read the whole file at once; equivalent to the line-by-line loop
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    def load_pdf(self, pdf_path):
        # PyPDFLoader yields one Document per page; join them into one string
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        return "\n".join(doc.page_content for doc in pages)
    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        # Split the corpus into overlapping chunks and tag each with a doc_id
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs
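    # With the defaults (chunk_size=1000, chunk_overlap=100) the effective
    # stride is 900 characters, so a 10,000-character corpus yields about
    # 1 + (10000 - 1000) / 900 = 11 chunks. This is approximate: the
    # recursive splitter prefers to break on separators, so real chunk
    # lengths vary below the limit.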
    def prepare_text_df(self, docs):
        # Get the page_content from the documents and create a new list
        content_list = [doc.page_content for doc in docs]
        # Embed one chunk at a time so tqdm can track progress
        print("Making embeddings...")
        embeddings = [self.embedder.embed(content) for content in tqdmk(content_list)]
        print("Finished embeddings...")
        # Create a dataframe pairing each chunk with its embedding, ready to
        # ingest into the database
        dataframe = pd.DataFrame({
            'page_content': content_list,
            'embeddings': embeddings,
        })
        return dataframe
    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        # End-to-end ingestion: read -> chunk -> embed -> upload
        all_texts = self.read_data(path)
        docs = self.text_splitter(all_texts, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe
    def upload_file(self, embedding_df, name='first_aid'):
        '''
        Upload the embedding table and close the database so the write is
        committed. Note: the connection is closed afterwards, so it must be
        reopened before querying.
        '''
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()
    def query_relevant(self, user_query):
        # Embed the query with the same embedder used at ingestion time,
        # then ask the database for the most similar stored chunks
        embedded_query = self.embedder.embed(user_query)
        result = self.conn.get_relevant_docs(embedded_query)
        return result
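
# ---------------------------------------------------------------------------
# Minimal sketch of the database wrapper this class expects. The real `db`
# object is defined elsewhere; everything below is inferred from the three
# calls made above (make_data_frame, get_relevant_docs, close). The class
# name DuckDBStore and the top_k parameter are hypothetical, not part of
# the original project.
# ---------------------------------------------------------------------------
class DuckDBStore:
    def __init__(self, path=':memory:', table='first_aid'):
        self.conn = duckdb.connect(path)
        self.table = table

    def make_data_frame(self, embedding_df, name):
        # Register the pandas DataFrame with DuckDB and materialise it
        self.conn.register('embedding_df', embedding_df)
        self.conn.execute(
            f"CREATE OR REPLACE TABLE {name} AS SELECT * FROM embedding_df"
        )
        self.table = name

    def get_relevant_docs(self, query_vector, top_k=3):
        # Brute-force cosine similarity in numpy; fine for small corpora
        df = self.conn.execute(f"SELECT * FROM {self.table}").df()
        matrix = np.array(df['embeddings'].tolist(), dtype=np.float32)
        query = np.asarray(query_vector, dtype=np.float32)
        scores = matrix @ query / (
            np.linalg.norm(matrix, axis=1) * np.linalg.norm(query) + 1e-10
        )
        order = np.argsort(scores)[::-1][:top_k]
        return df.iloc[order]['page_content'].tolist()

    def close(self):
        self.conn.close()


# Example wiring (my_embedder is hypothetical; any object exposing an
# embed(text) method works). upload_file() closes the connection, so a
# file-backed store must be reopened before querying:
#
#     store = DuckDBStore('vectors.duckdb')
#     rag = RAG_Retrival(db=store, model=None, embedder=my_embedder)
#     rag.make_data_frame('data/')
#     rag.conn = DuckDBStore('vectors.duckdb')   # reopen after close
#     print(rag.query_relevant('How do I treat a minor burn?'))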