"""Retrieval pipeline for a simple RAG system: reads .txt and .pdf files,
splits them into chunks, embeds each chunk, stores the embeddings in a
database, and fetches the documents most relevant to a query."""

import os

import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdmk
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class RAG_Retrival:
    """Builds and queries an embedded document store for retrieval-augmented generation.

    `db` is a database wrapper exposing make_data_frame, get_relevant_docs,
    and close; `embedder` must expose an embed(text) method.
    """

    def __init__(self, db, model, embedder):
        self.conn = db
        self.model = model
        self.embedder = embedder

    def read_data(self, path_data):
        """Walk `path_data` and concatenate the text of every .txt and .pdf file."""
        # Count the files up front so tqdm can display a total
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        all_text = ""
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        all_text += self.load_text_file(full_path)
                    elif full_path.endswith(".pdf"):
                        all_text += self.load_pdf(full_path)
                    pbar.update(1)
        return all_text

    def load_text_file(self, path):
        """Read a plain-text file and return its contents as a single string."""
        with open(path, 'r', encoding='utf-8') as file:
            return file.read()

    def load_pdf(self, pdf_path):
        """Extract the text of a PDF file, joining its pages with newlines."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        return "\n".join(doc.page_content for doc in pages)

    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Split raw text into overlapping chunks, tagging each chunk with a doc_id."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs

    def prepare_text_df(self, docs):
        """Embed each chunk and return a dataframe of page contents and embeddings."""
        # Pull the page_content out of the documents into a plain list
        content_list = [doc.page_content for doc in docs]
        # Embed one chunk at a time so the notebook progress bar stays accurate
        print("Making embeddings...")
        embeddings = [self.embedder.embed(content) for content in tqdmk(content_list)]
        print("Finished embedding.")

        # Collect everything into a dataframe ready to ingest into the database
        dataframe = pd.DataFrame({
            'page_content': content_list,
            'embeddings': embeddings,
        })
        return dataframe

    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Read, chunk, and embed every document under `path`, then upload the result."""
        all_text = self.read_data(path)
        docs = self.text_splitter(all_text, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe

    def upload_file(self, embedding_df, name='first_aid'):
        """Upload the embedding dataframe and close the database so the data is committed.

        Note: this closes the connection, so it must be reopened before
        query_relevant is called afterwards.
        """
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()

    def query_relevant(self, user_query):
        """Embed the user query and return the most relevant stored documents."""
        embedded_query = self.embedder.embed(user_query)
        return self.conn.get_relevant_docs(embedded_query)
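

# --- Usage sketch (illustrative only) ---
# A minimal example of wiring the class together, assuming a database wrapper
# and an embedder with the interfaces the methods above rely on. `DuckDBStore`
# and `SentenceEmbedder` are hypothetical stand-ins, not classes defined in
# this project: the db object must expose make_data_frame, get_relevant_docs,
# and close, and the embedder an embed(text) method.
#
# if __name__ == "__main__":
#     db = DuckDBStore("embeddings.duckdb")            # hypothetical wrapper
#     embedder = SentenceEmbedder("all-MiniLM-L6-v2")  # hypothetical embedder
#     rag = RAG_Retrival(db=db, model=None, embedder=embedder)
#
#     # Build the store from a folder of .txt and .pdf files
#     rag.make_data_frame("data/first_aid/", chunk_size=1000, chunk_overlap=100)
#
#     # upload_file() closed the connection, so reopen it before querying
#     rag.conn = DuckDBStore("embeddings.duckdb")
#     print(rag.query_relevant("How do I treat a minor burn?"))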