# GRADIO INTERFACE TO CONVERT A PDF TO TEXT AND READ IT WITH LANGCHAIN AND OPENAI ###################################
import os
import shutil  # only needed if the optional cleanup at the end of extract_info is enabled

import gradio as gr
import openai
from pypdf import PdfReader
from langchain.chat_models import ChatOpenAI
from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
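
# Flow: upload a PDF -> extract its text with pypdf -> write the text to a local folder ->
# build a llama_index vector index over it -> answer questions about the document through
# a Gradio chatbot. Requires the OPENAI_API_KEY environment variable to be set.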

# Folder where the extracted page text is written before indexing
directory_name = 'converted_pdf_to_text'


def extract_info(pdf_file):

    # BEGINS PDF TO TEXT SECTION ###################
    if pdf_file.name.lower().endswith('.pdf'):
        # Extract the text of every page in the uploaded PDF
        reader = PdfReader(pdf_file.name)
        extracted_text = [page.extract_text() for page in reader.pages]

        # WRITING TEXT FILE TO FOLDER ##############
        if not os.path.exists(directory_name):
            os.mkdir(directory_name)
        file_name = 'document_in_txt_format.txt'
        file_path = os.path.join(directory_name, file_name)
        # Join the pages with newlines rather than writing the Python list repr
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(extracted_text))
        if os.path.isfile(file_path):
            print(f'{file_name} created successfully in {directory_name}.')
        else:
            print(f'{file_name} creation in {directory_name} failed.')

        # BEGINS LLM SECTION ##########
        max_input_size = 4096      # context window of the model
        num_outputs = 500          # max tokens generated per answer
        max_chunk_overlap = 200    # token overlap between consecutive chunks
        chunk_size_limit = 4000    # max tokens per indexed chunk

        llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', max_tokens=num_outputs))
        prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

        # Build the vector index over the text file that was just written
        documents = SimpleDirectoryReader(directory_name).load_data()
        global index
        index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper)

        # Remove any stale index so the bot only uses the document that was just uploaded
        if os.path.exists('index.json'):
            os.remove('index.json')
            print("The file 'index.json' has been deleted.")
        else:
            print("The file 'index.json' does not exist.")

        # Save the freshly built index to disk
        index.save_to_disk('index.json')

        # Optionally remove the folder holding the intermediate text file
        #shutil.rmtree(directory_name)
        return ("Success! You can now click on the 'Knowledge bot' tab to interact with your document", extracted_text)

    # Non-PDF upload: still return two values so Gradio can fill both output components
    return ('Please upload a .pdf file.', '')
      

def chat(chat_history, user_input):

    # Query the vector index built in extract_info
    bot_response = index.query(user_input)
    response = ''
    # Yield the answer one character at a time so Gradio streams it into the chatbot
    for letter in bot_response.response:
        response += letter
        yield chat_history + [(user_input, response)]
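

# A minimal sketch (not called by the app): reload the index saved by extract_info in a
# later session instead of rebuilding it. It assumes the same legacy llama_index API used
# above, where GPTSimpleVectorIndex.load_from_disk pairs with index.save_to_disk.
def query_saved_index(question, index_path='index.json'):
    saved_index = GPTSimpleVectorIndex.load_from_disk(index_path)
    return saved_index.query(question).response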

        
        
messages = [{"role": "system", "content": """You are a helpful assistant. You help the reader understand documents by paraphrasing, quoting and summarizing information. You follow the instructions of the user at all times."""}]

openai.api_key = os.getenv("OPENAI_API_KEY")


with gr.Blocks() as demo:

    gr.Markdown('<h1 style="color:blue; font-size:20px; text-align: center; justify-content: center; font-weight:bold;">Q&A bot for PDF docs. Upload your document, press the button, and wait for confirmation of success</h1>')

    # Tab 1: upload a PDF and build the index
    with gr.Tab('Input PDF document here'):
        text_input = gr.File()
        text_output = gr.Textbox(label='Extracted Text')
        success_output = gr.Textbox(label='Status')
        text_button = gr.Button('Build the bot!')
        text_button.click(extract_info, text_input, [success_output, text_output])
    # Tab 2: chat with the indexed document
    with gr.Tab('Knowledge bot'):
        chatbot = gr.Chatbot()
        message = gr.Textbox(label='Ask your question about the document here, then press "Enter" and scroll up for the response')
        message.submit(chat, [chatbot, message], chatbot)
    
    
if __name__ == "__main__":
    # queue() enables the generator-based streaming used by chat(); launch once, not twice
    demo.queue().launch(debug=True)
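
# To run: set OPENAI_API_KEY in your environment and execute this script with Python
# (e.g. `python app.py`; the file name is an assumption). Gradio prints a local URL
# once the interface is up.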