ArturG9 committed on
Commit
255d5ac
·
verified ·
1 Parent(s): 80ac152

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -0
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from PyPDF2 import PdfReader
5
+ from langchain_community.llms import llamacpp
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
8
+ from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
9
+ from langchain.vectorstores import Chroma
10
+ from langchain.chat_models import ChatOpenAI
11
+ from langchain_community.chat_message_histories.streamlit import StreamlitChatMessageHistory
12
+ from langchain.prompts import PromptTemplate,SystemMessagePromptTemplate,ChatPromptTemplate
13
+ from langchain.chains.combine_documents import create_stuff_documents_chain
14
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain, ConversationalRetrievalChain
15
+ from langchain.text_splitter import TokenTextSplitter,RecursiveCharacterTextSplitter
16
+ from langchain_core.runnables.history import RunnableWithMessageHistory
17
+ from langchain_community.document_loaders.directory import DirectoryLoader
18
+ from langchain.document_loaders import PyPDFLoader
19
+ from htmlTemplates import css, bot_template, user_template
20
+ from langchain.memory import ConversationBufferMemory
21
+ from langchain.chains import ConversationalRetrievalChain
22
+ from langchain_core.output_parsers import StrOutputParser
23
+ from langchain_core.runnables import RunnablePassthrough
24
+ from langchain import hub
25
+
26
+
27
+
28
+
29
+
30
+
31
# Load variables from a local .env file, if one exists, so that
# "lang_api_key" can be supplied there as well as through the real environment.
load_dotenv()

lang_api_key = os.getenv("lang_api_key")

# Only switch tracing on when a key is actually configured: assigning None to
# os.environ raises TypeError, which previously crashed the app at import time
# whenever the variable was missing.
if lang_api_key:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
    os.environ["LANGCHAIN_API_KEY"] = lang_api_key
    os.environ["LANGCHAIN_PROJECT"] = "Chat with multiple PDFs"
37
+
38
+
39
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every PDF.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (the
            caller passes the files returned by ``st.file_uploader``).

    Returns:
        A single string with the page texts joined in document/page order,
        with no separator between pages. Returns ``""`` for an empty input.
    """
    # ``or ""`` keeps a page with no extractable text from breaking the join,
    # and a single join avoids repeated string re-allocation on big documents.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
46
+
47
def get_text_chunks(text):
    """Split raw text into overlapping chunks for embedding.

    The splitter is built with ``from_tiktoken_encoder``, so ``chunk_size``
    and ``chunk_overlap`` are measured with the tiktoken length function
    rather than in characters.

    Args:
        text: The full document text to split.

    Returns:
        A list of chunk strings.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250,
        chunk_overlap=50,
        # The sentence separator is a regex look-behind. It is written as a raw
        # string (``\.`` is an invalid escape otherwise) and only takes effect
        # because ``is_separator_regex=True``; without that flag the splitter
        # escapes it and searches for the literal text, which never matches.
        separators=["\n \n \n", "\n \n", "\n1", r"(?<=\. )", " ", ""],
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)
54
+
55
def get_vectorstore(text_chunks):
    """Embed the text chunks and store them in a Chroma vector store.

    Embeddings come from the ``Alibaba-NLP/gte-base-en-v1.5`` model, run on
    CPU with normalised output vectors. The store is created with
    ``persist_directory="docs/chroma/"``.

    Args:
        text_chunks: List of strings to embed and index.

    Returns:
        The ``Chroma`` vector store built from ``text_chunks``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="Alibaba-NLP/gte-base-en-v1.5",
        # trust_remote_code must be a real boolean; the previous string 'True'
        # only worked because any non-empty string is truthy.
        model_kwargs={"device": "cpu", "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )
    return Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings,
        persist_directory="docs/chroma/",
    )
68
+
69
+
70
+
71
def get_conversation_chain():
    """Build the RAG answer chain: prompt -> local LlamaCpp model -> string.

    The prompt is pulled from the LangChain hub (``rlm/rag-prompt``) and the
    model is loaded from the local GGUF file ``qwen2-0_5b-instruct-q8_0.gguf``.
    The caller invokes the chain with a dict holding ``context`` and
    ``question``.

    Returns:
        A runnable chain whose output is parsed by ``StrOutputParser``.
    """
    # The unused CallbackManager/StreamingStdOutCallbackHandler local was
    # dropped: it was only referenced by a commented-out argument.
    llm = llamacpp.LlamaCpp(
        model_path="qwen2-0_5b-instruct-q8_0.gguf",
        n_gpu_layers=0,  # zero layers offloaded to GPU
        temperature=0.1,
        top_p=0.9,
        n_ctx=20000,
        n_batch=2000,
        max_tokens=300,
        repeat_penalty=1.9,
        last_n_tokens_size=300,
        verbose=False,
    )

    prompt = hub.pull("rlm/rag-prompt")
    return prompt | llm | StrOutputParser()
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
def main():
    """Render the Streamlit page: PDF upload and processing, then the chat box.

    Streamlit re-runs this function on every interaction, so the vector store
    and the conversation chain built by the "Process" button are kept in
    ``st.session_state`` rather than in local variables. The earlier version
    read those locals before they were assigned and returned them from
    ``main`` (where the value was discarded), so no question could ever be
    answered. It also created two text inputs with the same label and no key.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    st.header("Chat with multiple PDFs :books:")

    st.subheader("Your documents")
    pdf_docs = st.file_uploader(
        "For Chatbot to get alive, upload your PDFs here and click on 'Process'",
        accept_multiple_files=True,
    )

    if st.button("Process"):
        if not pdf_docs:
            st.warning("Please upload at least one PDF before clicking 'Process'.")
        else:
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)

                if text_chunks:
                    # create vector store and conversation chain, and keep
                    # them across reruns
                    st.session_state["vectorstore"] = get_vectorstore(text_chunks)
                    st.session_state["conversation"] = get_conversation_chain()

            if text_chunks:
                st.success("Files have been processed into a vector store.")
            else:
                # Nothing to index, e.g. PDFs with no extractable text.
                st.error("No text could be extracted from the uploaded PDFs.")

    st.subheader("Chat Bot")
    # A single question box: a second text_input with the same label and no
    # key would be a duplicate widget.
    if user_question := st.text_input("Ask a question about your documents:"):
        vectorstore = st.session_state.get("vectorstore")
        conversation = st.session_state.get("conversation")
        if vectorstore is None or conversation is None:
            st.info("Upload your PDFs and click on 'Process' before asking a question.")
        else:
            handle_userinput(user_question, vectorstore, conversation)
161
+
162
+
163
+
164
def handle_userinput(user_question, vectorstore, conversation):
    """Answer one user question and render the whole chat history.

    Args:
        user_question: The question typed by the user.
        vectorstore: Vector store used to retrieve relevant chunks; must
            provide ``as_retriever``.
        conversation: Chain invoked with a dict holding ``context`` and
            ``question``; its result is stored as the assistant message.

    Side effects:
        Appends the question and the answer to
        ``st.session_state.chat_history``, lists the retrieved documents in
        the sidebar and writes every chat message to the page.
    """
    import html  # stdlib; local import keeps this block self-contained

    if "chat_history" not in st.session_state:
        st.session_state["chat_history"] = [
            {"role": "assistant", "content": "Hi, I'm a Q&A chatbot who is based on your imported pdf documents . How can I help you?"}
        ]

    st.session_state.chat_history.append({"role": "user", "content": user_question})

    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 7})
    docs = retriever.invoke(user_question)
    with st.sidebar:
        st.subheader("Your documents")
        with st.spinner("Processing"):
            for doc in docs:
                st.write(f"Document: {doc}")

    # Give the prompt the plain chunk text; passing the Document objects
    # themselves would put their repr (metadata included) into the prompt.
    context = "\n\n".join(doc.page_content for doc in docs)

    # Invoke conversation chain
    response = conversation.invoke({"context": context, "question": user_question})
    st.session_state.chat_history.append({"role": "assistant", "content": response})

    for message in st.session_state.chat_history:
        # Choose the template by role. The history starts with an assistant
        # greeting, so the former index-parity test swapped the two templates.
        template = user_template if message["role"] == "user" else bot_template
        # Escape the text: it is typed by the user or generated from PDF
        # content, and is rendered with unsafe_allow_html=True.
        st.write(
            template.replace("{{MSG}}", html.escape(str(message["content"]))),
            unsafe_allow_html=True,
        )
196
+
197
+ # if 'source_documents' in response:
198
+ # st.subheader("Retrieved Documents")
199
+ # for doc in response['source_documents']:
200
+ # st.write(f"Document: {doc.metadata['source']}")
201
+ # st.write(doc.page_content)
202
+
203
+
204
+
205
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()