ak0601 committed on
Commit
6a8bd7b
·
verified ·
1 Parent(s): c4897d1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +316 -243
app.py CHANGED
@@ -1,243 +1,316 @@
1
- import streamlit as st
2
- from langchain.memory import ConversationBufferMemory
3
- from llama_index.core.indices.query.schema import QueryBundle
4
- from llama_index.core import Document, VectorStoreIndex
5
- from llama_index.core.text_splitter import SentenceSplitter
6
- from llama_index.core.retrievers import QueryFusionRetriever
7
- from llama_index.retrievers.bm25 import BM25Retriever
8
- from llama_index.core.postprocessor import SentenceTransformerRerank
9
- from llama_index.core.prompts import PromptTemplate
10
- from llama_index.core.query_engine import RetrieverQueryEngine
11
- from llama_index.embeddings.gemini import GeminiEmbedding
12
- from llama_index.llms.gemini import Gemini
13
- from llama_index.core import Settings
14
- from llama_index.vector_stores.faiss import FaissVectorStore
15
- from llama_index.core import (
16
- SimpleDirectoryReader,
17
- load_index_from_storage,
18
- VectorStoreIndex,
19
- StorageContext,
20
- )
21
- from llama_index.core.node_parser import SemanticSplitterNodeParser
22
-
23
- import os
24
- import faiss
25
- import pickle
26
- import spacy
27
-
28
- # Load NLP model
29
- # nlp = spacy.load("en_core_web_sm")
30
-
31
-
32
- GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
33
-
34
- # Function to load documents
35
- def load_documents(filename="documents.pkl"):
36
- with open(filename, "rb") as file:
37
- return pickle.load(file)
38
-
39
- # Load stored documents
40
- loaded_docs = load_documents()
41
-
42
- # Function to split text into sentences
43
- # def spacy_sentence_splitter(text):
44
- # doc = nlp(text)
45
- # return [sent.text for sent in doc.sents]
46
- embed_model = GeminiEmbedding(model_name="models/embedding-001", use_async=False)
47
- splitter = SemanticSplitterNodeParser(
48
- buffer_size=5, breakpoint_percentile_threshold=95, embed_model=embed_model
49
- )
50
- # splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50, separator="\n")
51
- nodes = splitter.get_nodes_from_documents([doc for doc in loaded_docs])
52
- chunked_documents = [Document(text=node.text, metadata=node.metadata) for node in nodes]
53
- # Process documents
54
- # chunked_documents = [
55
- # Document(text=chunk_text, metadata=doc.metadata)
56
- # for doc in loaded_docs for chunk_text in spacy_sentence_splitter(doc.text)
57
- # ]
58
-
59
- # Configure LLM and embeddings
60
- Settings.llm = Gemini(model="models/gemini-2.0-flash", api_key=GOOGLE_API_KEY, temperature=0.5)
61
-
62
- dimension = 768
63
- faiss_index = faiss.IndexFlatL2(dimension)
64
- vector_store = FaissVectorStore(faiss_index=faiss_index)
65
- storage_context = StorageContext.from_defaults(vector_store=vector_store)
66
-
67
- # Build index
68
- index = VectorStoreIndex.from_documents(
69
- documents=chunked_documents,
70
- storage_context=storage_context,
71
- embed_model=embed_model,
72
- show_progress=True
73
- )
74
- index.storage_context.persist()
75
-
76
- # Initialize memory
77
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
78
-
79
- def get_chat_history():
80
- return memory.load_memory_variables({})["chat_history"]
81
-
82
- # Define chatbot prompt template
83
- prompt_template = PromptTemplate(
84
- """You are a friendly college counselor with expertise in Indian technical institutes.
85
- Previous conversation context (if any):\n{chat_history}\n\n
86
- Available college information:\n{context_str}\n\n"
87
- User query: {query_str}\n\n
88
- Instructions:\n
89
- 1. Provide a brief, direct answer using only the information available above\n
90
- 2. If specific data is not available, clearly state that\n
91
- 3. Keep responses under 3 sentences when possible\n
92
- 4. If comparing colleges, use bullet points for clarity\n
93
- 5. Use a friendly, conversational tone\n
94
- 6. Always be interactive and ask follow-up questions\n
95
- 7. Always try to give answers in points each point should focus on single aspect of the response.\n
96
- 8. Always try to give conclusion of your answer in the end for the user to take a decision.\n
97
- 9. Whenever you don't know the answer to a particular query, just reply with conatct to the respective college mentor to the user. \n
98
- 10. Try to give answers in tabular format wherever needed.
99
- Response:"""
100
- )
101
-
102
- # Configure retrieval and query engine
103
- vector_retriever = index.as_retriever(similarity_top_k=10)
104
- bm25_retriever = BM25Retriever.from_defaults(index=index, similarity_top_k=10)
105
- hybrid_retriever = QueryFusionRetriever(
106
- [vector_retriever, bm25_retriever],
107
- similarity_top_k=10,
108
- num_queries=10,
109
- mode="reciprocal_rerank",
110
- use_async=False
111
- )
112
-
113
- reranker = SentenceTransformerRerank(
114
- model="cross-encoder/ms-marco-MiniLM-L-2-v2",
115
- top_n=10,
116
- )
117
-
118
- query_engine = RetrieverQueryEngine.from_args(
119
- retriever=hybrid_retriever,
120
- node_postprocessors=[reranker],
121
- llm=Settings.llm,
122
- verbose=True,
123
- prompt_template=prompt_template,
124
- use_async=False,
125
- )
126
-
127
- # Streamlit UI
128
- st.title("📚 Precollege Chatbot")
129
- st.write("Ask me anything about different colleges and their courses!")
130
-
131
- # Custom CSS for WhatsApp-like interface
132
- st.markdown("""
133
- <style>
134
- body {
135
- background-color: #111b21;
136
- color: #e9edef;
137
- }
138
- .stApp {
139
- background-color: #111b21;
140
- }
141
- .chat-container {
142
- padding: 10px;
143
- color: #111b21;
144
- }
145
- .user-message {
146
- background-color: #005c4b;
147
- color: #e9edef;
148
- padding: 10px 15px;
149
- border-radius: 15px;
150
- margin: 5px 0;
151
- max-width: 70%;
152
- margin-left: auto;
153
- margin-right: 10px;
154
- }
155
- .ai-message {
156
- background-color: #1f2c33;
157
- color: #e9edef;
158
- padding: 10px 15px;
159
- border-radius: 15px;
160
- margin: 5px 0;
161
- max-width: 70%;
162
- margin-right: auto;
163
- margin-left: 10px;
164
- box-shadow: 0 1px 2px rgba(255,255,255,0.1);
165
- }
166
- .message-container {
167
- display: flex;
168
- margin-bottom: 10px;
169
- }
170
- .stTextInput input {
171
- border-radius: 20px;
172
- padding: 10px 20px;
173
- border: 1px solid #ccc;
174
- background-color: #2a3942;
175
- color: #e9edef;
176
- }
177
- .stButton button {
178
- border-radius: 50%; /* Make it circular */
179
- width: 40px;
180
- height: 40px;
181
- padding: 0px;
182
- background-color: #005c4b;
183
- color: #e9edef;
184
- font-size: 20px;
185
- display: flex;
186
- align-items: center;
187
- justify-content: center;
188
- border: none;
189
- cursor: pointer;
190
- }
191
- .stButton button:hover {
192
- background-color: #00735e;
193
- }
194
- div[data-testid="stToolbar"] {
195
- display: none;
196
- }
197
- .stMarkdown {
198
- color: #e9edef;
199
- }
200
- header {
201
- background-color: #202c33 !important;
202
- }
203
- </style>
204
- """, unsafe_allow_html=True)
205
-
206
- if "chat_history" not in st.session_state:
207
- st.session_state.chat_history = []
208
-
209
- # Create a container for chat messages
210
- chat_container = st.container()
211
-
212
- # Create a form for input
213
- with st.form(key="message_form", clear_on_submit=True):
214
- col1, col2 = st.columns([5,1])
215
- with col1:
216
- user_input = st.text_input("", placeholder="Type a message...", label_visibility="collapsed")
217
- with col2:
218
- submit_button = st.form_submit_button("➤")
219
-
220
- if submit_button and user_input.strip():
221
- chat_history = get_chat_history()
222
- query_bundle = QueryBundle(query_str=f"{chat_history}\n\nUser: {user_input}")
223
- response_obj = query_engine.query(query_bundle)
224
- response_text = str(response_obj.response) if hasattr(response_obj, "response") else str(response_obj)
225
-
226
- memory.save_context({"query_str": user_input}, {"response": response_text})
227
- st.session_state.chat_history.append(("You", user_input))
228
- st.session_state.chat_history.append(("AI", response_text))
229
-
230
- # Display chat history with custom styling
231
- with chat_container:
232
- for role, message in st.session_state.chat_history:
233
- message = message.replace("</div>", "").replace("<div>", "") # Sanitize the message
234
- if role == "You":
235
- st.markdown(
236
- f'<div class="message-container"><div class="user-message">{message}</div></div>',
237
- unsafe_allow_html=True
238
- )
239
- else:
240
- st.markdown(
241
- f'<div class="message-container"><div class="ai-message">{message}</div></div>',
242
- unsafe_allow_html=True
243
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.memory import ConversationBufferMemory
3
+ from llama_index.core.indices.query.schema import QueryBundle
4
+ from llama_index.core import Document, VectorStoreIndex
5
+ from llama_index.core.text_splitter import SentenceSplitter
6
+ from llama_index.core.retrievers import QueryFusionRetriever
7
+ from llama_index.retrievers.bm25 import BM25Retriever
8
+ from llama_index.core.postprocessor import SentenceTransformerRerank
9
+ from llama_index.core.prompts import PromptTemplate
10
+ from llama_index.core.query_engine import RetrieverQueryEngine
11
+ from llama_index.embeddings.gemini import GeminiEmbedding
12
+ from llama_index.llms.gemini import Gemini
13
+ from llama_index.core import Settings
14
+ from llama_index.vector_stores.faiss import FaissVectorStore
15
+ from llama_index.core import (
16
+ SimpleDirectoryReader,
17
+ load_index_from_storage,
18
+ VectorStoreIndex,
19
+ StorageContext,
20
+ )
21
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
22
+
23
+ import os
24
+ import faiss
25
+ import pickle
26
+ import spacy
27
+
28
+ # Load NLP model
29
+ # nlp = spacy.load("en_core_web_sm")
30
+
31
+ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
32
+
33
+ # Function to load documents
34
+ def load_documents(filename="documents.pkl"):
35
+ with open(filename, "rb") as file:
36
+ return pickle.load(file)
37
+
38
+ # Load stored documents
39
+ loaded_docs = load_documents()
40
+
41
+ # Function to split text into sentences
42
+ # def spacy_sentence_splitter(text):
43
+ # doc = nlp(text)
44
+ # return [sent.text for sent in doc.sents]
45
+ embed_model = GeminiEmbedding(model_name="models/embedding-001", use_async=False)
46
+ splitter = SemanticSplitterNodeParser(
47
+ buffer_size=5, breakpoint_percentile_threshold=95, embed_model=embed_model
48
+ )
49
+ # splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50, separator="\n")
50
+ nodes = splitter.get_nodes_from_documents([doc for doc in loaded_docs])
51
+ chunked_documents = [Document(text=node.text, metadata=node.metadata) for node in nodes]
52
+ # Process documents
53
+ # chunked_documents = [
54
+ # Document(text=chunk_text, metadata=doc.metadata)
55
+ # for doc in loaded_docs for chunk_text in spacy_sentence_splitter(doc.text)
56
+ # ]
57
+
58
+ # Configure LLM and embeddings
59
+ Settings.llm = Gemini(model="models/gemini-2.0-flash", api_key=GOOGLE_API_KEY, temperature=0.5)
60
+
61
+ dimension = 768
62
+ faiss_index = faiss.IndexFlatL2(dimension)
63
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
64
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
65
+
66
+ # Build index
67
+ index = VectorStoreIndex.from_documents(
68
+ documents=chunked_documents,
69
+ storage_context=storage_context,
70
+ embed_model=embed_model,
71
+ show_progress=True
72
+ )
73
+ index.storage_context.persist()
74
+
75
+ # Initialize memory
76
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
77
+
78
+ def get_chat_history():
79
+ return memory.load_memory_variables({})["chat_history"]
80
+
81
+ # Define chatbot prompt template
82
+ prompt_template = PromptTemplate(
83
+ """You are a friendly college counselor with expertise in Indian technical institutes.
84
+ Previous conversation context (if any):\n{chat_history}\n\n
85
+ Available college information:\n{context_str}\n\n"
86
+ User query: {query_str}\n\n
87
+ Instructions:\n
88
+ 1. Provide a brief, direct answer using only the information available above\n
89
+ 2. If specific data is not available, clearly state that\n
90
+ 3. Keep responses under 3 sentences when possible\n
91
+ 4. If comparing colleges, use bullet points for clarity\n
92
+ 5. Use a friendly, conversational tone\n
93
+ 6. Always be interactive and ask follow-up questions\n
94
+ 7. Always try to give answers in points each point should focus on single aspect of the response.\n
95
+ 8. Always try to give conclusion of your answer in the end for the user to take a decision.\n
96
+ Response:"""
97
+ )
98
+
99
+ # Configure retrieval and query engine
100
+ vector_retriever = index.as_retriever(similarity_top_k=10)
101
+ bm25_retriever = BM25Retriever.from_defaults(index=index, similarity_top_k=10)
102
+ hybrid_retriever = QueryFusionRetriever(
103
+ [vector_retriever, bm25_retriever],
104
+ similarity_top_k=10,
105
+ num_queries=10,
106
+ mode="reciprocal_rerank",
107
+ use_async=False
108
+ )
109
+
110
+ reranker = SentenceTransformerRerank(
111
+ model="cross-encoder/ms-marco-MiniLM-L-2-v2",
112
+ top_n=10,
113
+ )
114
+
115
+ query_engine = RetrieverQueryEngine.from_args(
116
+ retriever=hybrid_retriever,
117
+ node_postprocessors=[reranker],
118
+ llm=Settings.llm,
119
+ verbose=True,
120
+ prompt_template=prompt_template,
121
+ use_async=False,
122
+ )
123
+
124
+ # Streamlit UI
125
+ st.title("📚 Precollege Chatbot")
126
+ st.write("Ask me anything about different colleges and their courses!")
127
+
128
+ # Custom CSS for WhatsApp-like interface
129
+ st.markdown("""
130
+ <style>
131
+ body {
132
+ background-color: #111b21;
133
+ color: #e9edef;
134
+ }
135
+ .stApp {
136
+ background-color: #111b21;
137
+ }
138
+ .chat-container {
139
+ padding: 10px;
140
+ color: #111b21;
141
+ }
142
+ .user-message {
143
+ background-color: #005c4b;
144
+ color: #e9edef;
145
+ padding: 10px 15px;
146
+ border-radius: 15px;
147
+ margin: 5px 0;
148
+ max-width: 70%;
149
+ margin-left: auto;
150
+ margin-right: 10px;
151
+ }
152
+ .ai-message {
153
+ background-color: #1f2c33;
154
+ color: #e9edef;
155
+ padding: 10px 15px;
156
+ border-radius: 15px;
157
+ margin: 5px 0;
158
+ max-width: 70%;
159
+ margin-right: auto;
160
+ margin-left: 10px;
161
+ box-shadow: 0 1px 2px rgba(255,255,255,0.1);
162
+ }
163
+ .ai-message table {
164
+ border-collapse: collapse;
165
+ width: 100%;
166
+ margin: 10px 0;
167
+ }
168
+ .ai-message th, .ai-message td {
169
+ border: 1px solid #e9edef;
170
+ padding: 8px;
171
+ text-align: left;
172
+ }
173
+ .ai-message th {
174
+ background-color: #2a3942;
175
+ }
176
+ .message-container {
177
+ display: flex;
178
+ margin-bottom: 10px;
179
+ }
180
+ .stTextInput input {
181
+ border-radius: 20px;
182
+ padding: 10px 20px;
183
+ border: 1px solid #ccc;
184
+ background-color: #2a3942;
185
+ color: #e9edef;
186
+ }
187
+ .stButton button {
188
+ border-radius: 50%; /* Make it circular */
189
+ width: 40px;
190
+ height: 40px;
191
+ padding: 0px;
192
+ background-color: #005c4b;
193
+ color: #e9edef;
194
+ font-size: 20px;
195
+ display: flex;
196
+ align-items: center;
197
+ justify-content: center;
198
+ border: none;
199
+ cursor: pointer;
200
+ }
201
+ .stButton button:hover {
202
+ background-color: #00735e;
203
+ }
204
+ div[data-testid="stToolbar"] {
205
+ display: none;
206
+ }
207
+ .stMarkdown {
208
+ color: #e9edef;
209
+ }
210
+ header {
211
+ background-color: #202c33 !important;
212
+ }
213
+ .ai-message table.ai-table {
214
+ border-collapse: collapse;
215
+ width: 100%;
216
+ margin: 10px 0;
217
+ background-color: #2a3942;
218
+ }
219
+
220
+ .ai-message table.ai-table th,
221
+ .ai-message table.ai-table td {
222
+ border: 1px solid #e9edef;
223
+ padding: 8px;
224
+ text-align: left;
225
+ color: #e9edef;
226
+ }
227
+
228
+ .ai-message table.ai-table th {
229
+ background-color: #005c4b;
230
+ font-weight: bold;
231
+ }
232
+
233
+ .ai-message table.ai-table tr:nth-child(even) {
234
+ background-color: #1f2c33;
235
+ }
236
+ </style>
237
+ """, unsafe_allow_html=True)
238
+
239
+ if "chat_history" not in st.session_state:
240
+ st.session_state.chat_history = []
241
+
242
+ # Create a container for chat messages
243
+ chat_container = st.container()
244
+
245
+ # Create a form for input
246
+ with st.form(key="message_form", clear_on_submit=True):
247
+ col1, col2 = st.columns([5,1])
248
+ with col1:
249
+ user_input = st.text_input("", placeholder="Type a message...", label_visibility="collapsed")
250
+ with col2:
251
+ submit_button = st.form_submit_button("➤")
252
+
253
+ if submit_button and user_input.strip():
254
+ chat_history = get_chat_history()
255
+ query_bundle = QueryBundle(query_str=f"{chat_history}\n\nUser: {user_input}")
256
+ response_obj = query_engine.query(query_bundle)
257
+ response_text = str(response_obj.response) if hasattr(response_obj, "response") else str(response_obj)
258
+
259
+ memory.save_context({"query_str": user_input}, {"response": response_text})
260
+ st.session_state.chat_history.append(("You", user_input))
261
+ st.session_state.chat_history.append(("AI", response_text))
262
+
263
+ # Display chat history with custom styling
264
+ with chat_container:
265
+ for role, message in st.session_state.chat_history:
266
+ message = message.replace("</div>", "").replace("<div>", "") # Sanitize the message
267
+ if role == "You":
268
+ st.markdown(
269
+ f'<div class="message-container"><div class="user-message">{message}</div></div>',
270
+ unsafe_allow_html=True
271
+ )
272
+ else:
273
+ # Convert markdown tables to HTML tables with proper styling
274
+ if "|" in message and "-|-" in message: # Detect markdown tables
275
+ # Split the message into lines
276
+ lines = message.split("\n")
277
+ table_html = []
278
+ in_table = False
279
+ formatted_lines = []
280
+
281
+ for line in lines:
282
+ if "|" in line:
283
+ if not in_table:
284
+ in_table = True
285
+ table_html.append('<table class="ai-table">')
286
+ # Add header
287
+ header = line.strip().strip("|").split("|")
288
+ table_html.append("<tr>")
289
+ for h in header:
290
+ table_html.append(f"<th>{h.strip()}</th>")
291
+ table_html.append("</tr>")
292
+ elif "-|-" not in line: # Skip separator line
293
+ # Add row
294
+ row = line.strip().strip("|").split("|")
295
+ table_html.append("<tr>")
296
+ for cell in row:
297
+ table_html.append(f"<td>{cell.strip()}</td>")
298
+ table_html.append("</tr>")
299
+ else:
300
+ if in_table:
301
+ in_table = False
302
+ table_html.append("</table>")
303
+ formatted_lines.append("".join(table_html))
304
+ table_html = []
305
+ formatted_lines.append(line)
306
+
307
+ if in_table:
308
+ table_html.append("</table>")
309
+ formatted_lines.append("".join(table_html))
310
+
311
+ message = "\n".join(formatted_lines)
312
+
313
+ st.markdown(
314
+ f'<div class="message-container"><div class="ai-message">{message}</div></div>',
315
+ unsafe_allow_html=True
316
+ )