File size: 17,705 Bytes
a3844fa
 
 
 
 
 
fe29cec
 
 
 
a3844fa
fe29cec
abf0d78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe29cec
abf0d78
 
fe29cec
52ded98
abf0d78
 
 
52ded98
 
fe29cec
 
52ded98
 
 
 
 
fe29cec
abf0d78
fe29cec
 
abf0d78
fe29cec
abf0d78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe29cec
69f5a0f
 
fe29cec
69f5a0f
 
 
 
 
 
 
30549d9
fe29cec
 
30549d9
 
69f5a0f
fe29cec
69f5a0f
abf0d78
 
 
30549d9
abf0d78
 
 
fe29cec
1253858
 
 
 
 
abf0d78
 
 
 
 
 
 
 
fe29cec
3418ad9
abf0d78
 
69f5a0f
 
1253858
abf0d78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aeacfd0
abf0d78
 
 
 
aeacfd0
abf0d78
aeacfd0
 
 
 
 
 
abf0d78
 
 
 
 
 
aeacfd0
 
abf0d78
aeacfd0
abf0d78
 
 
 
 
aeacfd0
 
 
 
 
 
 
abf0d78
 
 
 
 
 
 
 
 
aeacfd0
abf0d78
 
 
aeacfd0
 
abf0d78
fe29cec
 
 
 
 
abf0d78
 
3418ad9
abf0d78
 
 
 
 
 
 
 
 
fe29cec
3418ad9
fe29cec
abf0d78
 
 
fe29cec
69f5a0f
abf0d78
 
 
c711d65
fe29cec
 
1253858
 
abf0d78
1253858
ca8499e
abf0d78
922048a
0cc8106
fe29cec
 
 
abf0d78
 
 
 
 
 
fe29cec
1253858
fe29cec
abf0d78
fe29cec
 
 
abf0d78
 
fe29cec
c861239
fe29cec
 
c861239
 
fe29cec
 
c711d65
 
fe29cec
c711d65
fe29cec
52ded98
1253858
52ded98
 
 
fe29cec
 
 
 
 
 
 
52ded98
abf0d78
 
9827cc8
fe29cec
868799e
9827cc8
868799e
 
fe29cec
abf0d78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe29cec
abf0d78
 
 
fe29cec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69f5a0f
 
 
 
 
 
fe29cec
 
 
 
 
 
abf0d78
 
 
 
 
52ded98
abf0d78
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
"""
The primary function of this application is artificial intelligence-based question answering. In the dynamic landscape of AI, new 
technologies and trends constantly emerge, rendering conventional data insufficient to address real-time challenges. To tackle this 
issue, the application leverages advanced Retrieval Augmented Generation (RAG) and content scraping techniques. Despite having limited 
knowledge due to storage and cost constraints with OpenAI, the application possesses some understanding of data science stored in vector 
format. When users inquire about topics not covered in the custom data, the application utilizes SERPAPI and advanced RAG methods to 
incorporate unavailable context and resolve knowledge gaps effectively. The project initially utilizes pre-generated embeddings from a small pool of research papers.
However, generating content from recent articles and research papers, which require new vector embeddings each time, presents challenges.
The project aims to mitigate costs, minimize hallucinations, and enhance accuracy in its approach.

"""

import os
import os.path
import serpapi
import requests
import feedparser
import streamlit as st
from typing import List
from docx import Document
from bs4 import BeautifulSoup
import huggingface_hub as hfh
from urllib.parse import quote
from llama_index.llms.openai import OpenAI
from langchain_community.document_loaders import WebBaseLoader
from llama_index.embeddings.openai import OpenAIEmbedding
from langchain_community.document_loaders import PyPDFLoader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.node_parser import get_leaf_nodes, HierarchicalNodeParser, get_root_nodes, SentenceSplitter
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SimilarityPostprocessor
from llama_index.core import (VectorStoreIndex, SimpleDirectoryReader, ServiceContext, load_index_from_storage,
                              StorageContext, Document, Settings, get_response_synthesizer, set_global_service_context)

import warnings

# Suppress all Python warnings for this process.
warnings.filterwarnings("ignore")

# Seed the per-session slots used further down in the file. A slot that is
# already present (for instance on a rerun) keeps its current value.
for _state_key in ('vector_index', 'cohere_api_key', 'serp_api_key', 'storage_context'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None

# Browser-tab title/icon and overall page layout.
st.set_page_config(
    page_title="Quik Querium AI Genie",
    page_icon="🧞",
    layout="wide",
    initial_sidebar_state="expanded",
)


def setting_api_key(openai_api_key, serp_api_key):
    """Register the API credentials the application needs.

    The OpenAI and SerpAPI keys are supplied by the caller. The Hugging Face
    token and the Cohere key are read from the ``hf_token`` and
    ``cohere_api_key`` environment variables.

    Args:
        openai_api_key: Exported as the ``OPENAI_API_KEY`` environment variable.
        serp_api_key: Stored in ``st.session_state.serp_api_key``.

    Returns:
        None. Problems are reported with ``st.warning`` rather than raised.
    """
    try:
        os.environ['OPENAI_API_KEY'] = openai_api_key
        # Stored before the environment-based credentials are handled, so a
        # problem with those can no longer leave the SerpAPI key unset.
        st.session_state.serp_api_key = serp_api_key

        st.session_state.hf_token = os.getenv("hf_token")
        if st.session_state.hf_token:
            hfh.login(token=st.session_state.hf_token)
        else:
            # Calling login() without a token would not be a token-based login.
            st.warning("Environment variable 'hf_token' is not set; skipping Hugging Face login.")

        cohere_api_key = os.getenv("cohere_api_key")
        if cohere_api_key:
            os.environ["COHERE_API_KEY"] = cohere_api_key
            # The rest of the file reads the key from session state.
            st.session_state.cohere_api_key = cohere_api_key
        else:
            # Assigning None to os.environ raises an unhelpful TypeError.
            st.warning("Environment variable 'cohere_api_key' is not set; no Cohere API key is available.")

    except Exception as e:
        st.warning(e)


# System prompt handed to the LLM below. It limits the assistant to AI / data-science topics, asks it to
# prefix its answer with "RAG Output :" and tells it not to answer on its own when the context is unrelated.
# NOTE(review): the text contains a {query_str} placeholder, but in this file it is only passed as
# `system_prompt`; confirm that the placeholder is actually substituted somewhere.
template = """<|system|>
              you are a Question answering system based AI, Machine Learning , Deep Learning , Generative AI,
              Data science, Data Analytics and Mathematics.
              Mention Clearly Before response " RAG Output :\n".
              Please check if the following pieces of context has any mention of the keywords provided
              in the question.Generate response as much as you could with context you get.
              if the following pieces of Context does not relate to Question, You must not answer on your own, you don't know the answer,
              </s>
              <|user|>
              Question:{query_str}</s>
              <|assistant|> """

# Global LLM: low temperature, answers capped at 512 tokens, using the system prompt above.
Settings.llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0.1, model_kwargs={'trust_remote_code': True},
                      max_tokens=512, system_prompt=template)

# Global embedding model used when building vector indexes.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# The same LLM / embedding pair is also registered through the ServiceContext API.
# NOTE(review): presumably only one of Settings / ServiceContext is needed — confirm against the
# installed llama-index version.
service_context = ServiceContext.from_defaults(embed_model=Settings.embed_model, llm=Settings.llm)
set_global_service_context(service_context)


def hierarchical_split(documents):
    """Parse ``documents`` with a hierarchical node parser (chunk sizes 2048, 512, 128).

    Returns the nodes produced by the parser.
    """
    parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])
    return parser.get_nodes_from_documents(documents)


def hierarchical_split_research_paper_article(documents):
    """Parse ``documents`` with a hierarchical node parser using chunk sizes 512, 256 and 64.

    Returns the nodes produced by the parser.
    """
    small_chunk_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[512, 256, 64])
    return small_chunk_parser.get_nodes_from_documents(documents)


def storage_ctx(nodes):
    """Return a ``StorageContext`` whose document store has been filled with ``nodes``."""
    node_store = SimpleDocumentStore()
    node_store.add_documents(nodes)
    return StorageContext.from_defaults(docstore=node_store)


def saving_vectors(vector_index):
    """Persist the storage context of ``vector_index`` to the ``vector_index/`` directory.

    Author's note: saving the vectors does not work inside a Hugging Face Space.
    """
    index_storage = vector_index.storage_context
    index_storage.persist(persist_dir="vector_index/")


def create_vector_index(nodes, storage_context):
    """Create a ``VectorStoreIndex`` over ``nodes`` that uses ``storage_context``.

    The index is not persisted here; the author notes that saving did not work on Hugging Face.
    """
    return VectorStoreIndex(nodes, storage_context=storage_context)


def search_arxiv(query, max_results=8):
    """Search the arXiv export API for ``query``.

    Args:
        query: Search text; it is URL-quoted before being placed in the request.
        max_results: Value sent as the ``max_results`` request parameter.

    Returns:
        A list of ``{'Title': ..., 'URL': ...}`` dicts, one per feed entry.
    """
    endpoint = 'http://export.arxiv.org/api/query?'
    request_url = f'{endpoint}search_query={quote(query)}&start=0&max_results={max_results}'
    parsed_feed = feedparser.parse(request_url)
    return [{'Title': item.title, 'URL': item.link} for item in parsed_feed.entries]


def remove_empty_lines(lines):
    """Join the non-blank entries of ``lines`` with single spaces.

    An entry is dropped when it is empty or whitespace-only; kept entries are
    not stripped.
    """
    return ' '.join(entry for entry in lines if entry.strip())


def _collect_result_links(search_results):
    """Return the links of organic results and related questions that carry both a title and a link."""
    return [
        result["link"]
        for result_type in ("organic_results", "related_questions")
        for result in search_results.get(result_type, [])
        if "title" in result and "link" in result
    ]


def _fetch_article_text(link, request_timeout):
    """Download ``link`` and return its paragraph/heading text as one line ('' when nothing usable)."""
    try:
        response = requests.get(link, timeout=request_timeout)
        if response.status_code != 200:
            return ""

        soup = BeautifulSoup(response.content, "html.parser")
        content_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        document = "".join(tag.text + "\n" for tag in content_tags)

        if not document:
            # No paragraph or heading tags were found: fall back to the generic web loader.
            loaded = WebBaseLoader(link).load()
            document = loaded[0].page_content if loaded else ""
    except requests.RequestException:
        # One unreachable or slow site must not abort the whole search.
        return ""

    return remove_empty_lines(document.split('\n'))


def _find_arxiv_pdf_url(page_url, request_timeout):
    """Return the absolute PDF URL linked from an arXiv page, or None when it cannot be found."""
    try:
        response = requests.get(page_url, timeout=request_timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    download_link = soup.find("a", class_="abs-button download-pdf")
    if not download_link:
        return None

    pdf_url = download_link['href']
    if not pdf_url.startswith("http"):
        pdf_url = "https://arxiv.org" + pdf_url
    return pdf_url


def _load_pdf_text(pdf_url):
    """Return the text of the PDF at ``pdf_url`` as one line ('' when it cannot be downloaded)."""
    try:
        pages = PyPDFLoader(pdf_url).load_and_split()
    except (ValueError, OSError, requests.RequestException):
        # Best effort: a paper that cannot be fetched is simply skipped.
        return ""

    page_texts = (remove_empty_lines(page.page_content.split('\n')) for page in pages)
    # Pages are joined with a space so the last word of one page does not
    # fuse with the first word of the next.
    return ' '.join(text for text in page_texts if text)


def get_article_and_arxiv_content(query, request_timeout=15):
    """Collect fresh text about ``query`` from a Google search (via SerpAPI) and from arXiv.

    Args:
        query: The user's question, used for both the web and the arXiv search.
        request_timeout: Seconds to wait for each HTTP request made with ``requests``.

    Returns:
        A list of strings: up to three non-empty article texts, followed by
        at most one non-empty research-paper text. The amount of content is
        kept small to get a faster response.
    """
    max_articles = 3
    max_pdf_candidates = 2

    # Article content
    params = {
        "engine": "google",
        "gl": "us",
        "hl": "en",
        "api_key": st.session_state.serp_api_key,
        "q": query
    }
    search_results = serpapi.GoogleSearch(params).get_dict()

    contents = []
    for link in _collect_result_links(search_results):
        article = _fetch_article_text(link, request_timeout)
        if article.strip():
            contents.append(article)
        # Only the first few non-empty articles are returned, so stop fetching once we have them.
        if len(contents) == max_articles:
            break

    # arXiv content
    paper_content = []
    candidates_tried = 0
    for paper in search_arxiv(query):
        pdf_url = _find_arxiv_pdf_url(paper['URL'], request_timeout)
        if pdf_url is None:
            continue

        candidates_tried += 1
        paper_text = _load_pdf_text(pdf_url)
        if paper_text.strip():
            # A single paper is returned, so the first usable one is enough.
            paper_content.append(paper_text)
            break
        if candidates_tried == max_pdf_candidates:
            break

    return contents + paper_content


# The aim was to update the vector index after every search, but the update did not take effect on the Hugging Face Hub, so the amount of content is kept minimal for faster vector generation.


def file_nodes_vector():
    """Return the base vector index together with its storage context.

    The index is loaded from the ``vector_index`` directory (created if it
    does not exist). When loading raises ``FileNotFoundError`` the index is
    built from the documents in ``research_papers/`` instead.

    Returns:
        A ``(vector_index, storage_context)`` tuple.
    """
    index_dir = "vector_index"
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)

    try:
        context = StorageContext.from_defaults(persist_dir=index_dir)
        return load_index_from_storage(context), context
    except FileNotFoundError:
        # Nothing persisted yet: build the index from the bundled papers.
        source_documents = SimpleDirectoryReader(input_dir="research_papers/").load_data()
        all_nodes = hierarchical_split(source_documents)
        leaves = get_leaf_nodes(all_nodes)
        context = storage_ctx(all_nodes)
        return create_vector_index(leaves, context), context


@st.cache_data
def response_generation(query, cohere_api_key, _vector_index, _storage_context, rank_top=7, similarity_cutoff_thr=0.80,
                        similarity_top_nodes=15):
    """Answer ``query`` with an auto-merging retriever followed by re-ranking and a similarity cutoff.

    Args:
        query: The question to answer.
        cohere_api_key: API key handed to the Cohere re-ranker.
        _vector_index: Index used to build the base retriever.
        _storage_context: Storage context handed to the auto-merging retriever.
        rank_top: ``top_n`` kept by the Cohere re-ranker.
        similarity_cutoff_thr: Cutoff applied by the similarity post-processor.
        similarity_top_nodes: ``similarity_top_k`` of the base retriever.

    Returns:
        The response object returned by the query engine.
    """
    reranker = CohereRerank(api_key=cohere_api_key, top_n=rank_top)
    score_filter = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff_thr)

    auto_merging_retriever = AutoMergingRetriever(
        _vector_index.as_retriever(similarity_top_k=similarity_top_nodes),
        _storage_context,
        verbose=False,
    )

    synthesizer = get_response_synthesizer()
    # Post-processors run in list order: metadata replacement, re-ranking, then the cutoff.
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        reranker,
        score_filter,
    ]
    engine = RetrieverQueryEngine(retriever=auto_merging_retriever,
                                  node_postprocessors=node_postprocessors,
                                  response_synthesizer=synthesizer)
    return engine.query(query)


def func_add_new_article_content(content_):
    """Build a throw-away vector index from freshly fetched texts.

    Args:
        content_: Iterable of text strings; each becomes one ``Document``.

    Returns:
        ``(vector_index, nodes, storage_context, leaf_nodes)`` for the new content.
    """
    fetched_documents = []
    for text in content_:
        fetched_documents.append(Document(text=text))

    new_nodes = hierarchical_split_research_paper_article(fetched_documents)
    new_leaf_nodes = get_leaf_nodes(new_nodes)
    new_storage_context = storage_ctx(new_nodes)
    return (
        create_vector_index(new_leaf_nodes, new_storage_context),
        new_nodes,
        new_storage_context,
        new_leaf_nodes,
    )


def updating_vector(new_leaf_nodes):
    """Insert ``new_leaf_nodes`` into the base vector index, persist it and cache it in session state.

    Author's note: the update did not take effect on the Hugging Face Hub. A
    possible cause is that the storage context produced by the hierarchical
    split cannot be updated; updating a local index worked with other
    splitters (semantic split, sentence split, simple node parser). Anyone
    reusing this code is invited to improve it and share the solution.
    """
    base_index, base_storage = file_nodes_vector()
    base_index.insert_nodes(new_leaf_nodes)
    saving_vectors(base_index)

    st.session_state.vector_index = base_index
    st.session_state.storage_context = base_storage


@st.cache_data
def generate_response_article_paper(query):
    """Answer ``query`` from freshly fetched articles and research papers.

    Returns:
        ``(response, new_nodes, new_leaf_nodes)``: the answer plus the nodes
        built from the fetched content.
    """
    fetched_texts = get_article_and_arxiv_content(query)
    fresh_index, new_nodes, fresh_storage, new_leaf_nodes = func_add_new_article_content(fetched_texts)

    # Positional values for rank_top, similarity_cutoff_thr and similarity_top_nodes.
    retrieval_settings = (10, 0.70, 20)
    response = response_generation(query, st.session_state.cohere_api_key, fresh_index, fresh_storage,
                                   *retrieval_settings)
    return response, new_nodes, new_leaf_nodes


def _is_empty_response(response):
    """Return True when ``response`` is falsy or renders as one of the "no answer" placeholders."""
    return not response or str(response) in ("Empty Response", "RAG Output")


def _render_feedback_buttons():
    """Show the thumbs-up / thumbs-down buttons and acknowledge whichever one was pressed."""
    # NOTE(review): pressing a button reruns the script, and on that rerun the
    # "Generate response" button is no longer pressed, so this acknowledgement
    # may never be displayed — confirm, and keep the answer in
    # st.session_state if feedback matters.
    col1, col2 = st.columns([1, 10])
    thumbs_up_button = col1.button("👍")
    thumbs_down_button = col2.button("👎")
    if thumbs_up_button:
        st.write("Thank you for your positive feedback!")
    elif thumbs_down_button:
        st.write("We're sorry , We will improve it.")


def main():
    """Render the Streamlit page: API-key form on the left, question/answer panel on the right.

    A question is first answered from the base vector index. When that yields
    no usable answer, content is fetched from web articles and arXiv and the
    question is answered from that instead. Errors are shown with
    ``st.warning`` rather than raised.
    """
    st.markdown("""<div style="text-align:center;"><h1 style="font-size: 30px;">Genie🧞 : RAG for AI Insights </h1></div>
                """, unsafe_allow_html=True)
    st.markdown("""<div style="text-align:center;"><h1 style="font-size: 17px;">"Interact with our real-time Q&A system, 
                    where you can ask questions on AI-related topics. If the system has the answer, it will respond immediately. 
                    Otherwise, it will fetch real-time information from the articles and research papers to provide you with the most up-to-date response.
                    During the initial run, there may be a delay as the vector embeddings are loaded into the session state."</h1></div>""", unsafe_allow_html=True)

    if 'key_flag' not in st.session_state:
        st.session_state.key_flag = False

    col_left, col_right = st.columns([1, 2])
    with col_left:
        st.write("""<h1 style="font-size: 15px;">Enter your OpenAI API key </h1>""", unsafe_allow_html=True)
        openai_api_key = st.text_input(placeholder="OpenAI api key ", label=" ", type="password")

        st.write("""<h1 style="font-size: 15px;">Enter your SERP API key </h1>""", unsafe_allow_html=True)
        serp_api_key = st.text_input(placeholder="Serp api key ", label=" ", type="password")

        set_keys_button = st.button("Set Keys ", type="primary")

        try:
            if set_keys_button and openai_api_key and serp_api_key:
                setting_api_key(openai_api_key, serp_api_key)
                st.success("Successful 👍")
                st.session_state.key_flag = True
            elif set_keys_button:
                st.warning("Please set the necessary API keys !")
        except Exception as e:
            st.warning(e)

    with col_right:
        st.write("""<h1 style="font-size: 15px;">Enter your Question </h1>""", unsafe_allow_html=True)
        query = st.text_input(placeholder="Ex : Explain Batch normalization ", label=" ")
        generate_response_button = st.button("Generate response", type="primary")

        if generate_response_button and st.session_state.key_flag and str(query):
            try:
                with st.spinner("Generating Response..."):
                    vector_index = st.session_state.get("vector_index")
                    storage_context = st.session_state.get("storage_context")
                    if vector_index is None or storage_context is None:
                        # First question of the session: load (or build) the base index and cache it.
                        vector_index, storage_context = file_nodes_vector()
                        st.session_state.vector_index = vector_index
                        st.session_state.storage_context = storage_context

                    response = response_generation(query, st.session_state.cohere_api_key, vector_index,
                                                   storage_context)

                if _is_empty_response(response):
                    # The base index has no answer: fall back to live articles and papers.
                    try:
                        with st.spinner("Getting Information from Articles and Research Papers, It will take some time..."):
                            paper_response, new_nodes, new_leaf_nodes = generate_response_article_paper(query)

                        # The emptiness check comes first so a placeholder such as
                        # "Empty Response" is never shown as if it were an answer.
                        if _is_empty_response(paper_response):
                            st.write("RAG Couldn't get the results, it will be improved ")
                        else:
                            st.write(str(paper_response))
                            generate_response_article_paper.clear()
                            # updating_vector(new_leaf_nodes) would go here on positive feedback; it is disabled.
                            _render_feedback_buttons()
                    except Exception as e:
                        st.warning(e)
                else:
                    st.write(str(response))
                    response_generation.clear()
                    _render_feedback_buttons()
            except Exception as e:
                st.warning(e)

        elif generate_response_button:
            # The button was pressed but a key and/or the question is missing.
            if not st.session_state.key_flag and not str(query):
                st.warning("Please set the necessary API keys and Enter the query")
            elif not st.session_state.key_flag:
                st.warning("Please set the necessary API keys")
            else:
                st.warning("Please Enter the query !")


# Render the Streamlit UI when this file is executed as the main script.
if __name__ == "__main__":
    main()