"""
The primary function of this application is artificial intelligence-based question answering. In the dynamic landscape of AI, new
technologies and trends constantly emerge, rendering conventional data insufficient to address real-time challenges. To tackle this
issue, the application leverages advanced Retrieval Augmented Generation (RAG) and content scraping techniques. Despite having limited
knowledge due to storage and cost constraints with OpenAI, the application possesses some understanding of data science stored in vector
format. When users inquire about topics not covered in the custom data, the application utilizes SERPAPI and advanced RAG methods to
incorporate unavailable context and resolve knowledge gaps effectively. The project initially utilizes pre-generated embeddings from a small pool of research papers .
However, generating content from recent articles and research papers, which require new vector embeddings each time, presents challenges.
The project aims to mitigate costs, minimize hallucinations, and enhance accuracy in its approach.
"""
import os
import os.path
import serpapi
import requests
import feedparser
import streamlit as st
from typing import List
from docx import Document
from bs4 import BeautifulSoup
import huggingface_hub as hfh
from urllib.parse import quote
from llama_index.llms.openai import OpenAI
from langchain_community.document_loaders import WebBaseLoader
from llama_index.embeddings.openai import OpenAIEmbedding
from langchain_community.document_loaders import PyPDFLoader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.node_parser import get_leaf_nodes, HierarchicalNodeParser, get_root_nodes, SentenceSplitter
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SimilarityPostprocessor
from llama_index.core import (VectorStoreIndex, SimpleDirectoryReader, ServiceContext, load_index_from_storage,
StorageContext, Document, Settings, get_response_synthesizer, set_global_service_context)
import warnings
warnings.filterwarnings("ignore")
if 'vector_index' not in st.session_state:
st.session_state.vector_index = None
if 'cohere_api_key' not in st.session_state:
st.session_state.cohere_api_key = None
if 'serp_api_key' not in st.session_state:
st.session_state.serp_api_key = None
if 'storage_context' not in st.session_state:
st.session_state.storage_context = None
st.set_page_config(
page_title="Quik Querium AI Genie",
page_icon="π§",
layout="wide",
initial_sidebar_state="expanded",
)
def setting_api_key(openai_api_key, serp_api_key):
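    """Store the user-supplied OpenAI and SerpAPI keys; the Hugging Face token and
    Cohere key are read from the Space's environment variables."""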
try:
os.environ['OPENAI_API_KEY'] = openai_api_key
st.session_state.hf_token = os.getenv("hf_token")
hfh.login(token=st.session_state.hf_token)
os.environ["COHERE_API_KEY"] = os.getenv("cohere_api_key")
st.session_state.serp_api_key = serp_api_key
except Exception as e:
st.warning(e)
template = """<|system|>
you are a Question answering system based AI, Machine Learning , Deep Learning , Generative AI,
Data science, Data Analytics and Mathematics.
Mention Clearly Before response " RAG Output :\n".
Please check if the following pieces of context has any mention of the keywords provided
in the question.Generate response as much as you could with context you get.
if the following pieces of Context does not relate to Question, You must not answer on your own, you don't know the answer,
</s>
<|user|>
Question:{query_str}</s>
<|assistant|> """
Settings.llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0.1, max_tokens=512, system_prompt=template)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
service_context = ServiceContext.from_defaults(embed_model=Settings.embed_model, llm=Settings.llm)
set_global_service_context(service_context)
def hierarchical_split(documents):
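    """Split documents into a three-level node hierarchy (chunk sizes 2048/512/128)."""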
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])
nodes = node_parser.get_nodes_from_documents(documents)
return nodes
def hierarchical_split_research_paper_article(documents):
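    """Split freshly scraped articles and papers into a smaller 512/256/64 hierarchy."""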
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[512, 256, 64])
nodes = node_parser.get_nodes_from_documents(documents)
return nodes
def storage_ctx(nodes):
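    """Build a storage context whose docstore holds every hierarchy node, as required by AutoMergingRetriever."""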
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)
return storage_context
def saving_vectors(vector_index):  # persisting vectors does not work inside a Hugging Face Space
vector_index.storage_context.persist(persist_dir="vector_index/")
def create_vector_index(nodes, storage_context):
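    """Create a vector index over the given nodes (persistence is skipped, see saving_vectors)."""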
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
    # saving_vectors(vector_index)  # persisting did not work on Hugging Face Spaces
return vector_index
def search_arxiv(query, max_results=8):
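    """Query the arXiv Atom API and return up to max_results dicts with 'Title' and 'URL' keys."""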
encoded_query = quote(query)
base_url = 'http://export.arxiv.org/api/query?'
query_url = f'{base_url}search_query={encoded_query}&start=0&max_results={max_results}'
feed = feedparser.parse(query_url)
papers = []
for entry in feed.entries:
paper_info = {
'Title': entry.title,
'URL': entry.link
}
papers.append(paper_info)
return papers
def remove_empty_lines(lines):
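    """Drop blank lines and join the remainder into a single space-separated string."""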
non_empty_lines = [line for line in lines if line.strip()]
return ' '.join(non_empty_lines)
def get_article_and_arxiv_content(query):
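    """Fetch article text (SerpAPI search + BeautifulSoup scraping) and arXiv paper text for the query."""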
    # Article content via a SerpAPI Google search
    serpapi_api_key = st.session_state.serp_api_key
    search_engine = "google"  # alternative: "bing"
    params = {
        "engine": search_engine,
        "gl": "us",
        "hl": "en",
        "api_key": serpapi_api_key,
        "q": query
    }
serpapi_wrapper = serpapi.GoogleSearch(params)
search_results = serpapi_wrapper.get_dict()
results = []
for result_type in ["organic_results", "related_questions"]:
if result_type in search_results:
for result in search_results[result_type]:
if "title" in result and "link" in result:
# Extract title and link
item = {"title": result["title"], "link": result["link"]}
results.append(item)
    # Collect the links of the retrieved articles
links = [result['link'] for result in results]
contents = []
for link in links:
response = requests.get(link)
if response.status_code == 200:
soup = BeautifulSoup(response.content, "html.parser")
content_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
document = ""
for tag in content_tags:
document += tag.text + "\n"
if not document:
loader = WebBaseLoader(link)
document_ = loader.load()
document = document_[0].page_content
article = remove_empty_lines(document.split('\n'))
contents.append(article)
# arXiv content
papers_to_download = search_arxiv(query)
papers_urls = []
for paper in papers_to_download:
page_url = paper['URL']
response = requests.get(page_url)
if response.status_code == 200:
soup = BeautifulSoup(response.content, "html.parser")
download_link = soup.find("a", class_="abs-button download-pdf")
if download_link:
pdf_url = download_link['href']
if not pdf_url.startswith("http"):
pdf_url = "https://arxiv.org" + pdf_url
papers_urls.append(pdf_url)
paper_content = []
for url_ in papers_urls[:2]:
loader = PyPDFLoader(url_)
pages = loader.load_and_split()
paper_text = ''
for page in pages:
page_text = remove_empty_lines(page.page_content.split('\n'))
paper_text += ''.join(page_text)
if paper_text:
paper_content.append(paper_text)
contents = [content for content in contents if content.strip()]
paper_content = [content for content in paper_content if content.strip()]
    return contents[:3] + paper_content[:1]  # content trimmed to keep responses fast
    # The aim was to update the vector index after every search, but the update did not work on the
    # Hugging Face hub, so the amount of new content is kept small for faster vector generation.
def file_nodes_vector():
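    """Load the persisted vector index if present; otherwise build it from the bundled research papers."""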
PERSIST_DIR_vector = "vector_index"
if not os.path.exists(PERSIST_DIR_vector):
os.makedirs(PERSIST_DIR_vector)
try:
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR_vector)
vector_index = load_index_from_storage(storage_context)
except FileNotFoundError:
documents = SimpleDirectoryReader(input_dir="research_papers/").load_data()
nodes = hierarchical_split(documents)
leaf_nodes = get_leaf_nodes(nodes)
storage_context = storage_ctx(nodes)
vector_index = create_vector_index(leaf_nodes, storage_context)
return vector_index, storage_context
@st.cache_data
def response_generation(query, cohere_api_key, _vector_index, _storage_context, rank_top=7, similarity_cutoff_thr=0.80,
similarity_top_nodes=15):
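    """Answer the query with an AutoMergingRetriever over the vector index, then rerank the merged
    nodes with Cohere and drop those below the similarity cutoff. The index and storage context
    arguments are underscore-prefixed so Streamlit's cache does not attempt to hash them."""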
cohere_rerank = CohereRerank(api_key=cohere_api_key, top_n=rank_top)
postprocessor = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff_thr)
base_retriever = _vector_index.as_retriever(similarity_top_k=similarity_top_nodes)
retriever = AutoMergingRetriever(base_retriever, _storage_context, verbose=False)
response_synthesizer = get_response_synthesizer()
query_engine = RetrieverQueryEngine(retriever=retriever,
node_postprocessors=[
MetadataReplacementPostProcessor(target_metadata_key="window"),
cohere_rerank, postprocessor], response_synthesizer=response_synthesizer)
response = query_engine.query(query)
return response
def func_add_new_article_content(content_):
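    """Wrap the scraped texts in Documents, split them hierarchically, and build a fresh vector index."""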
documents = [Document(text=t) for t in content_]
new_nodes = hierarchical_split_research_paper_article(documents)
new_leaf_nodes = get_leaf_nodes(new_nodes)
new_storage_context = storage_ctx(new_nodes)
new_vector_index = create_vector_index(new_leaf_nodes, new_storage_context)
return new_vector_index, new_nodes, new_storage_context, new_leaf_nodes
def updating_vector(new_leaf_nodes):
"""
Update didn't happen in with in hugging-space hub , Possible could be hierarchical_split storage Context not able to update ,
when tries to update Vector index in local with splits like Semantic split , sentencesplit, simple node parser update works.
If anyone Copying this, try to improve and Post the solution.
"""
vector_index, storage_context = file_nodes_vector()
vector_index.insert_nodes(new_leaf_nodes)
saving_vectors(vector_index)
st.session_state.vector_index = vector_index
st.session_state.storage_context = storage_context
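    # A minimal sketch of the locally working alternative mentioned in the docstring above:
    # re-split with a flat SentenceSplitter before inserting (assumes local persistence works).
    # flat_nodes = SentenceSplitter(chunk_size=512).get_nodes_from_documents(
    #     [Document(text=n.get_content()) for n in new_leaf_nodes])
    # vector_index.insert_nodes(flat_nodes)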
@st.cache_data
def generate_response_article_paper(query):
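    """Fallback path: scrape articles and arXiv papers for the query, index them on the fly, and
    answer with looser retrieval settings (rerank top 10, 0.70 cutoff, 20 candidate nodes)."""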
content_ = get_article_and_arxiv_content(query)
new_vector_index, new_nodes, storage_context, new_leaf_nodes = func_add_new_article_content(content_)
rank_top = 10
similarity_cutoff = 0.70
similarity_top_nodes = 20
response = response_generation(query, st.session_state.cohere_api_key, new_vector_index, storage_context, rank_top,
similarity_cutoff, similarity_top_nodes)
return response, new_nodes, new_leaf_nodes
def main():
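    """Streamlit UI: collect API keys, take a question, answer from the local index, and fall back
    to live article/paper retrieval when the index has no answer."""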
st.markdown("""<div style="text-align:center;"><h1 style="font-size: 30px;">Genieπ§ : RAG for AI Insights </h1></div>
""", unsafe_allow_html=True)
st.markdown("""<div style="text-align:center;"><h1 style="font-size: 17px;">"Interact with our real-time Q&A system,
where you can ask questions on AI-related topics. If the system has the answer, it will respond immediately.
Otherwise, it will fetch real-time information from the articles and research papers to provide you with the most up-to-date response.
During the initial run, there may be a delay as the vector embeddings are loaded into the session state."</h1></div>""", unsafe_allow_html=True)
if 'key_flag' not in st.session_state:
st.session_state.key_flag = False
col_left, col_right = st.columns([1, 2])
    with col_left:
        st.write("""<h1 style="font-size: 15px;">Enter your OpenAI API key</h1>""", unsafe_allow_html=True)
        openai_api_key = st.text_input(placeholder="OpenAI API key", label=" ", type="password")
        st.write("""<h1 style="font-size: 15px;">Enter your SERP API key</h1>""", unsafe_allow_html=True)
        serp_api_key = st.text_input(placeholder="SERP API key", label=" ", type="password")
        set_keys_button = st.button("Set Keys", type="primary")
try:
if set_keys_button and openai_api_key and serp_api_key:
setting_api_key(openai_api_key, serp_api_key)
                st.success("Successful 👍")
st.session_state.key_flag = True
elif set_keys_button:
st.warning("Please set the necessary API keys !")
except Exception as e:
st.warning(e)
with col_right:
st.write("""<h1 style="font-size: 15px;">Enter your Question </h1>""", unsafe_allow_html=True)
        query = st.text_input(placeholder="e.g., Explain batch normalization", label=" ")
generate_response_button = st.button("Generate response", type="primary")
if generate_response_button and st.session_state.key_flag and str(query):
            try:
                with st.spinner("Generating Response..."):
                    if st.session_state.get("vector_index") is not None and st.session_state.get("storage_context") is not None:
                        response = response_generation(query, st.session_state.cohere_api_key, st.session_state.vector_index, st.session_state.storage_context)
                    else:
                        vector_index, storage_context = file_nodes_vector()
                        st.session_state.vector_index = vector_index
                        st.session_state.storage_context = storage_context
                        response = response_generation(query, st.session_state.cohere_api_key, vector_index, storage_context)
if str(response) in ["Empty Response", "RAG Output"] or not response:
                try:
                    with st.spinner("Getting information from articles and research papers; this will take some time..."):
                        paper_response, new_nodes, new_leaf_nodes = generate_response_article_paper(query)
                        if paper_response:
                            st.write(str(paper_response))
                            generate_response_article_paper.clear()
                            col1, col2 = st.columns([1, 10])
                            thumbs_up_button = col1.button("👍")
                            thumbs_down_button = col2.button("👎")
                            if thumbs_up_button:
                                st.write("Thank you for your positive feedback!")
                                # updating_vector(new_leaf_nodes)
                            elif thumbs_down_button:
                                st.write("We're sorry, we will improve it.")
                        elif str(paper_response) in ["Empty Response", "RAG Output"] or not paper_response:
                            st.write("RAG couldn't get the results; it will be improved.")
                except Exception as e:
                    st.warning(e)
                elif response:
                    st.write(str(response))
                    response_generation.clear()
                    col1, col2 = st.columns([1, 10])
                    thumbs_up_button = col1.button("👍")
                    thumbs_down_button = col2.button("👎")
                    if thumbs_up_button:
                        st.write("Thank you for your positive feedback!")
                    elif thumbs_down_button:
                        st.write("We're sorry, we will improve it.")
                else:
                    st.write("RAG couldn't get the results; it will be improved.")
except Exception as e:
st.warning(e)
        elif generate_response_button and not str(query) and not st.session_state.key_flag:
            st.warning("Please set the necessary API keys and enter a query.")
        elif generate_response_button and str(query) and not st.session_state.key_flag:
            st.warning("Please set the necessary API keys.")
        elif generate_response_button and st.session_state.key_flag and not str(query):
            st.warning("Please enter a query!")
if __name__ == "__main__":
main()