import streamlit as st
import os
from utils import (
parse_docx,
parse_pdf,
parse_txt,
parse_csv,
search_docs,
embed_docs,
text_to_docs,
get_sources,
)
from openai.error import OpenAIError
def clear_submit() -> None:
    """Reset the ``submit`` flag in Streamlit's session state to ``False``.

    Passed as the ``on_change`` callback of the file uploader and the query
    text area elsewhere in this script.
    """
    st.session_state["submit"] = False
def set_openai_api_key(api_key: str) -> None:
    """Store *api_key* in Streamlit's session state.

    Args:
        api_key: The key to save under the ``"OPENAI_API_KEY"`` entry.
    """
    st.session_state["OPENAI_API_KEY"] = api_key
# Page title. A quoted (non-triple-quoted) string literal cannot span several
# source lines, so the line breaks are written as explicit "\n" escapes; the
# rendered text is unchanged.
st.markdown("\nSemantic Search 🔍 by Code GPT\n", unsafe_allow_html=True)
# Sidebar: API key entry, document upload, and indexing of the upload.
# NOTE(review): the nesting below is reconstructed from statement order;
# confirm the uploader and indexing steps are meant to live in the sidebar.
index = None  # stays None until a document has been embedded successfully
doc = None

# Lower-case file suffix -> parser imported from `utils`.
_PARSERS_BY_SUFFIX = {
    ".pdf": parse_pdf,
    ".docx": parse_docx,
    ".csv": parse_csv,
    ".txt": parse_txt,
}

with st.sidebar:
    user_secret = st.text_input(
        "OpenAI API Key",
        type="password",
        placeholder="Paste your OpenAI API key here (sk-...)",
        help="You can get your API key from https://platform.openai.com/account/api-keys.",
        value=st.session_state.get("OPENAI_API_KEY", ""),
    )
    if user_secret:
        set_openai_api_key(user_secret)

    uploaded_file = st.file_uploader(
        # The label lists csv as well, matching the accepted `type` list.
        "Upload a pdf, docx, txt, or csv file",
        type=["pdf", "docx", "txt", "csv"],
        help="Scanned documents are not supported yet!",
        on_change=clear_submit,
    )

    if uploaded_file is not None:
        # Compare on the lower-cased name so a suffix such as ".PDF" still
        # selects the right parser.
        file_name = uploaded_file.name.lower()
        parser = next(
            (
                parse
                for suffix, parse in _PARSERS_BY_SUFFIX.items()
                if file_name.endswith(suffix)
            ),
            None,
        )

        if parser is None:
            # Nothing to index: report the problem and leave `doc`/`index`
            # as None instead of handing None to `text_to_docs`.
            st.error("File type not supported")
        else:
            doc = parser(uploaded_file)
            text = text_to_docs(doc)
            try:
                with st.spinner("Indexing document... This may take a while⏳"):
                    result = embed_docs(text)
                    index = result[0]
                    embeddings = result[1]
                    st.session_state["api_key_configured"] = True
            except OpenAIError as e:
                # `index` remains None here, so no search index is available.
                st.error(e._message)
# Create the two page tabs and fill the static "Intro" tab.
tab1, tab2 = st.tabs(["Intro", "Semantic Search"])
with tab1:
    st.markdown("### Semantic Search with cosine similarity")
    st.write("Cosine similarity is a technique used to measure the similarity between two vectors. In the context of OpenAI's embedding API, cosine similarity is used to compare the similarity between two pieces of text based on their underlying vector representations.")
    # The original literal was split across two source lines, which is not
    # valid for a quoted string; the same one-newline value is written with an
    # escape instead.
    # NOTE(review): this call renders only a newline -- any markup that was
    # meant to appear here is not present in the source; confirm.
    st.markdown("\n", unsafe_allow_html=True)
    st.write("### Here's how it works:")
    st.write("1. First, the embedding API converts each piece of text into a vector representation using a pre-trained language model. This vector represents the meaning and context of the text.")
    st.write("2. The cosine similarity function then takes these two vectors and calculates the cosine of the angle between them. The cosine similarity score ranges from -1 to 1, where 1 indicates that the two vectors are identical, 0 indicates that they are completely dissimilar, and -1 indicates that they are exact opposites.")
    st.write("3. This cosine similarity score is then used to determine the similarity between the two pieces of text. For example, if the cosine similarity score is close to 1, the two pieces of text are likely very similar in meaning, while a score close to 0 suggests that they are completely different.")
    st.write("Overall, cosine similarity is a powerful tool for comparing the semantic similarity between two pieces of text, and OpenAI's embedding API makes it easy to implement this technique in your own projects.")
    st.markdown("""---""")

    # Credits for the tools used to build the app.
    st.markdown("## Semantic Search was written with the following tools:")
    st.markdown("#### Code GPT")
    st.write("All code was written with the help of Code GPT. Visit https://codegpt.co to get the extension.", unsafe_allow_html=True)
    st.markdown("#### Streamlit")
    st.write("The design was written with Streamlit.", unsafe_allow_html=True)
    st.markdown("#### LangChain")
    st.markdown('Embeddings is done via the OpenAI API with "text-embedding-ada-002" and LangChain.', unsafe_allow_html=True)
    st.markdown("FAISS Facebook AI Similarity Search is a library for efficient similarity search and clustering of dense vectors.", unsafe_allow_html=True)
    st.markdown("""---""")

    # Author and repository footer.
    st.write('Author: Daniel Avila', unsafe_allow_html=True)
    st.write('Repo: Github', unsafe_allow_html=True)
    st.write("This software was developed with Code GPT, for more information visit: https://codegpt.co")
# "Semantic Search" tab: take a question and show the matching sources.
with tab2:
    st.write('To obtain an API Key you must create an OpenAI account at the following link: https://openai.com/api/')
    query = st.text_area("Ask a question about the document", on_change=clear_submit)
    button = st.button("Submit")

    # Run the search on a fresh click, or when the "submit" flag in session
    # state is still set (it is reset by `clear_submit` when the query or the
    # uploaded file changes).
    if button or st.session_state.get("submit"):
        if not query:
            st.error("Please enter a question!")
        elif index is None:
            # `index` is still None when no file was uploaded or indexing
            # failed; searching with it would raise instead of informing the
            # user.
            st.error("Please upload a document and wait for it to be indexed before searching.")
        else:
            st.session_state["submit"] = True
            sources = search_docs(index, query)
            st.markdown("#### Sources")
            for source in sources:
                st.markdown(source.page_content)
                st.markdown(source.metadata["source"])
                st.markdown("---")