File size: 1,798 Bytes
a526bde
 
 
82c9cb9
a526bde
 
 
 
 
 
82c9cb9
 
 
a526bde
 
 
82c9cb9
 
 
 
 
 
 
 
 
ae9afb6
 
 
 
 
 
 
 
46dd054
ae9afb6
469f1ff
b4dd41c
ae9afb6
 
82c9cb9
a526bde
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from llama_index import VectorStoreIndex, SimpleDirectoryReader
import os

# Load the BERT sentence-embedding model and expose its encoder.
model_name = "bert-base-nli-mean-tokens"


@st.cache_resource  # available in Streamlit >= 1.18
def _load_model(name):
    """Return the SentenceTransformer called *name*.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loaded model as a resource avoids reloading
    the weights on each rerun.
    """
    return SentenceTransformer(name)


model = _load_model(model_name)
# Callable that maps text (or a list of texts) to embedding vectors.
vectorizer = model.encode

# Directory that receives uploaded documents; created if it is missing.
docs_path = "docs"
os.makedirs(name=docs_path, exist_ok=True)

# Page header
st.title(body="Semantic and Similarity Search with BERT")

# Upload widget, restricted to .txt files
uploaded_file = st.file_uploader(label="Upload a text file", type=["txt"])

# Documents loaded from docs_path and their embeddings. The embeddings stay
# None until a file has been uploaded in this run, which lets the search
# section below detect that there is nothing to query yet instead of failing
# on an undefined name.
documents = []
document_embeddings = None

if uploaded_file is not None:
    # Keep only the final path component so a crafted upload name cannot
    # make the file land outside docs_path.
    safe_name = os.path.basename(uploaded_file.name)
    with open(os.path.join(docs_path, safe_name), "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Reload every .txt document in docs_path (earlier uploads included)
    documents = SimpleDirectoryReader(
        input_dir=docs_path,
        filename_as_id=True,
        required_exts=[".txt"],
    ).load_data()

    if documents:
        # Encode all documents in a single batch. Tensors are requested
        # because util.semantic_search operates on tensors.
        document_embeddings = model.encode(
            [doc.text for doc in documents],
            convert_to_tensor=True,
        )

        # Success message
        st.success("File uploaded & processed successfully!")

query = st.text_input("Enter your search query:")

if query and document_embeddings is None:
    st.warning("Please upload a text file before searching.")
elif query:
    # Rank the documents by cosine similarity to the query. semantic_search
    # returns one hit list per query, sorted by decreasing score; each hit
    # carries the position of the document in the encoded corpus.
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, document_embeddings, top_k=5)[0]
    ranked_documents = [documents[hit["corpus_id"]] for hit in hits]

    # Semantic search: the single best match, shown in full
    st.subheader("Semantic Search Results")
    st.write(ranked_documents[0].text)

    # Similarity search: up to the top 5 matches, shown as short previews
    st.subheader("Similarity Search Results")
    for doc in ranked_documents:
        st.write(f"- {doc.doc_id}: {doc.text[:50]}...")