Spaces:

rahgadda
/

shakesearch

Sleeping

shakesearch / shakesearch.py

Initial Draft

469f1ff verified over 1 year ago

1.8 kB

	import streamlit as st
	from sentence_transformers import SentenceTransformer, util
	from llama_index import VectorStoreIndex, SimpleDirectoryReader
	import os

	# Name of the pretrained sentence-embedding (BERT) model to load.
	model_name = "bert-base-nli-mean-tokens"


	@st.cache_resource
	def _load_model(name):
	    """Return the SentenceTransformer for *name*, loaded once per process.

	    Streamlit re-executes this script from top to bottom on every widget
	    interaction; caching the model keeps the weights from being reloaded
	    on each rerun.
	    """
	    return SentenceTransformer(name)


	# Load BERT model and create vectorizer
	model = _load_model(model_name)
	vectorizer = model.encode

	# Create docs folder if it doesn't exist
	docs_path = "docs"
	os.makedirs(docs_path, exist_ok=True)

	# Streamlit app layout
	st.title("Semantic and Similarity Search with BERT")

	# File upload functionality
	uploaded_file = st.file_uploader("Upload a text file", type=["txt"])

	# These stay empty/None until a file has been uploaded and embedded, so the
	# query section below can tell whether there is anything to search.
	documents = []
	document_embeddings = None

	if uploaded_file is not None:
	    # Keep only the final path component so a crafted upload name cannot
	    # write outside the docs folder.
	    safe_name = os.path.basename(uploaded_file.name)
	    with open(os.path.join(docs_path, safe_name), "wb") as f:
	        f.write(uploaded_file.getbuffer())

	    # Reload every .txt document in the docs folder after the upload
	    documents = SimpleDirectoryReader(
	        input_dir=docs_path,
	        filename_as_id=True,
	        required_exts=[".txt"],
	    ).load_data()

	    # Embed all documents in one batch; tensors are the input type that
	    # util.semantic_search works with.
	    document_embeddings = vectorizer(
	        [doc.text for doc in documents],
	        convert_to_tensor=True,
	    )

	    # Success message
	    st.success("File uploaded & processed successfully!")

	query = st.text_input("Enter your search query:")

	if query:
	    if not documents:
	        # Nothing has been indexed in this run; searching would fail.
	        st.warning("Please upload a text file before searching.")
	    else:
	        # Rank documents by cosine similarity to the query embedding.
	        # semantic_search returns one hit list per query; there is one query.
	        query_embedding = vectorizer(query, convert_to_tensor=True)
	        hits = util.semantic_search(
	            query_embedding,
	            document_embeddings,
	            top_k=min(5, len(documents)),  # Show top 5 results
	        )[0]
	        # Each hit's "corpus_id" is the position of the document in the
	        # list that was embedded above.
	        similar_documents = [documents[hit["corpus_id"]] for hit in hits]

	        # Semantic search (approximated): the single best-scoring document
	        most_similar_doc = similar_documents[0]
	        st.subheader("Semantic Search Results")
	        st.write(most_similar_doc.text)

	        # Similarity search: id and a short preview of each top hit
	        st.subheader("Similarity Search Results")
	        for doc in similar_documents:
	            st.write(f"- {doc.doc_id}: {doc.text[:50]}...")