# semantic-search / app.py
# hellorahulk's picture
# Update app.py
# d6b4f91
import gradio as gr
from smart_open import open
import gensim
from gensim.similarities.annoy import AnnoyIndexer
import plotly.express as px
import pandas as pd
import numpy as np
import pacmap
# Load into gensim model
def load_gensim(fname, num_trees=100, binary=False):
    """Load word vectors from ``fname`` and build an Annoy index over them.

    Args:
        fname: Path of the vector file, read with
            ``KeyedVectors.load_word2vec_format``.
        num_trees: Tree count handed to ``AnnoyIndexer``. Defaults to 100,
            the value that was previously hard-coded.
        binary: Forwarded to ``load_word2vec_format``. The default ``False``
            keeps the previous behaviour.

    Returns:
        A ``(model, annoy_index)`` tuple: the loaded vectors and the
        ``AnnoyIndexer`` built from them.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=binary)
    # Search using the Annoy indexer; noted by the original author as the
    # faster method.
    annoy_index = AnnoyIndexer(model, num_trees)
    return model, annoy_index
def searchNexplore(word, final_dfs, model, annoy_index, topn):
    """Find the approximate neighbours of ``word`` and fetch their rows.

    Args:
        word: Key looked up in ``model`` to obtain the query vector.
        final_dfs: Table indexed by the same labels the model returns.
        model: Object supporting ``model[word]`` and ``most_similar``.
        annoy_index: Indexer passed through to ``most_similar``.
        topn: Number of neighbours requested from ``most_similar``.

    Returns:
        A ``(searched_df, approximate_neighbors)`` tuple: the rows of
        ``final_dfs`` selected by the neighbour labels (in neighbour order),
        and the raw result of ``most_similar``.
    """
    query_vector = model[word]
    approximate_neighbors = model.most_similar(
        [query_vector], topn=topn, indexer=annoy_index
    )
    # The first element of every hit is its label; the rest is not needed
    # for the row lookup.
    neighbor_labels = [hit[0] for hit in approximate_neighbors]
    return final_dfs.loc[neighbor_labels], approximate_neighbors
def embedding_dim_reduction(
    embeddings, n_dim=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
):
    """Reduce ``embeddings`` to ``n_dim`` dimensions with PaCMAP.

    Suggested values (carried over from the original notes):
        1. Default transforms: MN_ratio=0.5, FP_ratio=2.0
        2. Heavy transformations: MN_ratio=30, FP_ratio=100.0

    The learning rate (0.05), the iteration count (1000), ``verbose=False``
    and the ``"pca"`` initialisation are fixed inside this function.

    Returns:
        The result of ``PaCMAP.fit_transform`` for ``embeddings``.
    """
    pacmap_settings = {
        "n_components": n_dim,
        "n_neighbors": n_neighbors,
        "MN_ratio": MN_ratio,
        "FP_ratio": FP_ratio,
        "lr": 0.05,
        "num_iters": 1000,
        "verbose": False,
    }
    return pacmap.PaCMAP(**pacmap_settings).fit_transform(embeddings, init="pca")
# Word vectors and their Annoy index, loaded once when the module is executed.
model, annoy_index = load_gensim("embedding_dump.txt")

# Embedding table; its "Unnamed: 0" column becomes the row index so rows can
# be selected by label.
final_dfs = pd.read_csv("raw_embeddings_allinone.csv").set_index("Unnamed: 0")
def get_semantic(input_text, topn):
    """Search for neighbours of ``input_text`` and plot them in 2-D and 3-D.

    Args:
        input_text: Key to look up in the module-level ``model``.
        topn: Number of neighbours to retrieve. It is coerced to ``int``
            because the value comes from a UI slider, which may deliver a
            float.

    Returns:
        A ``(fig1, fig2, approximate_neighbors)`` tuple: a 2-D scatter plot,
        a 3-D scatter plot, and the neighbour list returned by
        ``searchNexplore``.

    Raises:
        ValueError: If ``topn`` is smaller than 1. The slider allows 0, which
            would otherwise lead to an empty neighbour table being handed to
            the dimensionality reduction.
    """
    topn = int(topn)
    if topn < 1:
        raise ValueError("topn must be at least 1")

    searched_df, approximate_neighbors = searchNexplore(
        input_text, final_dfs, model, annoy_index, topn
    )
    # Both figures use the same labels for hover text and colour.
    labels = searched_df.index.tolist()

    coords_2d = embedding_dim_reduction(
        searched_df, n_dim=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
    )
    fig1 = px.scatter(
        x=coords_2d[:, 0],
        y=coords_2d[:, 1],
        hover_name=labels,
        color=labels,
    )

    coords_3d = embedding_dim_reduction(
        searched_df, n_dim=3, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
    )
    fig2 = px.scatter_3d(
        x=coords_3d[:, 0],
        y=coords_3d[:, 1],
        z=coords_3d[:, 2],
        hover_name=labels,
        color=labels,
    )
    return fig1, fig2, approximate_neighbors
# Gradio front end: a text box for the query and a slider for the neighbour
# count feed get_semantic, whose three results are shown as two plots and a
# list.
# NOTE(review): ``iface`` is bound to the return value of ``launch()``, not to
# the ``gr.Interface`` object itself.
# NOTE(review): the title/description say "Sentiment" while the callback is
# named get_semantic; confirm the wording is intended.
iface = gr.Interface(
    title="Sentiment Explorer",
    description="Get Sentiment search results",
    theme="peach",
    fn=get_semantic,
    inputs=["text", gr.Slider(0, 1000, value=100)],
    outputs=["plot", "plot", "list"],
    examples=[["SOPA_CANJA_C/ALETRIA_MAGGI_82GR", 100]],
).launch(inline=False)