import gradio as gr
from smart_open import open  # NOTE(review): shadows the builtin; not used in the visible code
import gensim
from gensim.similarities.annoy import AnnoyIndexer
import plotly.express as px
import pandas as pd
import numpy as np
import pacmap


def load_gensim(fname):
    """Load word2vec-format text vectors and build an Annoy index over them.

    Args:
        fname: Path to a word2vec text-format (non-binary) embedding file.

    Returns:
        A ``(model, annoy_index)`` tuple: the loaded ``KeyedVectors`` and an
        ``AnnoyIndexer`` built on it with 100 trees.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=False)
    # Approximate nearest-neighbour search via Annoy; faster than brute force.
    annoy_index = AnnoyIndexer(model, 100)
    return model, annoy_index


def searchNexplore(word, final_dfs, model, annoy_index, topn):
    """Find the approximate nearest neighbours of ``word`` and their rows.

    Args:
        word: Key to look up in ``model``.
        final_dfs: DataFrame indexed by the same keys as ``model``.
        model: Loaded gensim ``KeyedVectors``.
        annoy_index: ``AnnoyIndexer`` built on ``model``.
        topn: Number of neighbours to retrieve; coerced to ``int``.

    Returns:
        ``(searched_df, approximate_neighbors)`` where ``searched_df`` holds the
        rows of ``final_dfs`` for the neighbour keys (in neighbour order) and
        ``approximate_neighbors`` is the list returned by ``most_similar``.

    Raises:
        KeyError: If ``word`` is not in ``model`` or a neighbour key is
            missing from ``final_dfs``.
    """
    vector = model[word]
    # NOTE(review): gensim appears to consult the indexer only when topn is an
    # int, so coerce here in case the UI delivers a float - confirm against
    # the installed gensim version.
    approximate_neighbors = model.most_similar(
        [vector], topn=int(topn), indexer=annoy_index
    )
    rows = [neighbor[0] for neighbor in approximate_neighbors]
    searched_df = final_dfs.loc[rows]
    return searched_df, approximate_neighbors


def embedding_dim_reduction(
    embeddings, n_dim=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
):
    """
    Perform PaCMAP dimension reduction.

    Selection of values:
      1. Default transforms: MN_ratio=0.5, FP_ratio=2.0
      2. For heavy transformations: MN_ratio=30, FP_ratio=100.0

    Args:
        embeddings: Array-like of embeddings to reduce.
        n_dim: Number of output dimensions.
        n_neighbors: Neighbour count passed to PaCMAP.
        MN_ratio: Mid-near pair ratio passed to PaCMAP.
        FP_ratio: Further pair ratio passed to PaCMAP.

    Returns:
        The reduced embeddings produced by ``PaCMAP.fit_transform`` with PCA
        initialisation.
    """
    reducer = pacmap.PaCMAP(
        n_components=n_dim,
        n_neighbors=n_neighbors,
        MN_ratio=MN_ratio,
        FP_ratio=FP_ratio,
        lr=0.05,
        num_iters=1000,
        verbose=False,
    )
    reduced_embeddings = reducer.fit_transform(embeddings, init="pca")
    return reduced_embeddings


# Module-level resources shared by every request handled by get_semantic.
model, annoy_index = load_gensim("embedding_dump.txt")
final_dfs = pd.read_csv("raw_embeddings_allinone.csv")
final_dfs.set_index("Unnamed: 0", inplace=True)


def get_semantic(input_text, topn):
    """Build 2-D and 3-D PaCMAP scatter plots of the neighbours of a key.

    Args:
        input_text: Key to search for in the embedding model.
        topn: Number of neighbours to retrieve; coerced to ``int``.

    Returns:
        ``(fig1, fig2, approximate_neighbors)``: the 2-D scatter figure, the
        3-D scatter figure, and the neighbour list from ``searchNexplore``.

    Raises:
        ValueError: If ``topn`` is less than 1 (the slider allows 0, which
            would otherwise hand an empty frame to PaCMAP).
        KeyError: If ``input_text`` is not present in the model.
    """
    topn = int(topn)
    if topn < 1:
        raise ValueError("topn must be at least 1")

    searched_df, approximate_neighbors = searchNexplore(
        input_text, final_dfs, model, annoy_index, topn
    )
    # Same labels drive hover text and colour in both figures; compute once.
    labels = searched_df.index.tolist()

    reduced_embeddings = embedding_dim_reduction(
        searched_df, n_dim=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
    )
    fig1 = px.scatter(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        hover_name=labels,
        color=labels,
    )

    reduced_embeddings = embedding_dim_reduction(
        searched_df, n_dim=3, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
    )
    fig2 = px.scatter_3d(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        z=reduced_embeddings[:, 2],
        hover_name=labels,
        color=labels,
    )
    return fig1, fig2, approximate_neighbors


# NOTE(review): `iface` is bound to the return value of `.launch()`, not to the
# Interface object itself - confirm that is intended.
iface = gr.Interface(
    fn=get_semantic,
    inputs=[
        "text",
        gr.Slider(0, 1000, value=100),
    ],
    outputs=["plot", "plot", "list"],
    examples=[["SOPA_CANJA_C/ALETRIA_MAGGI_82GR", 100]],
    title="Sentiment Explorer",
    description="Get Sentiment search results",
    theme="peach",
).launch(inline=False)