import gradio as gr
from smart_open import open  # noqa: F401 -- kept so the dependency is present; gensim uses smart_open for file I/O
import gensim
from gensim.similarities.annoy import AnnoyIndexer
import plotly.express as px
import pandas as pd
import numpy as np
import pacmap


# Load the exported embeddings into a gensim model
def load_gensim(fname):
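    """Load word2vec-format vectors from fname and return (model, annoy_index)."""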
    model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=False)
    # Build an Annoy index (100 trees) for fast approximate nearest-neighbour search
    annoy_index = AnnoyIndexer(model, 100)
    return model, annoy_index


def searchNexplore(word, final_dfs, model, annoy_index, topn):
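    """
    Return the embedding rows and the (word, similarity) pairs for the topn
    approximate nearest neighbours of `word`, found via the Annoy index.
    """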

    vector = model[word]
    approximate_neighbors = model.most_similar([vector], topn=topn, indexer=annoy_index)
    # Keep just the neighbour words; they index rows of the embedding DataFrame.
    rows = [neighbour for neighbour, _score in approximate_neighbors]
    searched_df = final_dfs.loc[rows]
    return searched_df, approximate_neighbors


def embedding_dim_reduction(
    embeddings, n_dim=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
):
    """
    Perform PaCMAP dimention reduction

    Selection of values :
    1. Default transorms MN_ratio=0.5, FP_ratio=2.0
    2. For heavy transformations MN_ratio=30, FP_ratio=100.0
    """
    reducer = pacmap.PaCMAP(
        n_components=n_dim,
        n_neighbors=n_neighbors,
        MN_ratio=MN_ratio,
        FP_ratio=FP_ratio,
        lr=0.05,
        num_iters=1000,
        verbose=False,
    )

    reduced_embeddings = reducer.fit_transform(embeddings, init="pca")
    return reduced_embeddings


# Load the embeddings and the raw-embedding DataFrame once at start-up.
model, annoy_index = load_gensim("embedding_dump.txt")
final_dfs = pd.read_csv("raw_embeddings_allinone.csv")
final_dfs.set_index("Unnamed: 0", inplace=True)  # the first (unnamed) CSV column holds the words


def get_semantic(input_text, topn):
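    """
    Look up the topn nearest neighbours of input_text, project their
    embeddings to 2-D and 3-D with PaCMAP, and return both scatter plots
    together with the (word, similarity) pairs.
    """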

    searched_df, approximate_neighbors = searchNexplore(
        input_text, final_dfs, model, annoy_index, topn
    )

    # Project the neighbour embeddings to 2-D with PaCMAP for the flat scatter plot.
    reduced_embeddings = embedding_dim_reduction(
        searched_df, n_dim=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
    )

    fig1 = px.scatter(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        hover_name=searched_df.index.tolist(),
        color=searched_df.index.tolist(),
    )

    # Repeat the projection in 3-D for the rotatable scatter plot.
    reduced_embeddings = embedding_dim_reduction(
        searched_df, n_dim=3, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0
    )

    fig2 = px.scatter_3d(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        z=reduced_embeddings[:, 2],
        hover_name=searched_df.index.tolist(),
        color=searched_df.index.tolist(),
    )

    return fig1, fig2, approximate_neighbors

 
# Gradio UI: text query and topn slider in; two PaCMAP plots and the neighbour list out.
iface = gr.Interface(
    fn=get_semantic,
    inputs=[
        "text",
        gr.Slider(1, 1000, value=100, label="Number of neighbours (topn)"),
    ],
    outputs=["plot", "plot", "list"],
    examples=[["SOPA_CANJA_C/ALETRIA_MAGGI_82GR", 100]],
    title="Sentiment Explorer",
    description="Get Sentiment search results",
    theme="peach",
).launch(inline=False)