File size: 4,388 Bytes
8a65e53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE 
from scipy.spatial.distance import cdist

st.markdown('<style>h1{color: white;}</style>', unsafe_allow_html=True)
st.title('Call on Doc Skin Care Product Recommender')
st.write('Find the Right Skin Care for you')

st.write("Hi there! If you have a skincare product you currently like I can help you find a similar one based on the ingredients.")

st.write('Please select a product below so I can recommend similar ones')
# Load the data
df = pd.read_csv("./data/cosmetics.csv")

# Choose a product category
category = st.selectbox(label='Select a product category', options= df['Label'].unique() )
category_subset = df[df['Label'] == category]
# Choose a brand
brand = st.selectbox(label='Select a brand', options= sorted(category_subset['Brand'].unique()))
category_brand_subset = category_subset[category_subset['Brand'] == brand]
# Choose product
product = st.selectbox(label='Select the product', options= sorted(category_brand_subset['Name'].unique() ))

#skin_type = st.selectbox(label='Select your skin type', options= ['Combination',
#       'Dry', 'Normal', 'Oily', 'Sensitive'] )

## Helper functions
# Define the oh_encoder function
def oh_encoder(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        # Get the index for each ingredient
        idx = ingredient_idx[ingredient]
        # Put 1 at the corresponding indices
        x[idx] = 1
    return x

def closest_point(point, points):
    """ Find closest point from a list of points. """
    return points[cdist([point], points).argmin()]


if category is not None:
    category_subset = df[df['Label'] == category]

if product is not None:
    #skincare_type = category_subset[category_subset[str(skin_type)] == 1]

    # Reset index
    category_subset = category_subset.reset_index(drop=True)

    # Display data frame
    #st.dataframe(category_subset)

    # Initialize dictionary, list, and initial index
    ingredient_idx = {}
    corpus = []
    idx = 0

    # For loop for tokenization
    for i in range(len(category_subset)):    
        ingredients = category_subset['Ingredients'][i]
        ingredients_lower = ingredients.lower()
        tokens = ingredients_lower.split(', ')
        corpus.append(tokens)
        for ingredient in tokens:
            if ingredient not in ingredient_idx:
                ingredient_idx[ingredient] = idx
                idx += 1

                
    # Get the number of items and tokens 
    M = len(category_subset)
    N = len(ingredient_idx)

    # Initialize a matrix of zeros
    A = np.zeros((M,N))

    # Make a document-term matrix
    i = 0
    for tokens in corpus:
        A[i, :] = oh_encoder(tokens)
        i +=1

model_run = st.button('Find similar products!')


if model_run:

    st.write('Based on the ingredients of the product you selected')
    st.write('here are the top 10 products that are the most similar :sparkles:')
    
    # Run the model
    model = TSNE(n_components = 2, learning_rate = 150, random_state = 42)
    tsne_features = model.fit_transform(A)

    # Make X, Y columns 
    category_subset['X'] = tsne_features[:, 0]
    category_subset['Y'] = tsne_features[:, 1]

    target = category_subset[category_subset['Name'] == product]

    target_x = target['X'].values[0]
    target_y = target['Y'].values[0]

    df1 = pd.DataFrame()
    df1['point'] = [(x, y) for x,y in zip(category_subset['X'], category_subset['Y'])]

    category_subset['distance'] = [cdist(np.array([[target_x,target_y]]), np.array([product]), metric='euclidean') for product in df1['point']]

    # arrange by descending order
    top_matches = category_subset.sort_values(by=['distance'])

    # Compute ingredients in common
    target_ingredients = target.Ingredients.values
    c1_list = target_ingredients[0].split(",")
    c1_list = [x.strip(' ') for x in c1_list]
    c1_set = set(c1_list)

    top_matches['Ingredients in common'] = [c1_set.intersection( set([x.strip(' ')for x in product.split(",")]) ) for product in top_matches['Ingredients']]

    # Select relevant columns
    top_matches = top_matches[['Label', 'Brand', 'Name', 'Price', 'Ingredients','Ingredients in common']]
    top_matches = top_matches.reset_index(drop=True)
    top_matches = top_matches.drop(top_matches.index[0])

    st.dataframe(top_matches.head(10))