File size: 4,388 Bytes
8a65e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist
# Page header: a CSS rule that colours H1 headings white, the title, and the
# introductory copy shown above the pickers.
st.markdown('<style>h1{color: white;}</style>', unsafe_allow_html=True)
st.title('Call on Doc Skin Care Product Recommender')
for intro_line in (
    'Find the Right Skin Care for you',
    "Hi there! If you have a skincare product you currently like I can help you find a similar one based on the ingredients.",
    'Please select a product below so I can recommend similar ones',
):
    st.write(intro_line)

# Product catalogue read from the CSV shipped next to the app.
df = pd.read_csv("./data/cosmetics.csv")

# Cascading pickers: each select box only offers the values present in the
# rows left over by the previous choice (category -> brand -> product).
category_options = df['Label'].unique()
category = st.selectbox(label='Select a product category', options=category_options)
in_category = df['Label'] == category
category_subset = df[in_category]

brand_options = sorted(category_subset['Brand'].unique())
brand = st.selectbox(label='Select a brand', options=brand_options)
from_brand = category_subset['Brand'] == brand
category_brand_subset = category_subset[from_brand]

product_options = sorted(category_brand_subset['Name'].unique())
product = st.selectbox(label='Select the product', options=product_options)
#skin_type = st.selectbox(label='Select your skin type', options= ['Combination',
# 'Dry', 'Normal', 'Oily', 'Sensitive'] )
## Helper functions
# Define the oh_encoder function
def oh_encoder(tokens):
    """Encode a list of ingredient tokens as a 0/1 vector.

    Reads two module-level names: ``N`` (the vector length) and
    ``ingredient_idx`` (mapping of ingredient token -> position).

    Args:
        tokens: iterable of ingredient tokens to mark in the vector.

    Returns:
        A NumPy array of length ``N`` holding 1 at the position of every
        token in ``tokens`` and 0 elsewhere.

    Raises:
        KeyError: if a token is not a key of ``ingredient_idx``.
    """
    encoded = np.zeros(N)
    # Resolve every token to its position first, then flip them all at once.
    positions = [ingredient_idx[token] for token in tokens]
    encoded[positions] = 1
    return encoded
def closest_point(point, points):
    """Return the element of ``points`` that lies nearest to ``point``.

    Distances are computed with ``cdist`` using its default metric.

    Args:
        point: a single coordinate sequence.
        points: an indexable collection of coordinate sequences.

    Returns:
        The entry of ``points`` with the smallest distance to ``point``.
    """
    # cdist yields a 1 x len(points) matrix, so the flat argmin is the
    # position of the winning candidate inside ``points``.
    distances = cdist([point], points)
    nearest = distances.argmin()
    return points[nearest]
def _ingredient_set(ingredients):
    """Split a comma-separated ingredient string into a set of space-stripped names."""
    return {name.strip(' ') for name in ingredients.split(',')}


# Rows belonging to the chosen product category.
if category is not None:
    category_subset = df[df['Label'] == category]

if product is not None:
    #skincare_type = category_subset[category_subset[str(skin_type)] == 1]
    # Fresh 0..M-1 index so that a row label is also its row number in the
    # document-term matrix and in the t-SNE output below.
    category_subset = category_subset.reset_index(drop=True)

    # Tokenise every ingredient list and give each distinct ingredient a
    # column number in order of first appearance.
    ingredient_idx = {}
    corpus = []
    for ingredients in category_subset['Ingredients']:
        tokens = ingredients.lower().split(', ')
        corpus.append(tokens)
        for ingredient in tokens:
            # len() is evaluated before insertion, so a new ingredient gets
            # the next free column and a known one keeps its column.
            ingredient_idx.setdefault(ingredient, len(ingredient_idx))

    # Number of products (rows) and distinct ingredients (columns).
    M = len(category_subset)
    N = len(ingredient_idx)

    # Document-term matrix: A[r, c] == 1 when product r lists ingredient c.
    A = np.zeros((M, N))
    for row, tokens in enumerate(corpus):
        A[row, :] = oh_encoder(tokens)

model_run = st.button('Find similar products!')

# Without a selected product there is no matrix to embed, so the button
# does nothing in that case.
if model_run and product is not None:
    st.write('Based on the ingredients of the product you selected')
    st.write('here are the top 10 products that are the most similar :sparkles:')

    # Project the ingredient vectors onto two dimensions.
    model = TSNE(n_components=2, learning_rate=150, random_state=42)
    tsne_features = model.fit_transform(A)

    category_subset['X'] = tsne_features[:, 0]
    category_subset['Y'] = tsne_features[:, 1]

    # Identify the selected product by brand AND name: a name on its own
    # can also belong to a different brand within the same category.
    is_target = (category_subset['Brand'] == brand) & (category_subset['Name'] == product)
    target = category_subset[is_target]
    target_row = target.index[0]

    # One vectorised call yields a plain float per product, which keeps the
    # column sortable as ordinary numbers.
    category_subset['distance'] = cdist(
        tsne_features[[target_row]], tsne_features, metric='euclidean'
    )[0]

    # Closest first (ascending distance). The selected product is removed
    # explicitly instead of assuming it ends up as the first sorted row.
    top_matches = category_subset[~is_target].sort_values(by=['distance'])

    # Ingredients shared with the selected product.
    target_ingredients = _ingredient_set(target['Ingredients'].values[0])
    top_matches['Ingredients in common'] = [
        target_ingredients.intersection(_ingredient_set(other))
        for other in top_matches['Ingredients']
    ]

    # Select relevant columns
    top_matches = top_matches[['Label', 'Brand', 'Name', 'Price', 'Ingredients', 'Ingredients in common']]
    # Label the rows 1, 2, 3, ... so the displayed index reads as a rank.
    top_matches.index = range(1, len(top_matches) + 1)
    st.dataframe(top_matches.head(10))
|