# Spaces: bacancydataprophets / Skin_Care_Product_Recommender
# Sleeping
# File size: 4,388 Bytes
# 8a65e53

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE 
from scipy.spatial.distance import cdist

# Page chrome: force a white H1 and render the title.
st.markdown('<style>h1{color: white;}</style>', unsafe_allow_html=True)
st.title('Call on Doc Skin Care Product Recommender')

# Introductory copy, rendered top to bottom in this order.
st.write('Find the Right Skin Care for you')
st.write("Hi there! If you have a skincare product you currently like I can help you find a similar one based on the ingredients.")
st.write('Please select a product below so I can recommend similar ones')

# Read the product catalogue from disk.
df = pd.read_csv("./data/cosmetics.csv")

# Step 1 - category: options are the distinct 'Label' values, left unsorted.
category = st.selectbox(
    label='Select a product category',
    options=df['Label'].unique(),
)
category_subset = df.loc[df['Label'] == category]

# Step 2 - brand: only brands present in the chosen category, sorted.
brand = st.selectbox(
    label='Select a brand',
    options=sorted(category_subset['Brand'].unique()),
)
category_brand_subset = category_subset.loc[category_subset['Brand'] == brand]

# Step 3 - product: only names sold by the chosen brand in that category, sorted.
product = st.selectbox(
    label='Select the product',
    options=sorted(category_brand_subset['Name'].unique()),
)

#skin_type = st.selectbox(label='Select your skin type', options= ['Combination',
#       'Dry', 'Normal', 'Oily', 'Sensitive'] )

## Helper functions
# Define the oh_encoder function
def oh_encoder(tokens):
    """Return a binary presence vector for one product's ingredient tokens.

    The vector has length ``N`` (module-level) and holds 1 at the position
    that the module-level ``ingredient_idx`` mapping assigns to each token,
    0 everywhere else. Repeated tokens have no additional effect.

    Raises:
        KeyError: if a token is missing from ``ingredient_idx``.
    """
    encoded = np.zeros(N)
    # Resolve every token to its column first, then set them all in one go.
    columns = np.fromiter(
        (ingredient_idx[token] for token in tokens), dtype=np.intp
    )
    encoded[columns] = 1
    return encoded

def closest_point(point, points):
    """Return the element of ``points`` that lies nearest to ``point``.

    Distances are computed with ``cdist`` using its default metric; when
    several candidates are equally near, the earliest one is returned.
    """
    # cdist wants 2-D inputs, so the single query point is wrapped in a list.
    distances = cdist([point], points)
    nearest = np.argmin(distances)
    return points[nearest]


# Build the ingredient vocabulary and the product-by-ingredient matrix for the
# selected category. `category_subset` is already filtered to that category
# right after the category select box, so it is not re-filtered here.
if product is not None:
    #skincare_type = category_subset[category_subset[str(skin_type)] == 1]

    # Reset index so row positions run 0..M-1 and match the rows of A below
    category_subset = category_subset.reset_index(drop=True)

    # Display data frame
    #st.dataframe(category_subset)

    # ingredient_idx maps each distinct ingredient to a column index;
    # corpus keeps the token list of every product, in row order
    ingredient_idx = {}
    corpus = []

    # Tokenize: lower-case the ingredient string and split on ', '
    for ingredients in category_subset['Ingredients']:
        tokens = ingredients.lower().split(', ')
        corpus.append(tokens)
        for ingredient in tokens:
            # Only unseen ingredients are inserted; the current dictionary
            # size is the next free column index
            ingredient_idx.setdefault(ingredient, len(ingredient_idx))

    # Get the number of items and tokens (both are read by oh_encoder / t-SNE)
    M = len(category_subset)
    N = len(ingredient_idx)

    # Initialize a matrix of zeros
    A = np.zeros((M, N))

    # Make a document-term matrix: one binary row per product
    for row, tokens in enumerate(corpus):
        A[row, :] = oh_encoder(tokens)

# Recommendation step: embed every product of the category in 2-D with t-SNE
# and list the ten products closest to the selected one.
model_run = st.button('Find similar products!')


if model_run:
    # The matrix A is only built when a product is selected, and t-SNE cannot
    # embed fewer than two rows, so bail out with a message instead of crashing.
    if product is None or len(category_subset) < 2:
        st.warning('Please select a product from a category with at least two products.')
    else:
        st.write('Based on the ingredients of the product you selected')
        st.write('here are the top 10 products that are the most similar :sparkles:')

        # Run the model. scikit-learn requires perplexity < number of samples;
        # 30 is its default, so larger categories behave exactly as before.
        perplexity = min(30, len(category_subset) - 1)
        model = TSNE(n_components=2, learning_rate=150,
                     perplexity=perplexity, random_state=42)
        tsne_features = model.fit_transform(A)

        # Make X, Y columns
        category_subset['X'] = tsne_features[:, 0]
        category_subset['Y'] = tsne_features[:, 1]

        # First row whose name matches the selection is the reference product
        target = category_subset[category_subset['Name'] == product]
        target_label = target.index[0]
        target_point = [[target['X'].values[0], target['Y'].values[0]]]

        # One cdist call yields a (1, M) array; row 0 is the plain-float
        # Euclidean distance from the target to every product
        category_subset['distance'] = cdist(target_point, tsne_features, metric='euclidean')[0]

        # Remove the selected product itself by label (not by sort position,
        # which is ambiguous on ties), then arrange nearest first
        top_matches = category_subset.drop(target_label).sort_values(by=['distance'])

        # Compute ingredients in common (comma-split, surrounding spaces stripped)
        target_ingredients = {
            item.strip(' ') for item in target['Ingredients'].values[0].split(",")
        }
        top_matches['Ingredients in common'] = [
            target_ingredients.intersection(
                item.strip(' ') for item in ingredients.split(",")
            )
            for ingredients in top_matches['Ingredients']
        ]

        # Select relevant columns
        top_matches = top_matches[['Label', 'Brand', 'Name', 'Price', 'Ingredients', 'Ingredients in common']]
        top_matches = top_matches.reset_index(drop=True)
        # Number rows from 1 so the displayed index reads as a rank
        top_matches.index += 1

        st.dataframe(top_matches.head(10))