import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
from Levenshtein import distance as levenshtein_distance
from scipy.sparse import csr_matrix, coo_matrix
import kagglehub
"""## **Amazon Apparels Data dataset**
The Amazon Apparels Data dataset on Kaggle contains product information scraped from Amazon, focusing on clothing items. It includes two CSV files: Amazon-clothing-info.csv, which holds product metadata such as ASIN, brand, color, product type, price, availability, title, and image URLs; and Clothing-Reviews.csv, which contains customer review data including the review score (1–5), review title, user ID, summary, and text. The metadata file enables analysis of product attributes, while the reviews file supports sentiment analysis and product quality assessment. This dataset is useful for building recommendation systems, performing style/popularity analysis, and exploring brand-level trends. Together, the two files link categorical product features with real-world user feedback through the shared ASIN column.
## Data Preparation
"""
# Download the entire dataset first
path = kagglehub.dataset_download("thekenjin/amazonapparelsdata")
# Find the 'Clothing-Reviews.csv' file within the downloaded dataset
csv_file = os.path.join(path, "Clothing-Reviews.csv")
# Check if the file exists and load it into Pandas
if os.path.exists(csv_file):
reviews = pd.read_csv(csv_file)
print("Dataset loaded successfully!")
print(reviews.head()) # Display the first few rows
else:
print("Clothing-Reviews.csv not found in the downloaded dataset.")
# Download the entire dataset first
path = kagglehub.dataset_download("thekenjin/amazonapparelsdata")
# Find the 'Amazon-clothing-info.csv' file within the downloaded dataset
csv_file = os.path.join(path, "Amazon-clothing-info.csv")
# Check if the file exists and load it into Pandas
if os.path.exists(csv_file):
metadata = pd.read_csv(csv_file)
print("Dataset loaded successfully!")
print(metadata.head()) # Display the first few rows
else:
print("Clothing-Reviews.csv not found in the downloaded dataset.")
"""### Basic Info and Statistics"""
# Display basic info
print("Metadata Info:")
print(metadata.info())
print("\nReviews Info:")
print(reviews.info())
# Basic statistics for numerical columns
print("\nMetadata Statistics:")
print(metadata.describe())
print("\nReviews Statistics:")
print(reviews.describe())
"""### Missing Values Analysis"""
# Percentage of missing values
print("\nMetadata Missing Values (%):")
print(metadata.isnull().mean() * 100)
print("\nReviews Missing Values (%):")
print(reviews.isnull().mean() * 100)
# Distribution of review scores
plt.figure(figsize=(8, 5))
sns.countplot(x='review_score', data=reviews)
plt.title('Distribution of Review Scores')
plt.xlabel('Rating (1-5)')
plt.ylabel('Count')
plt.show()
# Calculate percentage distribution
rating_dist = reviews['review_score'].value_counts(normalize=True) * 100
print("\nReview Score Distribution (%):")
print(rating_dist)
# Top colors in the dataset
top_colors = metadata['color'].value_counts().head(10)
plt.figure(figsize=(10, 6))
top_colors.plot(kind='bar')
plt.title('Top 10 Most Common Colors')
plt.xlabel('Color')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
# Add review length as a new feature
reviews['review_length'] = reviews['review_text'].str.len()
# Relationship between review length and rating
plt.figure(figsize=(10, 6))
sns.boxplot(x='review_score', y='review_length', data=reviews)
plt.title('Review Length by Rating')
plt.xlabel('Rating')
plt.ylabel('Review Length (characters)')
plt.show()
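"""As a quick sanity check, the headline figures quoted in the summary below can be reproduced directly from the two frames. This assumes the column names used elsewhere in this notebook ('product_type_name', 'brand', 'color', 'asin', 'review_score')."""
# Reproduce the key counts cited in the summary cell
print("\nTop product types:")
print(metadata['product_type_name'].value_counts().head())
print("\nTop brands:")
print(metadata['brand'].value_counts().head())
print("\nReview score mean / median:",
      reviews['review_score'].mean(), reviews['review_score'].median())
print("Unique ASINs - metadata:", metadata['asin'].nunique(),
      "| reviews:", reviews['asin'].nunique())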
"""### **Metadata Summary (28,395 products)**
- **Key Columns**: ASIN (unique), product type, price, brand, color, availability, title, images (URLs), reviews (boolean/URL)
- **Notable Observations**:
- **High Sparsity**: Many columns have significant missing values (e.g., `sku` has only 134 values, `author` has 1).
- **Product Types**: 57 unique types, with "SHIRT" being the most frequent (21,513 entries).
- **Pricing**: Most common price is `$19.99` (945 entries).
- **Brands**: 3,640 unique brands; "TOOGOO(R)" is the most frequent (177 entries).
- **Colors**: "Black" is the top color (5,181 entries).
- **Availability**: Most items ship in "1-2 business days" (12,252 entries).
- **Images**: 19,865 unique image URLs, with some duplicates (e.g., one URL appears 21 times).
### **Reviews Summary (50,046 reviews)**
- **Key Columns**: ASIN (linked to metadata), review score (1–5), title, user ID, summary, text.
- **Statistics**:
- **Scores**: Highly positive (mean = 4.12, median = 5). 75% of reviews are 5-star.
- **Missing Data**: `review_summary` has 2,892 nulls; other columns are complete.
- **Volume**: Reviews are linked to products via ASIN, but metadata has fewer unique ASINs (28,395 vs. 50,046 reviews), suggesting some products have multiple reviews.
### **Notes**:
1. **Data Quality Issues**:
- Metadata has sparse columns (`sku`, `author`, `publisher`, `editorial_review`).
- Possible typo in `editorial_reivew` (misspelled column name).
2. **Product Focus**:
- Dominated by apparel (e.g., "SHIRT"), with Black as the top color.
3. **Reviews**:
- Skewed toward high ratings (potential bias in sentiment analysis).
- Some products likely have many reviews (ASINs reused in reviews dataset).
### Data cleaning
"""
def clean_metadata(metadata):
# Create a clean copy
df = metadata.copy()
# Handle missing values for key categorical features
df['color'] = df['color'].fillna('Unknown')
df['brand'] = df['brand'].fillna('Unbranded')
df['sku'] = df['sku'].fillna('Missing_SKU')
# Convert price safely with multiple fallbacks
def safe_price_convert(price_str):
if pd.isna(price_str):
return np.nan
try:
# Handle "Too low to display" and similar cases
if isinstance(price_str, str):
if any(phrase in price_str.lower() for phrase in ['too low', 'na', 'not available']):
return np.nan
# Extract first price if multiple exist
price_match = re.search(r'\$\d+\.?\d*', price_str)
if price_match:
return float(price_match.group(0).replace('$', ''))
return float(price_str)
except (ValueError, TypeError):
return np.nan
df['price'] = df['formatted_price'].apply(safe_price_convert)
# Impute missing prices with median by product type
df['price'] = df.groupby('product_type_name')['price'].transform(
lambda x: x.fillna(x.median()))
# If still missing, use overall median
df['price'] = df['price'].fillna(df['price'].median())
# Clean and extract shipping information
def extract_shipping_info(avail_str):
if pd.isna(avail_str):
return 7, False # Default 7 days, not in stock
avail_str = str(avail_str).lower()
if 'now' in avail_str:
return 0, True
numbers = re.findall(r'\d+', avail_str)
days = int(numbers[0]) if numbers else 7
in_stock = 'out of stock' not in avail_str
return days, in_stock
shipping_results = df['availability'].apply(extract_shipping_info)
df['shipping_days'] = shipping_results.apply(lambda x: x[0])
df['in_stock'] = shipping_results.apply(lambda x: x[1])
# Clean image URLs - keep only first if multiple exist
for img_col in ['large_image_url', 'medium_image_url', 'small_image_url']:
if img_col in df.columns:
df[img_col] = df[img_col].str.split(',').str[0]
# Drop unnecessary columns
cols_to_drop = ['formatted_price', 'availability', 'availability_type',
'editorial_review', 'editorial_reivew', 'publisher', 'author',
'reviews'] # These columns have many missing values
df = df.drop([col for col in cols_to_drop if col in df.columns], axis=1)
# Final missing value check
print("Remaining missing values after cleaning:")
print(df.isna().sum())
return df
cleaned_metadata = clean_metadata(metadata)
# Save the cleaned file
os.makedirs("assets", exist_ok=True)
cleaned_metadata.to_csv("assets/cleaned_metadata.csv", index=False)
def clean_reviews(reviews):
df = reviews.copy()
# Ensure critical columns exist
required_cols = ['asin', 'review_userId', 'review_score', 'review_text']
for col in required_cols:
if col not in df.columns:
raise ValueError(f"Required column {col} missing from reviews data")
# Clean score - convert to numeric, handle outliers
df['review_score'] = pd.to_numeric(df['review_score'], errors='coerce')
df = df[df['review_score'].between(1, 5, inclusive='both')]
# Handle missing text
df['review_text'] = df['review_text'].fillna('')
df['review_summary'] = df['review_summary'].fillna('')
# Create helpful features
df['review_length'] = df['review_text'].str.len()
df['has_summary'] = df['review_summary'].str.len() > 0
# Convert date if available (example column)
if 'review_date' in df.columns:
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')
# Clean user IDs
df['review_userId'] = df['review_userId'].fillna('Anonymous')
df['review_userId'] = df['review_userId'].str.strip()
# Drop completely empty columns
df = df.dropna(axis=1, how='all')
print("\nReviews missing values after cleaning:")
print(df.isna().sum())
return df
cleaned_reviews = clean_reviews(reviews)
"""## Study 1 – Similarity measures"""
# Merge data
products = pd.merge(
cleaned_metadata,
cleaned_reviews.groupby('asin').agg({
'review_score': 'mean',
'review_text': lambda x: ' '.join(x)
}).reset_index(),
on='asin',
how='left'
)
# Sample 100 products for demonstration
sample_products = products.dropna().sample(100, random_state=42).copy()
sample_products
"""### Brand Similarity (Jaccard)
The algorithm uses Jaccard similarity to measure brand similarity by comparing word overlap between brands. It ranks products based on how many brand words match relative to the total unique words.
"""
def jaccard_similarity(query_brand, comparison_set):
query_set = set(query_brand.split())
similarity_scores = []
for brand in comparison_set:
if pd.isna(brand):
similarity_scores.append(0)
continue
comp_set = set(str(brand).split())
intersection = query_set.intersection(comp_set)
union = query_set.union(comp_set)
similarity_scores.append(len(intersection)/len(union) if union else 0)
return similarity_scores
def similar_brands(query_product_id, top_n=10):
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0]
query_brand = query_product['brand']
sample_products['brand_similarity'] = jaccard_similarity(
query_brand,
sample_products['brand']
)
return sample_products.sort_values('brand_similarity', ascending=False)[[
'asin', 'title', 'brand', 'price', 'review_score', 'brand_similarity'
]].head(top_n)
"""### Price Similarity (Euclidean)
This algorithm calculates price similarity using inverse absolute difference, giving higher scores to prices closer to the query product's price. It ranks products based on how near their prices are to the target price.
"""
def price_similarity(query_price, comparison_prices):
return 1 / (1 + np.abs(query_price - comparison_prices))
def similar_prices(query_product_id, top_n=10):
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0]
query_price = query_product['price']
sample_products['price_similarity'] = price_similarity(
query_price,
sample_products['price']
)
return sample_products.sort_values('price_similarity', ascending=False)[[
'asin', 'title', 'brand', 'price', 'review_score', 'price_similarity'
]].head(top_n)
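"""The inverse absolute difference above decays quickly once prices differ by more than a few dollars. A log-scaled variant (a sketch only, not used in the results that follow) compares prices on a relative rather than absolute scale, which behaves more gracefully across broad price ranges."""
def log_price_similarity(query_price, comparison_prices):
    # Compare log-prices so that $10 vs $20 is treated like $100 vs $200
    return 1 / (1 + np.abs(np.log1p(query_price) - np.log1p(comparison_prices)))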
"""### Color Similarity (Hamming)
This algorithm uses normalized Hamming distance to compare color strings, measuring character-level similarity. It ranks products based on how closely their color descriptions match the query product's color.
"""
def hamming_similarity(query_color, comparison_colors):
query_color = str(query_color).lower()
similarity_scores = []
for color in comparison_colors:
color = str(color).lower()
max_len = max(len(query_color), len(color))
if max_len == 0:
similarity_scores.append(0)
continue
distance = sum(c1 != c2 for c1, c2 in zip(query_color.ljust(max_len), color.ljust(max_len)))
similarity_scores.append(1 - distance/max_len)
return similarity_scores
def similar_colors(query_product_id, top_n=10):
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0]
query_color = query_product['color']
sample_products['color_similarity'] = hamming_similarity(
query_color,
sample_products['color']
)
return sample_products.sort_values('color_similarity', ascending=False)[[
'asin', 'title', 'color', 'brand', 'review_score', 'color_similarity'
]].head(top_n)
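"""Hamming distance compares colors character by character, so multi-word colors such as "Night 846" only partially match "Night" (a limitation noted in the analysis below). A word-level Jaccard variant, sketched here as an alternative rather than part of the study, treats shared tokens as full matches."""
def color_token_similarity(query_color, comparison_colors):
    # Jaccard similarity over lower-cased color words
    query_tokens = set(str(query_color).lower().split())
    scores = []
    for color in comparison_colors:
        tokens = set(str(color).lower().split())
        union = query_tokens | tokens
        scores.append(len(query_tokens & tokens) / len(union) if union else 0)
    return scores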
"""### Title Similarity (Levenshtein)
This algorithm uses Levenshtein distance to measure title similarity by counting character insertions, deletions, or substitutions needed to match strings. It ranks products based on how closely their titles resemble the query product's title, normalized by length.
"""
def title_similarity(query_title, comparison_titles):
query_title = str(query_title).lower()
similarity_scores = []
for title in comparison_titles:
title = str(title).lower()
max_len = max(len(query_title), len(title))
if max_len == 0:
similarity_scores.append(0)
continue
distance = levenshtein_distance(query_title, title)
similarity_scores.append(1 - distance/max_len)
return similarity_scores
def similar_titles(query_product_id, top_n=10):
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0]
query_title = query_product['title']
sample_products['title_similarity'] = title_similarity(
query_title,
sample_products['title']
)
return sample_products.sort_values('title_similarity', ascending=False)[[
'asin', 'title', 'brand', 'price', 'review_score', 'title_similarity'
]].head(top_n)
"""### Review Text Similarity (TF-IDF Cosine)
This algorithm uses TF-IDF vectorization and cosine similarity to compare review texts, identifying products with semantically similar reviews. It ranks products based on how closely their review content matches the query product's review in terms of key terms and their importance.
"""
def prepare_tfidf_matrix(texts):
    """Fit a TF-IDF vectorizer and return it along with the document-term matrix."""
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    return tfidf, tfidf.fit_transform(texts.fillna(''))
def review_similarity(query_text, tfidf, tfidf_matrix):
    """Cosine similarity between a query text and every row of the TF-IDF matrix."""
    query_vec = tfidf.transform([query_text])
    return cosine_similarity(query_vec, tfidf_matrix).flatten()
def similar_reviews(query_product_id, top_n=10):
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0]
query_text = query_product['review_text']
# Prepare TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(sample_products['review_text'].fillna(''))
# Transform query
query_vec = tfidf.transform([query_text])
# Calculate similarities
sample_products['review_similarity'] = cosine_similarity(
query_vec,
tfidf_matrix
).flatten()
return sample_products.sort_values('review_similarity', ascending=False)[[
'asin', 'title', 'brand', 'review_score', 'review_similarity'
]].head(top_n)
"""### Simulation Requests"""
# Get random product IDs for our queries
query_products = sample_products.sample(5, random_state=42)
print("=== Similarity Study Results ===")
# 1. Show me products with similar brands to [product]
product_id = query_products.iloc[0]['asin']
print(f"\n1. Products with similar brands to '{query_products.iloc[0]['title']}':")
#display(similar_brands(product_id))
# 2. Show me products with similar prices to [product]
product_id = query_products.iloc[1]['asin']
print(f"\n2. Products with similar prices to '{query_products.iloc[1]['title']}':")
#display(similar_prices(product_id))
# 3. Show me products with similar colors to [product]
product_id = query_products.iloc[2]['asin']
print(f"\n3. Products with similar colors to '{query_products.iloc[2]['title']}':")
#display(similar_colors(product_id))
# 4. Show me products with similar titles to [product]
product_id = query_products.iloc[3]['asin']
print(f"\n4. Products with similar titles to '{query_products.iloc[3]['title']}':")
#display(similar_titles(product_id))
# 5. Show me products with similar reviews to [product]
product_id = query_products.iloc[4]['asin']
print(f"\n5. Products with similar reviews to '{query_products.iloc[4]['title']}':")
#display(similar_reviews(product_id))
"""#### Analysis of Similarity Study Results
**1. Brand Similarity**
The Jaccard similarity successfully identified exact brand matches (100% similarity for "Jonathan Corey") while correctly giving 0 similarity to unrelated brands. However, it fails to detect potential partial matches or parent/subsidiary brand relationships that might be helpful.
**2. Price Similarity**
The inverse distance method effectively clustered products within a small price range (±$1), with similarity scores dropping rapidly beyond that threshold. This works well for exact price matches but could be improved with logarithmic scaling for broader price categories (a variant of this is sketched after the price similarity function above).
**3. Color Similarity**
The Hamming distance approach perfectly matched identical color descriptions ("Light Blue"), but showed limitations with:
- Only 50% similarity for colors containing the same root word ("Night" vs "Night 846")
- Poor handling of conceptually similar but lexically different colors (e.g., "Pink" vs "Light Blue")
**4. Title Similarity**
Levenshtein distance achieved perfect matching for identical listings while effectively ranking:
- Partial matches with shared keywords ("blouse", "shoulder")
- Same product in different colors/sizes
- Gradually decreasing scores for more distant product types
**5. Review Similarity**
The TF-IDF/cosine similarity approach showed:
- Perfect match for identical products
- Moderate similarity for reviews mentioning fabric types ("silk", "pique")
- Low but non-zero scores for unrelated products, suggesting the vector space can capture some latent semantic relationships
## Study 2 – Clustering algorithms
"""
# Merge with review scores
products = pd.merge(
cleaned_metadata,
cleaned_reviews.groupby('asin')['review_score'].mean().reset_index(),
on='asin',
how='left'
)
# Calculate popularity metrics
products['brand_popularity'] = products.groupby('brand')['brand'].transform('count')
products['color_popularity'] = products.groupby('color')['color'].transform('count')
# Select relevant columns and drop missing values
cluster_data = products[['price', 'review_score', 'brand_popularity', 'color_popularity']].dropna()
# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cluster_data)
# KMeans with different k values
plt.figure(figsize=(15, 5))
# k=3
plt.subplot(1, 2, 1)
kmeans3 = KMeans(n_clusters=3, random_state=42)
clusters = kmeans3.fit_predict(scaled_data[:, [0, 1]]) # Price and Review Score
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis')
plt.title('KMeans (k=3) - Price vs Review Score')
plt.xlabel('Price')
plt.ylabel('Review Score')
# k=5
plt.subplot(1, 2, 2)
kmeans5 = KMeans(n_clusters=5, random_state=42)
clusters = kmeans5.fit_predict(scaled_data[:, [0, 1]])
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis')
plt.title('KMeans (k=5) - Price vs Review Score')
plt.xlabel('Price')
plt.tight_layout()
plt.show()
"""
The scatter plots above visualize the results of KMeans clustering applied to a dataset with two features: **Price** (x-axis) and **Review Score** (y-axis). Two clustering configurations are compared: `k=3` and `k=5`.
#### 1. **Customer Segmentation Patterns**
- A large concentration of data points exists in the **low-price, high-review** segment. This suggests that a significant portion of items (e.g., products, services) are **affordable and well-rated**, potentially indicating strong value for money.
- Conversely, there are items with **low prices but low review scores**, representing a segment of potentially low-quality, budget offerings.
- High-priced items are relatively rare and scattered across different review scores, indicating **price alone does not guarantee high customer satisfaction**.
#### 2. **Non-Linear Relationship Between Price and Quality**
- The clusters do not align along a simple trend line (e.g., increasing price with increasing review score). Instead, they show that **review score and price are not strongly correlated**.
- This suggests that **higher prices do not consistently lead to better reviews**, and customers may find high value in lower-priced items.
#### 3. **Discrete Review Score Distribution**
- The presence of clear horizontal bands in both plots implies that review scores are **categorical or ordinal** rather than continuous. This may reflect the use of a **fixed-scale rating system** (e.g., 1 to 5 stars), which influences how clustering interprets vertical groupings.
#### 4. **Effect of Varying k**
- With `k=3`, the model captures **general segments**: (a) low price–low rating, (b) low price–high rating, and (c) higher price products.
- With `k=5`, the model reveals **finer-grained distinctions** within those segments. For example, it identifies subgroups within the high review score group that differ by price, and isolates extreme outliers (e.g., high price–low rating).
#### 5. **Presence of Outliers**
- A small number of points are priced extremely high (e.g., above $400). These do not form large clusters, suggesting they are **niche offerings** or **premium products** that deviate from the rest of the dataset.
- These outliers could impact clustering quality if not treated or analyzed separately.
Overall, the clustering analysis reveals that:
- **Most offerings are low-priced and highly rated**, indicating a market saturated with strong low-cost options.
- **Price is not a reliable indicator of review score**, so other features (e.g., brand, product category, location) may be important to model user satisfaction.
- The data supports both general segmentation (k=3) and more refined subgroup analysis (k=5), depending on the use case.
"""
# DBSCAN with different parameters
plt.figure(figsize=(15, 5))
# eps=0.5, min_samples=5
plt.subplot(1, 2, 1)
dbscan1 = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan1.fit_predict(scaled_data[:, [0, 1]])
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis')
plt.title('DBSCAN (eps=0.5, min=5) - Price vs Review Score')
plt.xlabel('Price')
plt.ylabel('Review Score')
# eps=1.0, min_samples=10
plt.subplot(1, 2, 2)
dbscan2 = DBSCAN(eps=1.0, min_samples=10)
clusters = dbscan2.fit_predict(scaled_data[:, [0, 1]])
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis')
plt.title('DBSCAN (eps=1.0, min=10) - Price vs Review Score')
plt.xlabel('Price')
plt.tight_layout()
plt.show()
"""
The plots above show clustering results using **DBSCAN (Density-Based Spatial Clustering of Applications with Noise)**. The two plots compare results using different parameter configurations for price and review:
#### Left Plot: DBSCAN (eps = 0.5, min_samples = 5)
- The majority of the data points are grouped into **a single dense cluster**, primarily composed of **low to mid-price, high-review score** entries.
- A few **small, separate clusters** form among high-priced items with high review scores.
- A significant number of **points are labeled as noise** (purple), especially in **higher price regions** or in **sparse review score segments**.
- This configuration captures local groupings well but **over-labels sparse yet valid data as noise**, especially pricing outliers.
#### Right Plot: DBSCAN (eps = 1.0, min_samples = 10)
- A **single dominant cluster** is formed that captures almost all data points, regardless of price or review score.
- Very few points are labeled as noise, indicating that the increased radius (`eps`) and required density (`min_samples`) lead to a **broader, less selective clustering**.
- This setting likely **masks meaningful substructure** in the data by treating almost everything as part of one large group.
- The lack of distinct clusters suggests the data may be **globally dense but lacking the local density variation** that DBSCAN can exploit.
### What These Results Reveal About the Data
#### 1. **High-Density Core Around Low Price & High Review**
- Both parameter settings highlight a **dense cluster** of items with **low prices and high review scores**, confirming a dominant segment in the data where products are affordable and well-reviewed.
- This suggests that in this market, **value products dominate**, and this group may reflect a competitive or saturated segment.
#### 2. **Sparse Distribution of High-Price Items**
- High-priced items are **sparsely distributed** across the dataset and are often marked as **outliers** by DBSCAN.
- This implies that **premium offerings are rare and isolated**, possibly catering to niche customer bases.
#### 3. **Review Scores Are Not Density Drivers**
- Despite the discrete nature of review scores, DBSCAN does not form distinct horizontal clusters along review bands.
- This suggests that **review score alone does not define local density**, and DBSCAN prioritizes **price variations** more heavily in this context.
Overall, the DBSCAN analysis reveals:
- The dataset has one dominant group of **low-cost, high-review** items.
- High-priced items are generally **isolated and not naturally clusterable**, as shown by their frequent classification as outliers.
- DBSCAN is useful here for **identifying core clusters and outliers**, but it struggles to find multiple natural groupings unless there’s strong local density variation.
- **Alternative clustering algorithms (e.g., KMeans or hierarchical clustering)** may be more appropriate when the goal is to **partition the dataset into interpretable segments**.
"""
# KMeans with different k values
plt.figure(figsize=(15, 5))
# k=4
plt.subplot(1, 2, 1)
kmeans4 = KMeans(n_clusters=4, random_state=42)
clusters = kmeans4.fit_predict(scaled_data[:, [2, 3]]) # Brand and Color Popularity
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma')
plt.title('KMeans (k=4) - Brand vs Color Popularity')
plt.xlabel('Brand Popularity')
plt.ylabel('Color Popularity')
# k=6
plt.subplot(1, 2, 2)
kmeans6 = KMeans(n_clusters=6, random_state=42)
clusters = kmeans6.fit_predict(scaled_data[:, [2, 3]])
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma')
plt.title('KMeans (k=6) - Brand vs Color Popularity')
plt.xlabel('Brand Popularity')
plt.tight_layout()
plt.show()
"""
The scatter plots above visualize the results of KMeans clustering applied to a dataset with two features: **Brand Popularity** (x-axis) and **Color Popularity** (y-axis). Two clustering configurations are compared: `k=4` and `k=6`.
#### General Observations:
- Data points are concentrated in **three distinct horizontal bands** on the y-axis (Color Popularity): near 0, ~3000, and ~5000.
- This suggests that color popularity has **three major popularity levels**, potentially indicating fixed tiers (e.g., low, medium, high demand for certain color groups).
- Brand popularity is more **evenly distributed** and continuous, with values ranging from 0 to above 120.
#### KMeans with k = 4:
- The model identifies clusters based primarily on **Brand Popularity** within each Color Popularity tier.
- In the lowest Color Popularity tier (y ≈ 0–1000), three clusters are visible based on varying levels of Brand Popularity.
- High Color Popularity bands (y ≈ 3000 and 5000) are grouped more broadly, suggesting less differentiation in Brand Popularity at those levels.
- This implies that **color popularity may dominate** the cluster formation, especially in higher tiers.
#### KMeans with k = 6:
- Increasing k introduces **finer distinctions** within the bands:
- The low Color Popularity band is now split into multiple clusters across Brand Popularity.
- The mid and high Color Popularity bands are also divided, allowing segmentation based on brand performance even among highly popular colors.
- This configuration reveals more **detailed market segmentation**, showing that even within popular color categories, **brand popularity can vary significantly**.
### What These Results Reveal About the Data
#### 1. **Color Popularity Exhibits Discrete Behavior**
- The presence of horizontal bands strongly suggests that Color Popularity is either **categorical** or derived from fixed-count groupings.
- This could mean colors are grouped by tiers (e.g., top 10 most popular, moderate interest, niche colors).
#### 2. **Brand Popularity Drives Subgroup Variation**
- Within each color tier, Brand Popularity introduces meaningful variation.
- Clusters along the x-axis (Brand Popularity) reflect **differentiation between strong, mid-tier, and weak brands** within each color group.
#### 3. **Increasing k Uncovers Latent Substructure**
- The shift from 4 to 6 clusters shows that the dataset supports **finer segmentation**, particularly helpful for identifying micro-markets or personalized targeting.
- This may be useful in **marketing or inventory strategies**, such as deciding which color-brand combinations to promote or produce more of.
#### 4. **Low Color Popularity Shows Greatest Brand Spread**
- The widest spread of Brand Popularity is found in the lowest color tier (bottom band), indicating that **brands try to differentiate themselves in less popular color spaces**.
- This could suggest **brand experimentation or niche targeting** in low-demand color segments.
Overall, the clustering analysis reveals that:
- **Color Popularity appears categorical or tiered**, heavily influencing the overall clustering structure.
- **Brand Popularity introduces meaningful variation within each color tier**, especially in the low and mid segments.
- Using a higher value of `k` (e.g., 6) captures more detailed behavior patterns, allowing **richer segmentation** for strategic decision-making in areas like marketing, design, or inventory management.
"""
# DBSCAN with different parameters
plt.figure(figsize=(15, 5))
# eps=0.3, min_samples=5
plt.subplot(1, 2, 1)
dbscan3 = DBSCAN(eps=0.3, min_samples=5)
clusters = dbscan3.fit_predict(scaled_data[:, [2, 3]])
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma')
plt.title('DBSCAN (eps=0.3, min=5) - Brand vs Color Popularity')
plt.xlabel('Brand Popularity')
plt.ylabel('Color Popularity')
# eps=0.5, min_samples=10
plt.subplot(1, 2, 2)
dbscan4 = DBSCAN(eps=0.5, min_samples=10)
clusters = dbscan4.fit_predict(scaled_data[:, [2, 3]])
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma')
plt.title('DBSCAN (eps=0.5, min=10) - Brand vs Color Popularity')
plt.xlabel('Brand Popularity')
plt.tight_layout()
plt.show()
"""The above scatter plots display clustering results using the DBSCAN algorithm on features **Brand Popularity** (x-axis) and **Color Popularity** (y-axis). Two parameter configurations are compared to understand how density-based clustering interacts with the structure of the data.
#### Left Plot: DBSCAN (eps = 0.3, min_samples = 5)
- **Three main clusters** are detected, corresponding almost exactly to three distinct horizontal bands in color popularity (~0–1000, ~3000, ~5000).
- DBSCAN is able to **clearly separate color popularity tiers** due to their high intra-group density and large inter-group spacing.
- The clustering shows minimal segmentation based on brand popularity within each band.
- This suggests that, under tighter density constraints, **Color Popularity dominates** as the primary clustering feature.
#### Right Plot: DBSCAN (eps = 0.5, min_samples = 10)
- The broader radius (`eps = 0.5`) and higher density requirement (`min_samples = 10`) result in **three broader clusters**, once again aligned with the three tiers of color popularity.
- More points are included in clusters (fewer outliers), but **brand popularity still plays a minimal role** in segmentation.
- The result reinforces the observation that the data’s natural clustering structure is **horizontally stratified** (based on color tiers), rather than vertically or diagonally segmented by brand.
### What These Results Reveal About the Data
#### 1. **Color Popularity Has Strong Tier-Based Structure**
- DBSCAN easily isolates the three dominant horizontal bands, confirming that Color Popularity follows a **discrete or tiered structure**, likely corresponding to fixed popularity levels or thresholds.
- These bands are consistent across parameter variations, indicating **high local density** within each tier.
#### 2. **Brand Popularity is Continuously Distributed**
- Unlike color popularity, brand popularity shows **no natural density-based groupings** and appears more uniformly spread.
- DBSCAN is unable to meaningfully cluster points based on brand popularity alone, suggesting brand popularity behaves more like a **continuous or evenly distributed variable**.
#### 3. **Clusters are Robust to Parameter Change**
- The overall structure of the clusters is **stable across different DBSCAN configurations**, suggesting that the data has a **strong, intrinsic density-based structure**.
- DBSCAN naturally ignores noise and outliers without forcing cluster assignments, making it well-suited for separating out **distinct, dense subgroups** like the color tiers.
#### 4. **KMeans vs DBSCAN: Key Differences**
- Compared to KMeans, which segmented along both axes, DBSCAN focuses on **density within localized areas**, revealing that only **Color Popularity exhibits true density-based clustering**.
- KMeans may be more useful for **market segmentation** involving continuous brand dynamics, while DBSCAN is better for **identifying fixed or bounded clusters**, such as popularity tiers.
Overall, the DBSCAN analysis reveals:
- The data reveals **three strong, dense groupings in Color Popularity**, which DBSCAN captures effectively.
- **Brand Popularity lacks density-based clustering structure**, and is better interpreted as a continuous variable.
- DBSCAN is effective in detecting **tiered categorical-like patterns**, but less useful when segmentation depends on evenly distributed features like brand popularity.
## Study 3 – Content-Based Recommendation System
"""
# Ensure we have the necessary columns
print("Available columns in products DataFrame:")
print(products.columns.tolist())
# Create necessary features if they don't exist
if 'brand_popularity' not in products.columns:
products['brand_popularity'] = products.groupby('brand')['brand'].transform('count')
if 'color_popularity' not in products.columns:
products['color_popularity'] = products.groupby('color')['color'].transform('count')
# Clean and prepare the data
products['price'] = pd.to_numeric(products['price'], errors='coerce')
products['review_score'] = pd.to_numeric(products['review_score'], errors='coerce')
products = products.dropna(subset=['price', 'review_score'])
"""This hybrid recommendation algorithm combines price proximity, review score similarity, and brand similarity into a weighted score to suggest products. It uses inverse distance for price/review comparisons and Jaccard similarity for brand name matching. The final recommendations balance affordability (50% weight), quality (30%), and brand relevance (20%) while excluding the query product itself.
"""
def price_quality_recommendations(query_asin, df=products, top_n=10):
"""Recommend products based on price, review score, and brand similarity"""
query = df[df['asin'] == query_asin].iloc[0]
# Calculate similarities
df['price_sim'] = 1 / (1 + np.abs(df['price'] - query['price']))
df['review_sim'] = 1 / (1 + np.abs(df['review_score'] - query['review_score']))
# Jaccard similarity for brand
query_brand_words = set(str(query['brand']).lower().split())
df['brand_sim'] = df['brand'].apply(
lambda x: len(query_brand_words.intersection(set(str(x).lower().split()))) /
len(query_brand_words.union(set(str(x).lower().split()))) if pd.notna(x) else 0
)
# Combined score (weighted average)
df['pq_score'] = 0.5*df['price_sim'] + 0.3*df['review_sim'] + 0.2*df['brand_sim']
# Return top recommendations (excluding query item)
return df[df['asin'] != query_asin].sort_values('pq_score', ascending=False)[[
'asin', 'title', 'brand', 'price', 'review_score', 'pq_score'
]].head(top_n)
"""This algorithm recommends stylistically similar products by combining title similarity (40% weight using Levenshtein distance), color matching (30% exact match), and brand popularity alignment (30% inverse distance). It identifies items with comparable descriptions, matching colors, and similarly prestigious brands while excluding the query product itself."""
def style_popularity_recommendations(query_asin, df=products, top_n=10):
    """Recommend products based on title similarity, color, and brand popularity"""
    query = df[df['asin'] == query_asin].iloc[0]
    # Title similarity (Levenshtein)
    query_title = str(query['title']).lower()
    df['title_sim'] = df['title'].apply(
        lambda x: 1 - (levenshtein_distance(query_title, str(x).lower()) /
                       max(len(query_title), len(str(x)), 1))
    )
    # Color similarity (exact match)
    query_color = str(query['color']).lower()
    df['color_sim'] = (df['color'].str.lower() == query_color).astype(float)
    # Brand popularity similarity
    df['brand_pop_sim'] = 1 / (1 + np.abs(df['brand_popularity'] - query['brand_popularity']))
    # Combined score (weighted average)
    df['sp_score'] = 0.4*df['title_sim'] + 0.3*df['color_sim'] + 0.3*df['brand_pop_sim']
    # Return top recommendations (excluding query item)
    return df[df['asin'] != query_asin].sort_values('sp_score', ascending=False)[[
        'asin', 'title', 'color', 'brand', 'brand_popularity', 'sp_score'
    ]].head(top_n)
sample_products = products.dropna(subset=['title', 'brand', 'color']).sample(3, random_state=42)
for i, (_, row) in enumerate(sample_products.iterrows(), 1):
print(f"\n=== Query {i} ===")
print(f"Product: {row['title']}")
print(f"Brand: {row['brand']}, Color: {row['color']}")
print(f"Price: ${row['price']:.2f}, Rating: {row['review_score']:.1f}")
print("\nPrice & Quality Recommendations:")
#display(price_quality_recommendations(row['asin']))
print("\nStyle & Popularity Recommendations:")
#display(style_popularity_recommendations(row['asin']))
"""### Analysis of Results
#### 1. Query 1 — *Belle By Badgley Mischka Top ($87.99, Rating: 2.0)*
**Price & Quality Recommendations:**
- Mostly same-brand items or similar premium brands (e.g., Bailey 44, Josie Natori)
- Closely matched prices ($87–$88 range)
- Review scores varied significantly (1.0–5.0), despite the low query rating
- Top match was same brand and price but with a better rating (4.0)
**Style & Popularity Recommendations:**
- Focused on the same brand and other high-end designers
- Similar color palette (e.g., black, coral)
- All from brands with popularity score **9** (highest tier)
- Strong brand consistency but limited style diversity
#### 2. Query 2 — *G.H. Bass Striped Tee ($26.96, Rating: 4.3)*
**Price & Quality Recommendations:**
- Excellent price matching (±$0.25 in top results)
- High and consistent review scores (4.25–4.52)
- Diverse range of styles (e.g., lace, baby doll, athletic)
- Included both casual and dressy options
**Style & Popularity Recommendations:**
- Greater brand diversity compared to Query 1
- Focused on similar striped patterns
- All from **mid-popularity brands** (score 3)
- Color matching was less precise than with premium brand queries
#### 3. Query 3 — *JSDY Bohemian Blouse ($15.88, Rating: 5.0)*
**Price & Quality Recommendations:**
- Near-perfect price matching (±$0.11 in top result)
- All recommended items had perfect 5.0 ratings
- Strong style consistency (bohemian aesthetics)
- Included floral and chiffon designs
**Style & Popularity Recommendations:**
- Excellent color matching (all white tops)
- Featured batwing/flowy sleeves similar to the query
- All from **lower-popularity brands** (score 3)
- More niche/indie labels than in other queries
**Notes**
1. **Price matching** is most effective at **lower price points**
2. Queries with **high-end brands** produce **brand-loyal recommendations**
3. Items with **perfect review scores** receive similarly rated suggestions
4. **Brand popularity scores** establish **clear tier separation**
5. **Pandas warnings** indicate opportunities for **code optimization**
## Study 4 – Collaborative Filtering Recommendation System
"""
# 1. Create utility matrix
print("Creating user-item matrix...")
user_item_matrix = cleaned_reviews.pivot_table(
index='review_userId',
columns='asin',
values='review_score',
fill_value=0
)
# 2. Convert to sparse matrix format
R = csr_matrix(user_item_matrix.values)
user_ids = user_item_matrix.index.values
item_ids = user_item_matrix.columns.values
# 3. Train-test split (90-10)
print("Splitting data into train and test sets...")
train_reviews, test_reviews = train_test_split(
cleaned_reviews,
test_size=0.1,
random_state=42
)
# Create sparse matrices for train and test
def create_sparse_matrix(df, user_ids, item_ids):
"""Create sparse matrix from DataFrame"""
user_mapping = {u: i for i, u in enumerate(user_ids)}
item_mapping = {m: i for i, m in enumerate(item_ids)}
rows = df['review_userId'].map(user_mapping)
cols = df['asin'].map(item_mapping)
values = df['review_score']
return coo_matrix((values, (rows, cols)), shape=(len(user_ids), len(item_ids)))
R_train = create_sparse_matrix(train_reviews, user_ids, item_ids)
R_test = create_sparse_matrix(test_reviews, user_ids, item_ids)
# Convert to CSR format for efficient operations
R_train = R_train.tocsr()
R_test = R_test.tocsr()
# 4. Matrix Factorization using NMF
print("Performing matrix factorization...")
n_factors = 15
model = NMF(
n_components=n_factors,
init='random',
random_state=42,
max_iter=500
)
P = model.fit_transform(R_train) # User factors (n_users x n_factors)
Q = model.components_ # Item factors (n_factors x n_items)
# 5. Evaluation Functions
def evaluate_predictions(P, Q, test_matrix):
"""Calculate MSE between predicted and actual ratings"""
test_coo = test_matrix.tocoo()
predicted = np.array([P[row].dot(Q[:, col])
for row, col in zip(test_coo.row, test_coo.col)])
actual = test_coo.data
return mean_squared_error(actual, predicted)
# 6. Perform evaluation
print("Evaluating recommendations...")
test_coo = R_test.tocoo()
test_predicted = np.array([P[row].dot(Q[:, col])
for row, col in zip(test_coo.row, test_coo.col)])
test_actual = test_coo.data
mse = evaluate_predictions(P, Q, R_test)
print(f"MSE: {mse:.4f}")
"""This function generates personalized product recommendations for a given user based on matrix factorization (using matrices P and Q). It predicts ratings for unrated items by the user, filters out already rated products, and returns the top-k recommendations with the highest predicted ratings. The output includes product details like title, brand, and price, sorted by predicted rating in descending order. If the user ID isn't found, it returns None with an error message.
"""
# 7. Recommendation Function
def get_recommendations(user_id, P, Q, user_item_matrix, product_data, k=10):
"""Generate top-k recommendations for a user"""
if user_id not in user_ids:
print(f"User {user_id} not found in dataset")
return None
user_idx = np.where(user_ids == user_id)[0][0]
user_ratings = user_item_matrix.values[user_idx]
# Predict ratings for all items
predicted_ratings = P[user_idx].dot(Q)
# Filter out already rated items
unrated_mask = user_ratings == 0
predicted_unrated = predicted_ratings[unrated_mask]
unrated_items = item_ids[unrated_mask]
# Get top-k recommendations
top_indices = np.argsort(predicted_unrated)[-k:][::-1]
recommendations = pd.DataFrame({
'asin': unrated_items[top_indices],
'predicted_rating': predicted_unrated[top_indices]
}).merge(product_data[['asin', 'title', 'brand', 'price']], on='asin')
return recommendations.sort_values('predicted_rating', ascending=False)
# 8. Generate sample recommendations
sample_users = cleaned_reviews['review_userId'].drop_duplicates().sample(3, random_state=42)
for user_id in sample_users:
print(f"\n=== Recommendations for User {user_id} ===")
# Show user's top rated items
user_ratings = cleaned_reviews[cleaned_reviews['review_userId'] == user_id]
top_rated = user_ratings.sort_values('review_score', ascending=False).head(3)
print("\nUser's top rated items:")
#display(top_rated[['asin', 'title', 'review_score']].merge(
#cleaned_metadata[['asin', 'brand', 'price']], on='asin'))
# Generate recommendations
recs = get_recommendations(user_id, P, Q, user_item_matrix, cleaned_metadata)
print("\nTop recommended items:")
#display(recs)
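"""The per-user analysis below flags a cold-start case: a user with only one rated item receives flat, zero predicted ratings. A common mitigation, sketched here as an assumption-level workaround (not part of the study) using the cleaned_reviews and cleaned_metadata frames from above, is to fall back to globally popular, well-rated items when a user has too little history."""
def popular_item_fallback(reviews_df=cleaned_reviews, product_data=cleaned_metadata, k=10, min_reviews=5):
    """Top-k items by average rating among products with at least `min_reviews` reviews."""
    stats = reviews_df.groupby('asin')['review_score'].agg(['mean', 'count'])
    popular = stats[stats['count'] >= min_reviews].sort_values('mean', ascending=False).head(k)
    return popular.reset_index().merge(product_data[['asin', 'title', 'brand', 'price']], on='asin')
# Example usage: back off to popular items when a user has fewer than 3 ratings
# recs = popular_item_fallback() if len(user_ratings) < 3 else recs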
"""#### Analysis of Result
1. User AJU5WLLEHPZ3X
- Profile: Shows preference for affordable women's graphic tees ($5-$10 range)
- Top rated items: Special Olympics and My Chemical Romance themed shirts
- Recommendations:
* Mostly fashion items priced $12-$40
* Includes sportswear (Under Armour tanks)
* Some boho-style items (floral caftan, kimono cover-up)
* Predicted ratings show strong confidence (1.4-2.9 range)
* Price points higher than user's purchase history
2. User A35TTLP3VIFEOB
- Profile: Purchased budget-friendly women's tops and band merch
- Top rated items: Color-block striped top and concert poster shirt
- Recommendations:
* Mix of fashion styles (lace, sheer, striped tops)
* Includes both affordable ($12) and premium ($90) items
* One clearly inappropriate recommendation (adult toy)
* Predicted ratings show moderate confidence (1.0-2.1 range)
* Some brand consistency (Under Armour appears in both history and recs)
3. User A1P9JI4TR933JX
- Profile: Only one rated item (velvet hi-low top)
- Top rated item: $9.49 fashion top
- Recommendations:
* All predictions show zero confidence (0.0 ratings)
* Wide variety of styles with no clear pattern
* Price points vary dramatically ($8-$59)
* Includes some premium brands (Jessica Simpson)
* Demonstrates classic cold-start problem
System Performance Analysis:
- Works best for users with multiple purchases
- Shows some ability to match styles (graphic tees → sportswear)
- Struggles with:
* Price point matching
* Niche preferences
* Users with limited history
* Filtering inappropriate content
## *References*
"Collaborative Filtering in Python" – Krish Naik
https://www.youtube.com/watch?v=3ja4C3BWeEg
"Matrix Factorization for Recommendations" – Aladdin Persson
https://www.youtube.com/watch?v=ZspR5PZemcs
"Content-Based RecSys with TF-IDF & Cosine Similarity" – Data Science Dojo
https://www.youtube.com/watch?v=XoTwndOgXBM
"Building a Book Recommender" – James Briggs
https://www.youtube.com/watch?v=1qHStu_pTv8
"""