|
|
|
|
|
import os |
|
import re |
|
import numpy as np |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, silhouette_score

from sklearn.cluster import KMeans, DBSCAN

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import NMF
|
|
|
from Levenshtein import distance as levenshtein_distance |
|
from scipy.sparse import csr_matrix, coo_matrix |
|
import kagglehub |
|
|
|
"""## **Amazon Apparels Data dataset** |
|
|
|
The Amazon Apparels Data dataset on Kaggle contains product information scraped from Amazon, focusing on clothing items. It includes two CSV files: Amazon-clothing-info.csv, which holds product metadata such as brand, color, product type, price, and image URLs; and Clothing-Reviews.csv, which contains customer review data including rating scores, summaries, and review text. The metadata file enables analysis of product attributes, while the reviews file supports sentiment analysis and product quality assessment. This dataset is useful for building recommendation systems, performing style/popularity analysis, and exploring brand-level trends. Together, the two files connect categorical product features with real-world user feedback.
|
|
|
## Data Preparation
|
""" |
|
|
|
|
|
path = kagglehub.dataset_download("thekenjin/amazonapparelsdata") |
|
|
|
|
|
csv_file = os.path.join(path, "Clothing-Reviews.csv") |
|
|
|
|
|
if os.path.exists(csv_file): |
|
reviews = pd.read_csv(csv_file) |
|
print("Dataset loaded successfully!") |
|
print(reviews.head()) |
|
else: |
|
print("Clothing-Reviews.csv not found in the downloaded dataset.") |
|
|
|
|
|
|
|
|
|
|
csv_file = os.path.join(path, "Amazon-clothing-info.csv") |
|
|
|
|
|
if os.path.exists(csv_file): |
|
metadata = pd.read_csv(csv_file) |
|
print("Dataset loaded successfully!") |
|
print(metadata.head()) |
|
else: |
|
print("Clothing-Reviews.csv not found in the downloaded dataset.") |
|
|
|
"""### Basic Info and Statistics""" |
|
|
|
|
|
print("Metadata Info:") |
|
print(metadata.info()) |
|
print("\nReviews Info:") |
|
print(reviews.info()) |
|
|
|
|
|
print("\nMetadata Statistics:") |
|
print(metadata.describe()) |
|
print("\nReviews Statistics:") |
|
print(reviews.describe()) |
|
|
|
"""### Missing Values Analysis""" |
|
|
|
|
|
print("\nMetadata Missing Values (%):") |
|
print(metadata.isnull().mean() * 100) |
|
print("\nReviews Missing Values (%):") |
|
print(reviews.isnull().mean() * 100) |
|
|
|
|
|
plt.figure(figsize=(8, 5)) |
|
sns.countplot(x='review_score', data=reviews) |
|
plt.title('Distribution of Review Scores') |
|
plt.xlabel('Rating (1-5)') |
|
plt.ylabel('Count') |
|
plt.show() |
|
|
|
|
|
rating_dist = reviews['review_score'].value_counts(normalize=True) * 100 |
|
print("\nReview Score Distribution (%):") |
|
print(rating_dist) |
|
|
|
|
|
top_colors = metadata['color'].value_counts().head(10) |
|
plt.figure(figsize=(10, 6)) |
|
top_colors.plot(kind='bar') |
|
plt.title('Top 10 Most Common Colors') |
|
plt.xlabel('Color') |
|
plt.ylabel('Count') |
|
plt.xticks(rotation=45) |
|
plt.show() |
|
|
|
|
|
reviews['review_length'] = reviews['review_text'].str.len() |
|
|
|
|
|
plt.figure(figsize=(10, 6)) |
|
sns.boxplot(x='review_score', y='review_length', data=reviews) |
|
plt.title('Review Length by Rating') |
|
plt.xlabel('Rating') |
|
plt.ylabel('Review Length (characters)') |
|
plt.show() |
|
|
|
"""### **Metadata Summary (28,395 products)** |
|
- **Key Columns**: ASIN (unique), product type, price, brand, color, availability, title, images (URLs), reviews (boolean/URL) |
|
- **Notable Observations**: |
|
- **High Sparsity**: Many columns have significant missing values (e.g., `sku` has only 134 values, `author` has 1). |
|
- **Product Types**: 57 unique types, with "SHIRT" being the most frequent (21,513 entries). |
|
- **Pricing**: Most common price is `$19.99` (945 entries). |
|
- **Brands**: 3,640 unique brands; "TOOGOO(R)" is the most frequent (177 entries). |
|
- **Colors**: "Black" is the top color (5,181 entries). |
|
- **Availability**: Most items ship in "1-2 business days" (12,252 entries). |
|
- **Images**: 19,865 unique image URLs, with some duplicates (e.g., one URL appears 21 times). |
|
|
|
### **Reviews Summary (50,046 reviews)** |
|
- **Key Columns**: ASIN (linked to metadata), review score (1–5), title, user ID, summary, text.
|
- **Statistics**: |
|
- **Scores**: Highly positive (mean = 4.12, median = 5). 75% of reviews are 5-star. |
|
- **Missing Data**: `review_summary` has 2,892 nulls; other columns are complete. |
|
  - **Volume**: Reviews link to products via ASIN; with 28,395 unique products and 50,046 reviews, many products carry multiple reviews.
|
|
|
### **Notes**: |
|
1. **Data Quality Issues**: |
|
- Metadata has sparse columns (`sku`, `author`, `publisher`, `editorial_review`). |
|
- Possible typo in `editorial_reivew` (misspelled column name). |
|
2. **Product Focus**: |
|
- Dominated by apparel (e.g., "SHIRT"), with Black as the top color. |
|
3. **Reviews**: |
|
- Skewed toward high ratings (potential bias in sentiment analysis). |
|
- Some products likely have many reviews (ASINs reused in reviews dataset). |
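"""

"""As a quick check of the ASIN linkage noted above, a minimal sketch comparing unique ASIN counts and their overlap in the two frames loaded earlier:"""

# How many distinct products appear in each file, and how many are shared.
print("Unique ASINs in metadata:", metadata['asin'].nunique())
print("Unique ASINs in reviews: ", reviews['asin'].nunique())
print("ASINs present in both:", len(set(metadata['asin']) & set(reviews['asin'])))

"""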
|
|
|
### Data cleaning |
|
""" |
|
|
|
def clean_metadata(metadata): |
|
|
|
df = metadata.copy() |
|
|
|
|
|
df['color'] = df['color'].fillna('Unknown') |
|
df['brand'] = df['brand'].fillna('Unbranded') |
|
df['sku'] = df['sku'].fillna('Missing_SKU') |
|
|
|
|
|
def safe_price_convert(price_str): |
|
if pd.isna(price_str): |
|
return np.nan |
|
try: |
|
|
|
if isinstance(price_str, str): |
|
if any(phrase in price_str.lower() for phrase in ['too low', 'na', 'not available']): |
|
return np.nan |
|
|
|
price_match = re.search(r'\$\d+\.?\d*', price_str) |
|
if price_match: |
|
return float(price_match.group(0).replace('$', '')) |
|
return float(price_str) |
|
except (ValueError, TypeError): |
|
return np.nan |
|
|
|
df['price'] = df['formatted_price'].apply(safe_price_convert) |
|
|
|
|
|
df['price'] = df.groupby('product_type_name')['price'].transform( |
|
lambda x: x.fillna(x.median())) |
|
|
|
df['price'] = df['price'].fillna(df['price'].median()) |
|
|
|
|
|
def extract_shipping_info(avail_str): |
|
if pd.isna(avail_str): |
|
return 7, False |
|
avail_str = str(avail_str).lower() |
|
if 'now' in avail_str: |
|
return 0, True |
|
numbers = re.findall(r'\d+', avail_str) |
|
days = int(numbers[0]) if numbers else 7 |
|
in_stock = 'out of stock' not in avail_str |
|
return days, in_stock |
|
|
|
shipping_results = df['availability'].apply(extract_shipping_info) |
|
df['shipping_days'] = shipping_results.apply(lambda x: x[0]) |
|
df['in_stock'] = shipping_results.apply(lambda x: x[1]) |
|
|
|
|
|
for img_col in ['large_image_url', 'medium_image_url', 'small_image_url']: |
|
if img_col in df.columns: |
|
df[img_col] = df[img_col].str.split(',').str[0] |
|
|
|
|
|
cols_to_drop = ['formatted_price', 'availability', 'availability_type', |
|
'editorial_review', 'editorial_reivew', 'publisher', 'author', |
|
'reviews'] |
|
df = df.drop([col for col in cols_to_drop if col in df.columns], axis=1) |
|
|
|
|
|
print("Remaining missing values after cleaning:") |
|
print(df.isna().sum()) |
|
|
|
return df |
|
|
|
cleaned_metadata = clean_metadata(metadata)

os.makedirs("assets", exist_ok=True)
|
|
|
|
|
cleaned_metadata.to_csv("assets/cleaned_metadata.csv", index=False) |
|
|
|
|
|
def clean_reviews(reviews): |
|
df = reviews.copy() |
|
|
|
|
|
required_cols = ['asin', 'review_userId', 'review_score', 'review_text'] |
|
for col in required_cols: |
|
if col not in df.columns: |
|
raise ValueError(f"Required column {col} missing from reviews data") |
|
|
|
|
|
df['review_score'] = pd.to_numeric(df['review_score'], errors='coerce') |
|
df = df[df['review_score'].between(1, 5, inclusive='both')] |
|
|
|
|
|
df['review_text'] = df['review_text'].fillna('') |
|
df['review_summary'] = df['review_summary'].fillna('') |
|
|
|
|
|
df['review_length'] = df['review_text'].str.len() |
|
df['has_summary'] = df['review_summary'].str.len() > 0 |
|
|
|
|
|
if 'review_date' in df.columns: |
|
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce') |
|
|
|
|
|
df['review_userId'] = df['review_userId'].fillna('Anonymous') |
|
df['review_userId'] = df['review_userId'].str.strip() |
|
|
|
|
|
df = df.dropna(axis=1, how='all') |
|
|
|
print("\nReviews missing values after cleaning:") |
|
print(df.isna().sum()) |
|
|
|
return df |
|
|
|
cleaned_reviews = clean_reviews(reviews) |
|
|
|
"""## Study 1 β Similarity measures""" |
|
|
|
|
|
products = pd.merge( |
|
cleaned_metadata, |
|
cleaned_reviews.groupby('asin').agg({ |
|
'review_score': 'mean', |
|
'review_text': lambda x: ' '.join(x) |
|
}).reset_index(), |
|
on='asin', |
|
how='left' |
|
) |
|
|
|
|
|
sample_products = products.dropna().sample(100, random_state=42).copy() |
|
sample_products |
|
|
|
"""### Brand Similarity (Jaccard) |
|
|
|
The algorithm uses Jaccard similarity to measure brand similarity by comparing word overlap between brands. It ranks products based on how many brand words match relative to the total unique words. |
|
""" |
|
|
|
def jaccard_similarity(query_brand, comparison_set): |
|
query_set = set(query_brand.split()) |
|
similarity_scores = [] |
|
for brand in comparison_set: |
|
if pd.isna(brand): |
|
similarity_scores.append(0) |
|
continue |
|
comp_set = set(str(brand).split()) |
|
intersection = query_set.intersection(comp_set) |
|
union = query_set.union(comp_set) |
|
similarity_scores.append(len(intersection)/len(union) if union else 0) |
|
return similarity_scores |
|
|
|
def similar_brands(query_product_id, top_n=10): |
|
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0] |
|
query_brand = query_product['brand'] |
|
|
|
sample_products['brand_similarity'] = jaccard_similarity( |
|
query_brand, |
|
sample_products['brand'] |
|
) |
|
|
|
return sample_products.sort_values('brand_similarity', ascending=False)[[ |
|
'asin', 'title', 'brand', 'price', 'review_score', 'brand_similarity' |
|
]].head(top_n) |
|
|
|
"""### Price Similarity (Euclidean) |
|
|
|
This algorithm calculates price similarity using inverse absolute difference, giving higher scores to prices closer to the query product's price. It ranks products based on how near their prices are to the target price. |
|
""" |
|
|
|
def price_similarity(query_price, comparison_prices): |
|
return 1 / (1 + np.abs(query_price - comparison_prices)) |
|
|
|
def similar_prices(query_product_id, top_n=10): |
|
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0] |
|
query_price = query_product['price'] |
|
|
|
sample_products['price_similarity'] = price_similarity( |
|
query_price, |
|
sample_products['price'] |
|
) |
|
|
|
return sample_products.sort_values('price_similarity', ascending=False)[[ |
|
'asin', 'title', 'brand', 'price', 'review_score', 'price_similarity' |
|
]].head(top_n) |
|
|
|
"""### Color Similarity (Hamming) |
|
|
|
This algorithm uses normalized Hamming distance to compare color strings, measuring character-level similarity. It ranks products based on how closely their color descriptions match the query product's color. |
|
""" |
|
|
|
def hamming_similarity(query_color, comparison_colors): |
|
query_color = str(query_color).lower() |
|
similarity_scores = [] |
|
for color in comparison_colors: |
|
color = str(color).lower() |
|
max_len = max(len(query_color), len(color)) |
|
if max_len == 0: |
|
similarity_scores.append(0) |
|
continue |
|
distance = sum(c1 != c2 for c1, c2 in zip(query_color.ljust(max_len), color.ljust(max_len))) |
|
similarity_scores.append(1 - distance/max_len) |
|
return similarity_scores |
|
|
|
def similar_colors(query_product_id, top_n=10): |
|
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0] |
|
query_color = query_product['color'] |
|
|
|
sample_products['color_similarity'] = hamming_similarity( |
|
query_color, |
|
sample_products['color'] |
|
) |
|
|
|
return sample_products.sort_values('color_similarity', ascending=False)[[ |
|
'asin', 'title', 'color', 'brand', 'review_score', 'color_similarity' |
|
]].head(top_n) |
|
|
|
"""### Title Similarity (Levenshtein) |
|
|
|
This algorithm uses Levenshtein distance to measure title similarity by counting the character insertions, deletions, or substitutions needed to match strings. It ranks products based on how closely their titles resemble the query product's title, normalized by length.
|
""" |
|
|
|
def title_similarity(query_title, comparison_titles): |
|
query_title = str(query_title).lower() |
|
similarity_scores = [] |
|
for title in comparison_titles: |
|
title = str(title).lower() |
|
max_len = max(len(query_title), len(title)) |
|
if max_len == 0: |
|
similarity_scores.append(0) |
|
continue |
|
distance = levenshtein_distance(query_title, title) |
|
similarity_scores.append(1 - distance/max_len) |
|
return similarity_scores |
|
|
|
def similar_titles(query_product_id, top_n=10): |
|
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0] |
|
query_title = query_product['title'] |
|
|
|
sample_products['title_similarity'] = title_similarity( |
|
query_title, |
|
sample_products['title'] |
|
) |
|
|
|
return sample_products.sort_values('title_similarity', ascending=False)[[ |
|
'asin', 'title', 'brand', 'price', 'review_score', 'title_similarity' |
|
]].head(top_n) |
|
|
|
"""### Review Text Similarity (TF-IDF Cosine) |
|
|
|
This algorithm uses TF-IDF vectorization and cosine similarity to compare review texts, identifying products with semantically similar reviews. It ranks products based on how closely their review content matches the query product's review in terms of key terms and their importance. |
|
""" |
|
|
|
def prepare_tfidf_matrix(texts):
    """Fit a TF-IDF vectorizer and return it together with the document-term matrix."""
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    return tfidf, tfidf.fit_transform(texts.fillna(''))

def review_similarity(query_text, tfidf, tfidf_matrix):
    """Cosine similarity between one query text and every row of a fitted TF-IDF matrix."""
    query_vec = tfidf.transform([query_text])
    return cosine_similarity(query_vec, tfidf_matrix).flatten()
|
|
|
def similar_reviews(query_product_id, top_n=10): |
|
query_product = sample_products[sample_products['asin'] == query_product_id].iloc[0] |
|
query_text = query_product['review_text'] |
|
|
|
|
|
tfidf = TfidfVectorizer(stop_words='english', max_features=1000) |
|
tfidf_matrix = tfidf.fit_transform(sample_products['review_text'].fillna('')) |
|
|
|
|
|
query_vec = tfidf.transform([query_text]) |
|
|
|
|
|
sample_products['review_similarity'] = cosine_similarity( |
|
query_vec, |
|
tfidf_matrix |
|
).flatten() |
|
|
|
return sample_products.sort_values('review_similarity', ascending=False)[[ |
|
'asin', 'title', 'brand', 'review_score', 'review_similarity' |
|
]].head(top_n) |
|
|
|
"""### Simulation Requests""" |
|
|
|
|
|
query_products = sample_products.sample(5, random_state=42) |
|
|
|
print("=== Similarity Study Results ===") |
|
|
|
|
|
product_id = query_products.iloc[0]['asin']
print(f"\n1. Products with similar brands to '{query_products.iloc[0]['title']}':")
print(similar_brands(product_id))
|
|
|
|
|
|
|
product_id = query_products.iloc[1]['asin']
print(f"\n2. Products with similar prices to '{query_products.iloc[1]['title']}':")
print(similar_prices(product_id))
|
|
|
|
|
|
|
product_id = query_products.iloc[2]['asin']
print(f"\n3. Products with similar colors to '{query_products.iloc[2]['title']}':")
print(similar_colors(product_id))
|
|
|
|
|
|
|
product_id = query_products.iloc[3]['asin']
print(f"\n4. Products with similar titles to '{query_products.iloc[3]['title']}':")
print(similar_titles(product_id))
|
|
|
|
|
|
|
product_id = query_products.iloc[4]['asin']
print(f"\n5. Products with similar reviews to '{query_products.iloc[4]['title']}':")
print(similar_reviews(product_id))
|
|
|
|
|
"""#### Analysis of Similarity Study Results |
|
|
|
**1. Brand Similarity** |
|
The Jaccard similarity successfully identified exact brand matches (100% similarity for "Jonathan Corey") while correctly giving 0 similarity to unrelated brands. However, it fails to detect potential partial matches or parent/subsidiary brand relationships that might be helpful. |
|
|
|
**2. Price Similarity** |
|
The inverse distance method effectively clustered products within a small price range (±$1), with similarity scores dropping rapidly beyond that threshold. This works well for exact price matches but could be improved by logarithmic scaling for broader price categories.
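"""

"""The logarithmic scaling suggested above can be sketched as follows (an illustration, not part of the original study): comparing log-prices makes similarity depend on relative rather than absolute differences, so $10 vs $20 is treated roughly like $100 vs $200."""

def log_price_similarity(query_price, comparison_prices):
    # Inverse distance on log1p-transformed prices (log1p avoids log(0)).
    return 1 / (1 + np.abs(np.log1p(query_price) - np.log1p(comparison_prices)))

"""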
|
|
|
**3. Color Similarity** |
|
The Hamming distance approach perfectly matched identical color descriptions ("Light Blue"), but showed limitations with: |
|
- Only 50% similarity for colors containing the same root word ("Night" vs "Night 846") |
|
- Poor handling of conceptually similar but lexically different colors (e.g., "Pink" vs "Light Blue") |
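"""

"""A token-level alternative (an illustration, not part of the study) avoids the character-alignment problem above by comparing color words as sets, so word order and padding no longer affect the score:"""

def color_word_similarity(query_color, comparison_colors):
    # Jaccard overlap on lowercase color words instead of character positions.
    query_set = set(str(query_color).lower().split())
    scores = []
    for color in comparison_colors:
        comp_set = set(str(color).lower().split())
        union = query_set | comp_set
        scores.append(len(query_set & comp_set) / len(union) if union else 0.0)
    return scores

"""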
|
|
|
**4. Title Similarity** |
|
Levenshtein distance achieved perfect matching for identical listings while effectively ranking: |
|
- Partial matches with shared keywords ("blouse", "shoulder") |
|
- Same product in different colors/sizes |
|
- Gradually decreasing scores for more distant product types |
|
|
|
**5. Review Similarity** |
|
The TF-IDF/cosine similarity approach showed: |
|
- Perfect match for identical products |
|
- Moderate similarity for reviews mentioning fabric types ("silk", "pique") |
|
- Low but non-zero scores for unrelated products, suggesting the vector space can capture some latent semantic relationships |
|
|
|
## Study 2 – Clustering algorithms
|
""" |
|
|
|
|
|
products = pd.merge( |
|
cleaned_metadata, |
|
cleaned_reviews.groupby('asin')['review_score'].mean().reset_index(), |
|
on='asin', |
|
how='left' |
|
) |
|
|
|
|
|
products['brand_popularity'] = products.groupby('brand')['brand'].transform('count') |
|
products['color_popularity'] = products.groupby('color')['color'].transform('count') |
|
|
|
|
|
cluster_data = products[['price', 'review_score', 'brand_popularity', 'color_popularity']].dropna() |
|
|
|
|
|
scaler = StandardScaler() |
|
scaled_data = scaler.fit_transform(cluster_data) |
|
|
|
|
|
plt.figure(figsize=(15, 5)) |
|
|
|
|
|
plt.subplot(1, 2, 1) |
|
kmeans3 = KMeans(n_clusters=3, random_state=42) |
|
clusters = kmeans3.fit_predict(scaled_data[:, [0, 1]]) |
|
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis') |
|
plt.title('KMeans (k=3) - Price vs Review Score') |
|
plt.xlabel('Price') |
|
plt.ylabel('Review Score') |
|
|
|
|
|
plt.subplot(1, 2, 2) |
|
kmeans5 = KMeans(n_clusters=5, random_state=42) |
|
clusters = kmeans5.fit_predict(scaled_data[:, [0, 1]]) |
|
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis') |
|
plt.title('KMeans (k=5) - Price vs Review Score') |
|
plt.xlabel('Price') |
|
|
|
plt.tight_layout() |
|
plt.show() |
|
|
|
""" |
|
|
|
The scatter plots above visualize the results of KMeans clustering applied to a dataset with two features: **Price** (x-axis) and **Review Score** (y-axis). Two clustering configurations are compared: `k=3` and `k=5`. |
|
|
|
|
|
#### 1. **Customer Segmentation Patterns** |
|
- A large concentration of data points exists in the **low-price, high-review** segment. This suggests that a significant portion of items (e.g., products, services) are **affordable and well-rated**, potentially indicating strong value for money. |
|
- Conversely, there are items with **low prices but low review scores**, representing a segment of potentially low-quality, budget offerings. |
|
- High-priced items are relatively rare and scattered across different review scores, indicating **price alone does not guarantee high customer satisfaction**. |
|
|
|
#### 2. **Non-Linear Relationship Between Price and Quality** |
|
- The clusters do not align along a simple trend line (e.g., increasing price with increasing review score). Instead, they show that **review score and price are not strongly correlated**. |
|
- This suggests that **higher prices do not consistently lead to better reviews**, and customers may find high value in lower-priced items. |
|
|
|
#### 3. **Discrete Review Score Distribution** |
|
- The presence of clear horizontal bands in both plots implies that review scores are **categorical or ordinal** rather than continuous. This may reflect the use of a **fixed-scale rating system** (e.g., 1 to 5 stars), which influences how clustering interprets vertical groupings. |
|
|
|
#### 4. **Effect of Varying k** |
|
- With `k=3`, the model captures **general segments**: (a) low price–low rating, (b) low price–high rating, and (c) higher price products.

- With `k=5`, the model reveals **finer-grained distinctions** within those segments. For example, it identifies subgroups within the high review score group that differ by price, and isolates extreme outliers (e.g., high price–low rating).
|
|
|
|
|
#### 5. **Presence of Outliers** |
|
- A small number of points are priced extremely high (e.g., above $400). These do not form large clusters, suggesting they are **niche offerings** or **premium products** that deviate from the rest of the dataset. |
|
- These outliers could impact clustering quality if not treated or analyzed separately. |
|
|
|
<br>
|
|
|
|
|
Overall, the clustering analysis reveals that: |
|
|
|
- **Most offerings are low-priced and highly rated**, indicating a market saturated with strong low-cost options. |
|
- **Price is not a reliable indicator of review score**, so other features (e.g., brand, product category, location) may be important to model user satisfaction. |
|
- The data supports both general segmentation (k=3) and more refined subgroup analysis (k=5), depending on the use case. |
|
|
|
""" |
|
|
|
|
|
plt.figure(figsize=(15, 5)) |
|
|
|
|
|
plt.subplot(1, 2, 1) |
|
dbscan1 = DBSCAN(eps=0.5, min_samples=5) |
|
clusters = dbscan1.fit_predict(scaled_data[:, [0, 1]]) |
|
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis') |
|
plt.title('DBSCAN (eps=0.5, min=5) - Price vs Review Score') |
|
plt.xlabel('Price') |
|
plt.ylabel('Review Score') |
|
|
|
|
|
plt.subplot(1, 2, 2) |
|
dbscan2 = DBSCAN(eps=1.0, min_samples=10) |
|
clusters = dbscan2.fit_predict(scaled_data[:, [0, 1]]) |
|
plt.scatter(cluster_data['price'], cluster_data['review_score'], c=clusters, cmap='viridis') |
|
plt.title('DBSCAN (eps=1.0, min=10) - Price vs Review Score') |
|
plt.xlabel('Price') |
|
|
|
plt.tight_layout() |
|
plt.show() |
|
|
|
""" |
|
|
|
The plots above show clustering results using **DBSCAN (Density-Based Spatial Clustering of Applications with Noise)**. The two plots compare different parameter configurations applied to the price and review score features:
|
|
|
|
|
#### Left Plot: DBSCAN (eps = 0.5, min_samples = 5) |
|
- The majority of the data points are grouped into **a single dense cluster**, primarily composed of **low to mid-price, high-review score** entries. |
|
- A few **small, separate clusters** form among high-priced items with high review scores. |
|
- A significant number of **points are labeled as noise** (purple), especially in **higher price regions** or in **sparse review score segments**. |
|
- This configuration captures local groupings well but **over-labels sparse but valid data as noise**, especially outliers in pricing.
|
|
|
|
|
#### Right Plot: DBSCAN (eps = 1.0, min_samples = 10) |
|
- A **single dominant cluster** is formed that captures almost all data points, regardless of price or review score. |
|
- Very few points are labeled as noise, indicating that the increased radius (`eps`) and required density (`min_samples`) lead to a **broader, less selective clustering**. |
|
- This setting likely **masks meaningful substructure** in the data by treating almost everything as part of one large group. |
|
- The lack of distinct clusters suggests the data may be **globally dense but lack the local density variation** that DBSCAN can exploit.
|
|
|
|
|
|
|
### What These Results Reveal About the Data |
|
|
|
#### 1. **High-Density Core Around Low Price & High Review** |
|
- Both parameter settings highlight a **dense cluster** of items with **low prices and high review scores**, confirming a dominant segment in the data where products are affordable and well-reviewed. |
|
- This suggests that in this market, **value products dominate**, and this group may reflect a competitive or saturated segment. |
|
|
|
#### 2. **Sparse Distribution of High-Price Items** |
|
- High-priced items are **sparsely distributed** across the dataset and are often marked as **outliers** by DBSCAN. |
|
- This implies that **premium offerings are rare and isolated**, possibly catering to niche customer bases. |
|
|
|
#### 3. **Review Scores Are Not Density Drivers** |
|
- Despite the discrete nature of review scores, DBSCAN does not form distinct horizontal clusters along review bands. |
|
- This suggests that **review score alone does not define local density**, and DBSCAN prioritizes **price variations** more heavily in this context. |
|
|
|
<br> |
|
|
|
Overall, the DBSCAN algorithm reveals:
|
|
|
- The dataset has one dominant group of **low-cost, high-review** items. |
|
- High-priced items are generally **isolated and not naturally clusterable**, as shown by their frequent classification as outliers. |
|
- DBSCAN is useful here for **identifying core clusters and outliers**, but it struggles to find multiple natural groupings unless thereβs strong local density variation. |
|
- **Alternative clustering algorithms (e.g., KMeans or hierarchical clustering)** may be more appropriate when the goal is to **partition the dataset into interpretable segments**. |
|
|
|
""" |
|
|
|
|
|
plt.figure(figsize=(15, 5)) |
|
|
|
|
|
plt.subplot(1, 2, 1) |
|
kmeans4 = KMeans(n_clusters=4, random_state=42) |
|
clusters = kmeans4.fit_predict(scaled_data[:, [2, 3]]) |
|
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma') |
|
plt.title('KMeans (k=4) - Brand vs Color Popularity') |
|
plt.xlabel('Brand Popularity') |
|
plt.ylabel('Color Popularity') |
|
|
|
|
|
plt.subplot(1, 2, 2) |
|
kmeans6 = KMeans(n_clusters=6, random_state=42) |
|
clusters = kmeans6.fit_predict(scaled_data[:, [2, 3]]) |
|
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma') |
|
plt.title('KMeans (k=6) - Brand vs Color Popularity') |
|
plt.xlabel('Brand Popularity') |
|
|
|
plt.tight_layout() |
|
plt.show() |
|
|
|
""" |
|
|
|
The scatter plots above visualize the results of KMeans clustering applied to a dataset with two features: **Brand Popularity** (x-axis) and **Color Popularity** (y-axis). Two clustering configurations are compared: `k=4` and `k=6`. |
|
|
|
|
|
#### General Observations: |
|
- Data points are concentrated in **three distinct horizontal bands** on the y-axis (Color Popularity): near 0, ~3000, and ~5000. |
|
- This suggests that color popularity has **three major popularity levels**, potentially indicating fixed tiers (e.g., low, medium, high demand for certain color groups). |
|
- Brand popularity is more **evenly distributed** and continuous, with values ranging from 0 to above 120. |
|
|
|
|
|
|
|
#### KMeans with k = 4: |
|
- The model identifies clusters based primarily on **Brand Popularity** within each Color Popularity tier. |
|
- In the lowest Color Popularity tier (y ≈ 0–1000), three clusters are visible based on varying levels of Brand Popularity.

- High Color Popularity bands (y ≈ 3000 and 5000) are grouped more broadly, suggesting less differentiation in Brand Popularity at those levels.
|
- This implies that **color popularity may dominate** the cluster formation, especially in higher tiers. |
|
|
|
|
|
#### KMeans with k = 6: |
|
- Increasing k introduces **finer distinctions** within the bands: |
|
- The low Color Popularity band is now split into multiple clusters across Brand Popularity. |
|
- The mid and high Color Popularity bands are also divided, allowing segmentation based on brand performance even among highly popular colors. |
|
- This configuration reveals more **detailed market segmentation**, showing that even within popular color categories, **brand popularity can vary significantly**. |
|
|
|
|
|
|
|
### What These Results Reveal About the Data |
|
|
|
#### 1. **Color Popularity Exhibits Discrete Behavior** |
|
- The presence of horizontal bands strongly suggests that Color Popularity is either **categorical** or derived from fixed-count groupings. |
|
- This could mean colors are grouped by tiers (e.g., top 10 most popular, moderate interest, niche colors). |
|
|
|
#### 2. **Brand Popularity Drives Subgroup Variation** |
|
- Within each color tier, Brand Popularity introduces meaningful variation. |
|
- Clusters along the x-axis (Brand Popularity) reflect **differentiation between strong, mid-tier, and weak brands** within each color group. |
|
|
|
#### 3. **Increasing k Uncovers Latent Substructure** |
|
- The shift from 4 to 6 clusters shows that the dataset supports **finer segmentation**, particularly helpful for identifying micro-markets or personalized targeting. |
|
- This may be useful in **marketing or inventory strategies**, such as deciding which color-brand combinations to promote or produce more of. |
|
|
|
#### 4. **Low Color Popularity Shows Greatest Brand Spread** |
|
- The widest spread of Brand Popularity is found in the lowest color tier (bottom band), indicating that **brands try to differentiate themselves in less popular color spaces**. |
|
- This could suggest **brand experimentation or niche targeting** in low-demand color segments. |
|
|
|
<br>
|
|
|
|
|
Overall, the clustering analysis reveals that: |
|
|
|
- **Color Popularity appears categorical or tiered**, heavily influencing the overall clustering structure. |
|
- **Brand Popularity introduces meaningful variation within each color tier**, especially in the low and mid segments. |
|
- Using a higher value of `k` (e.g., 6) captures more detailed behavior patterns, allowing **richer segmentation** for strategic decision-making in areas like marketing, design, or inventory management. |
|
|
|
""" |
|
|
|
|
|
plt.figure(figsize=(15, 5)) |
|
|
|
|
|
plt.subplot(1, 2, 1) |
|
dbscan3 = DBSCAN(eps=0.3, min_samples=5) |
|
clusters = dbscan3.fit_predict(scaled_data[:, [2, 3]]) |
|
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma') |
|
plt.title('DBSCAN (eps=0.3, min=5) - Brand vs Color Popularity') |
|
plt.xlabel('Brand Popularity') |
|
plt.ylabel('Color Popularity') |
|
|
|
|
|
plt.subplot(1, 2, 2) |
|
dbscan4 = DBSCAN(eps=0.5, min_samples=10) |
|
clusters = dbscan4.fit_predict(scaled_data[:, [2, 3]]) |
|
plt.scatter(cluster_data['brand_popularity'], cluster_data['color_popularity'], c=clusters, cmap='plasma') |
|
plt.title('DBSCAN (eps=0.5, min=10) - Brand vs Color Popularity') |
|
plt.xlabel('Brand Popularity') |
|
|
|
plt.tight_layout() |
|
plt.show() |
|
|
|
"""The above scatter plots display clustering results using the DBSCAN algorithm on features **Brand Popularity** (x-axis) and **Color Popularity** (y-axis). Two parameter configurations are compared to understand how density-based clustering interacts with the structure of the data. |
|
|
|
|
|
#### Left Plot: DBSCAN (eps = 0.3, min_samples = 5) |
|
- **Three main clusters** are detected, corresponding almost exactly to three distinct horizontal bands in color popularity (~0–1000, ~3000, ~5000).
|
- DBSCAN is able to **clearly separate color popularity tiers** due to their high intra-group density and large inter-group spacing. |
|
- The clustering shows minimal segmentation based on brand popularity within each band. |
|
- This suggests that, under tighter density constraints, **Color Popularity dominates** as the primary clustering feature. |
|
|
|
|
|
#### Right Plot: DBSCAN (eps = 0.5, min_samples = 10) |
|
- The broader radius (`eps = 0.5`) and higher density requirement (`min_samples = 10`) result in **three broader clusters**, once again aligned with the three tiers of color popularity. |
|
- More points are included in clusters (fewer outliers), but **brand popularity still plays a minimal role** in segmentation. |
|
- The result reinforces the observation that the data's natural clustering structure is **horizontally stratified** (based on color tiers), rather than vertically or diagonally segmented by brand.
|
|
|
|
|
|
|
### What These Results Reveal About the Data |
|
|
|
#### 1. **Color Popularity Has Strong Tier-Based Structure** |
|
- DBSCAN easily isolates the three dominant horizontal bands, confirming that Color Popularity follows a **discrete or tiered structure**, likely corresponding to fixed popularity levels or thresholds. |
|
- These bands are consistent across parameter variations, indicating **high local density** within each tier. |
|
|
|
#### 2. **Brand Popularity is Continuously Distributed** |
|
- Unlike color popularity, brand popularity shows **no natural density-based groupings** and appears more uniformly spread. |
|
- DBSCAN is unable to meaningfully cluster points based on brand popularity alone, suggesting brand popularity behaves more like a **continuous or evenly distributed variable**. |
|
|
|
#### 3. **Clusters are Robust to Parameter Change** |
|
- The overall structure of the clusters is **stable across different DBSCAN configurations**, suggesting that the data has a **strong, intrinsic density-based structure**. |
|
- DBSCAN naturally ignores noise and outliers without forcing cluster assignments, making it well-suited for separating out **distinct, dense subgroups** like the color tiers. |
|
|
|
#### 4. **KMeans vs DBSCAN: Key Differences** |
|
- Compared to KMeans, which segmented along both axes, DBSCAN focuses on **density within localized areas**, revealing that only **Color Popularity exhibits true density-based clustering**. |
|
- KMeans may be more useful for **market segmentation** involving continuous brand dynamics, while DBSCAN is better for **identifying fixed or bounded clusters**, such as popularity tiers. |
|
|
|
<br> |
|
|
|
Overall, the DBSCAN algorithm reveals:
|
|
|
- The data reveals **three strong, dense groupings in Color Popularity**, which DBSCAN captures effectively. |
|
- **Brand Popularity lacks density-based clustering structure**, and is better interpreted as a continuous variable. |
|
- DBSCAN is effective in detecting **tiered categorical-like patterns**, but less useful when segmentation depends on evenly distributed features like brand popularity. |
|
|
|
## Study 3 – Content-Based Recommendation System
|
""" |
|
|
|
|
|
print("Available columns in products DataFrame:") |
|
print(products.columns.tolist()) |
|
|
|
|
|
if 'brand_popularity' not in products.columns: |
|
products['brand_popularity'] = products.groupby('brand')['brand'].transform('count') |
|
if 'color_popularity' not in products.columns: |
|
products['color_popularity'] = products.groupby('color')['color'].transform('count') |
|
|
|
|
|
products['price'] = pd.to_numeric(products['price'], errors='coerce') |
|
products['review_score'] = pd.to_numeric(products['review_score'], errors='coerce') |
|
products = products.dropna(subset=['price', 'review_score']) |
|
|
|
"""This hybrid recommendation algorithm combines price proximity, review score similarity, and brand similarity into a weighted score to suggest products. It uses inverse distance for price/review comparisons and Jaccard similarity for brand name matching. The final recommendations balance affordability (50% weight), quality (30%), and brand relevance (20%) while excluding the query product itself. |
|
|
|
|
|
""" |
|
|
|
def price_quality_recommendations(query_asin, df=products, top_n=10): |
|
"""Recommend products based on price, review score, and brand similarity""" |
|
query = df[df['asin'] == query_asin].iloc[0] |
|
|
|
|
|
df['price_sim'] = 1 / (1 + np.abs(df['price'] - query['price'])) |
|
df['review_sim'] = 1 / (1 + np.abs(df['review_score'] - query['review_score'])) |
|
|
|
|
|
query_brand_words = set(str(query['brand']).lower().split()) |
|
df['brand_sim'] = df['brand'].apply( |
|
lambda x: len(query_brand_words.intersection(set(str(x).lower().split()))) / |
|
len(query_brand_words.union(set(str(x).lower().split()))) if pd.notna(x) else 0 |
|
) |
|
|
|
|
|
df['pq_score'] = 0.5*df['price_sim'] + 0.3*df['review_sim'] + 0.2*df['brand_sim'] |
|
|
|
|
|
return df[df['asin'] != query_asin].sort_values('pq_score', ascending=False)[[ |
|
'asin', 'title', 'brand', 'price', 'review_score', 'pq_score' |
|
]].head(top_n) |
|
|
|
"""This algorithm recommends stylistically similar products by combining title similarity (40% weight using Levenshtein distance), color matching (30% exact match), and brand popularity alignment (30% inverse distance). It identifies items with comparable descriptions, matching colors, and similarly prestigious brands while excluding the query product itself.""" |
|
|
|
def style_popularity_recommendations(query_asin, df=products, top_n=10): |
|
"""Recommend products based on title similarity, color, and brand popularity""" |
|
query = df[df['asin'] == query_asin].iloc[0] |
|
|
|
|
|
|
|
|
query_title = str(query['title']).lower() |
|
df['title_sim'] = df['title'].apply( |
|
lambda x: 1 - (levenshtein_distance(query_title, str(x).lower()) / |
|
max(len(query_title), len(str(x)), 1)) |
|
) |
|
|
|
|
|
query_color = str(query['color']).lower() |
|
df['color_sim'] = (df['color'].str.lower() == query_color).astype(float) |
|
|
|
|
|
df['brand_pop_sim'] = 1 / (1 + np.abs(df['brand_popularity'] - query['brand_popularity'])) |
|
|
|
|
|
df['sp_score'] = 0.4*df['title_sim'] + 0.3*df['color_sim'] + 0.3*df['brand_pop_sim'] |
|
|
|
|
|
return df[df['asin'] != query_asin].sort_values('sp_score', ascending=False)[[ |
|
'asin', 'title', 'color', 'brand', 'brand_popularity', 'sp_score' |
|
]].head(top_n) |
|
|
|
|
|
|
|
|
|
|
|
|
sample_products = products.dropna(subset=['title', 'brand', 'color']).sample(3, random_state=42) |
|
|
|
for i, (_, row) in enumerate(sample_products.iterrows(), 1): |
|
print(f"\n=== Query {i} ===") |
|
print(f"Product: {row['title']}") |
|
print(f"Brand: {row['brand']}, Color: {row['color']}") |
|
print(f"Price: ${row['price']:.2f}, Rating: {row['review_score']:.1f}") |
|
|
|
print("\nPrice & Quality Recommendations:") |
|
|
|
|
|
print("\nStyle & Popularity Recommendations:") |
|
|
|
|
|
"""### Analysis of Results |
|
|
|
#### 1. Query 1 β *Belle By Badgley Mischka Top ($87.99, Rating: 2.0)* |
|
|
|
**Price & Quality Recommendations:** |
|
- Mostly same-brand items or similar premium brands (e.g., Bailey 44, Josie Natori) |
|
- Closely matched prices ($87–$88 range)

- Review scores varied significantly (1.0–5.0), despite the low query rating
|
- Top match was same brand and price but with a better rating (4.0) |
|
|
|
**Style & Popularity Recommendations:** |
|
- Focused on the same brand and other high-end designers |
|
- Similar color palette (e.g., black, coral) |
|
- All from brands with popularity score **9** (highest tier) |
|
- Strong brand consistency but limited style diversity |
|
|
|
|
|
|
|
#### 2. Query 2 β *G.H. Bass Striped Tee ($26.96, Rating: 4.3)* |
|
|
|
**Price & Quality Recommendations:** |
|
- Excellent price matching (Β±$0.25 in top results) |
|
- High and consistent review scores (4.25β4.52) |
|
- Diverse range of styles (e.g., lace, baby doll, athletic) |
|
- Included both casual and dressy options |
|
|
|
**Style & Popularity Recommendations:** |
|
- Greater brand diversity compared to Query 1 |
|
- Focused on similar striped patterns |
|
- All from **mid-popularity brands** (score 3) |
|
- Color matching was less precise than with premium brand queries |
|
|
|
|
|
|
|
#### 3. Query 3 β *JSDY Bohemian Blouse ($15.88, Rating: 5.0)* |
|
|
|
**Price & Quality Recommendations:** |
|
- Near-perfect price matching (Β±$0.11 in top result) |
|
- All recommended items had perfect 5.0 ratings |
|
- Strong style consistency (bohemian aesthetics) |
|
- Included floral and chiffon designs |
|
|
|
**Style & Popularity Recommendations:** |
|
- Excellent color matching (all white tops) |
|
- Featured batwing/flowy sleeves similar to the query |
|
- All from **lower-popularity brands** (score 3) |
|
- More niche/indie labels than in other queries |
|
|
|
|
|
|
|
**Notes** |
|
|
|
1. **Price matching** is most effective at **lower price points** |
|
2. Queries with **high-end brands** produce **brand-loyal recommendations** |
|
3. Items with **perfect review scores** receive similarly rated suggestions |
|
4. **Brand popularity scores** establish **clear tier separation** |
|
5. **Pandas warnings** indicate opportunities for **code optimization** |
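"""

"""On point 5, one easy optimization (a sketch, not the original code): pass each recommender an explicit copy so its scoring columns are not written back into the shared `products` frame:"""

def with_local_scores(recommender, query_asin, top_n=10):
    # Working on a copy keeps columns like 'pq_score' local to this call,
    # avoiding the pandas warnings noted above.
    return recommender(query_asin, df=products.copy(), top_n=top_n)

"""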
|
|
|
## Study 4 – Collaborative Filtering Recommendation System
|
""" |
|
|
|
|
|
print("Creating user-item matrix...") |
|
user_item_matrix = cleaned_reviews.pivot_table( |
|
index='review_userId', |
|
columns='asin', |
|
values='review_score', |
|
fill_value=0 |
|
) |
|
|
|
|
|
R = csr_matrix(user_item_matrix.values) |
|
user_ids = user_item_matrix.index.values |
|
item_ids = user_item_matrix.columns.values |
|
|
|
|
|
print("Splitting data into train and test sets...") |
|
train_reviews, test_reviews = train_test_split( |
|
cleaned_reviews, |
|
test_size=0.1, |
|
random_state=42 |
|
) |
|
|
|
|
|
def create_sparse_matrix(df, user_ids, item_ids): |
|
"""Create sparse matrix from DataFrame""" |
|
user_mapping = {u: i for i, u in enumerate(user_ids)} |
|
item_mapping = {m: i for i, m in enumerate(item_ids)} |
|
|
|
rows = df['review_userId'].map(user_mapping) |
|
cols = df['asin'].map(item_mapping) |
|
values = df['review_score'] |
|
|
|
return coo_matrix((values, (rows, cols)), shape=(len(user_ids), len(item_ids))) |
|
|
|
R_train = create_sparse_matrix(train_reviews, user_ids, item_ids) |
|
R_test = create_sparse_matrix(test_reviews, user_ids, item_ids) |
|
|
|
|
|
R_train = R_train.tocsr() |
|
R_test = R_test.tocsr() |
|
|
|
|
|
print("Performing matrix factorization...") |
|
n_factors = 15 |
|
model = NMF( |
|
n_components=n_factors, |
|
init='random', |
|
random_state=42, |
|
max_iter=500 |
|
) |
|
P = model.fit_transform(R_train) |
|
Q = model.components_ |
|
|
|
|
|
def evaluate_predictions(P, Q, test_matrix): |
|
"""Calculate MSE between predicted and actual ratings""" |
|
test_coo = test_matrix.tocoo() |
|
predicted = np.array([P[row].dot(Q[:, col]) |
|
for row, col in zip(test_coo.row, test_coo.col)]) |
|
actual = test_coo.data |
|
return mean_squared_error(actual, predicted) |
|
|
|
|
|
print("Evaluating recommendations...") |
|
test_coo = R_test.tocoo() |
|
test_predicted = np.array([P[row].dot(Q[:, col]) |
|
for row, col in zip(test_coo.row, test_coo.col)]) |
|
test_actual = test_coo.data |
|
|
|
mse = evaluate_predictions(P, Q, R_test) |
|
print(f"MSE: {mse:.4f}") |
|
|
|
"""This function generates personalized product recommendations for a given user based on matrix factorization (using matrices P and Q). It predicts ratings for unrated items by the user, filters out already rated products, and returns the top-k recommendations with the highest predicted ratings. The output includes product details like title, brand, and price, sorted by predicted rating in descending order. If the user ID isn't found, it returns None with an error message. |
|
|
|
|
|
""" |
|
|
|
|
|
def get_recommendations(user_id, P, Q, user_item_matrix, product_data, k=10): |
|
"""Generate top-k recommendations for a user""" |
|
if user_id not in user_ids: |
|
print(f"User {user_id} not found in dataset") |
|
return None |
|
|
|
user_idx = np.where(user_ids == user_id)[0][0] |
|
user_ratings = user_item_matrix.values[user_idx] |
|
|
|
|
|
predicted_ratings = P[user_idx].dot(Q) |
|
|
|
|
|
unrated_mask = user_ratings == 0 |
|
predicted_unrated = predicted_ratings[unrated_mask] |
|
unrated_items = item_ids[unrated_mask] |
|
|
|
|
|
top_indices = np.argsort(predicted_unrated)[-k:][::-1] |
|
recommendations = pd.DataFrame({ |
|
'asin': unrated_items[top_indices], |
|
'predicted_rating': predicted_unrated[top_indices] |
|
}).merge(product_data[['asin', 'title', 'brand', 'price']], on='asin') |
|
|
|
return recommendations.sort_values('predicted_rating', ascending=False) |
|
|
|
|
|
sample_users = cleaned_reviews['review_userId'].drop_duplicates().sample(3, random_state=42) |
|
|
|
for user_id in sample_users: |
|
print(f"\n=== Recommendations for User {user_id} ===") |
|
|
|
|
|
user_ratings = cleaned_reviews[cleaned_reviews['review_userId'] == user_id] |
|
top_rated = user_ratings.sort_values('review_score', ascending=False).head(3) |
|
print("\nUser's top rated items:") |
|
|
|
|
|
|
|
|
|
recs = get_recommendations(user_id, P, Q, user_item_matrix, cleaned_metadata) |
|
print("\nTop recommended items:") |
|
|
|
|
|
"""#### Analysis of Result |
|
|
|
1. User AJU5WLLEHPZ3X |
|
- Profile: Shows preference for affordable women's graphic tees ($5-$10 range) |
|
- Top rated items: Special Olympics and My Chemical Romance themed shirts |
|
- Recommendations: |
|
* Mostly fashion items priced $12-$40 |
|
* Includes sportswear (Under Armour tanks) |
|
* Some boho-style items (floral caftan, kimono cover-up) |
|
* Predicted ratings show strong confidence (1.4-2.9 range) |
|
* Price points higher than user's purchase history |
|
|
|
2. User A35TTLP3VIFEOB |
|
- Profile: Purchased budget-friendly women's tops and band merch |
|
- Top rated items: Color-block striped top and concert poster shirt |
|
- Recommendations: |
|
* Mix of fashion styles (lace, sheer, striped tops) |
|
* Includes both affordable ($12) and premium ($90) items |
|
* One clearly inappropriate recommendation (adult toy) |
|
* Predicted ratings show moderate confidence (1.0-2.1 range) |
|
* Some brand consistency (Under Armour appears in both history and recs) |
|
|
|
3. User A1P9JI4TR933JX |
|
- Profile: Only one rated item (velvet hi-low top) |
|
   - Top rated item: $9.49 fashion top
|
- Recommendations: |
|
* All predictions show zero confidence (0.0 ratings) |
|
* Wide variety of styles with no clear pattern |
|
* Price points vary dramatically ($8-$59) |
|
* Includes some premium brands (Jessica Simpson) |
|
* Demonstrates classic cold-start problem |
|
|
|
System Performance Analysis: |
|
- Works best for users with multiple purchases |
|
- Shows some ability to match styles (graphic tees → sportswear)
|
- Struggles with: |
|
* Price point matching |
|
* Niche preferences |
|
* Users with limited history |
|
* Filtering inappropriate content |
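"""

"""A common mitigation for the cold-start case noted above is a popularity fallback; a minimal sketch (assumption: below a minimum review count, fall back to the most-reviewed, best-rated items):"""

def recommend_with_fallback(user_id, min_history=2, k=10):
    # Users with enough history get the factorization-based recommendations.
    if (cleaned_reviews['review_userId'] == user_id).sum() >= min_history:
        return get_recommendations(user_id, P, Q, user_item_matrix, cleaned_metadata, k=k)
    # Cold start: most-reviewed items, ties broken by mean review score.
    popular = (cleaned_reviews.groupby('asin')['review_score']
               .agg(['count', 'mean'])
               .sort_values(['count', 'mean'], ascending=False)
               .head(k)
               .reset_index())
    return popular.merge(cleaned_metadata[['asin', 'title', 'brand', 'price']], on='asin')

"""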
|
|
|
## *References* |
|
|
|
Collaborative Filtering in Python" β Krish Naik |
|
https://www.youtube.com/watch?v=3ja4C3BWeEg |
|
|
|
</br> |
|
|
|
"Matrix Factorization for Recommendations" β Aladdin Persson |
|
https://www.youtube.com/watch?v=ZspR5PZemcs |
|
|
|
</br> |
|
|
|
|
|
"Content-Based RecSys with TF-IDF & Cosine Similarity" β Data Science Dojo |
|
https://www.youtube.com/watch?v=XoTwndOgXBM |
|
|
|
</br> |
|
|
|
"Building a Book Recommender" β James Briggs |
|
https://www.youtube.com/watch?v=1qHStu_pTv8 |
|
""" |