# Hugging Face Spaces page residue (status: Running) -- not part of the program.
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN | |
from sklearn.preprocessing import RobustScaler | |
from sklearn.metrics.pairwise import euclidean_distances | |
# Read the dataset and discard the customer identifier column, which is not
# one of the clustering features.
data = pd.read_csv("scaled_dataset.csv").drop(columns="CUST_ID")

# Columns fed to the clustering models. The UI builds one numeric input per
# entry, in this order, so the order here is the order of the model features.
selected_features = [
    "BALANCE",
    "BALANCE_FREQUENCY",
    "PURCHASES",
    "ONEOFF_PURCHASES",
    "INSTALLMENTS_PURCHASES",
    "CASH_ADVANCE",
    "PURCHASES_FREQUENCY",
    "ONEOFF_PURCHASES_FREQUENCY",
    "PURCHASES_INSTALLMENTS_FREQUENCY",
    "CASH_ADVANCE_FREQUENCY",
    "CASH_ADVANCE_TRX",
    "PURCHASES_TRX",
    "CREDIT_LIMIT",
    "PAYMENTS",
    "MINIMUM_PAYMENTS",
    "PRC_FULL_PAYMENT",
    "TENURE",
]
X = data[selected_features].to_numpy()

# Fit the scaler once on the full dataset; the same fitted scaler is reused to
# transform user-supplied feature values at prediction time.
scaler = RobustScaler().fit(X)
X_scaled = scaler.transform(X)
# Fit the two-cluster KMeans model once at import time, then decide which of
# its two labels is the "high spend" one by comparing the mean BALANCE of the
# rows assigned to each label.
kmeans_model = KMeans(n_clusters=2, random_state=42).fit(X_scaled)
all_labels = kmeans_model.predict(X_scaled)

cluster0_balance = data.loc[all_labels == 0, "BALANCE"].mean()
cluster1_balance = data.loc[all_labels == 1, "BALANCE"].mean()

# Ties (and NaN means) fall through to label 1 being the high-spend cluster.
_high_label, _low_label = (0, 1) if cluster0_balance > cluster1_balance else (1, 0)
cluster_meaning = {
    _high_label: "High Spend Customer",
    _low_label: "Low Spend Customer",
}
# Fit the hierarchical and DBSCAN models once at import time.
hierarchical_model = AgglomerativeClustering(n_clusters=2)
hierarchical_model.fit(X_scaled)

dbscan_model = DBSCAN(eps=0.5, min_samples=5)
dbscan_model.fit(X_scaled)

# Keep the DBSCAN core samples together with the cluster label of each one, so
# a new point can later be matched against its nearest core sample.
dbscan_core_samples = dbscan_model.components_
dbscan_core_labels = dbscan_model.labels_[dbscan_model.core_sample_indices_]
def predict_cluster(username, password, algorithm, k, *features): | |
if username != "kavin" or password != "1234": | |
return "β Invalid login. Please try again." | |
features_scaled = scaler.transform([features]) | |
if algorithm == "KMeans": | |
cluster = kmeans_model.predict(features_scaled)[0] | |
cluster_type = cluster_meaning.get(cluster, "Unknown Cluster") | |
return f"β Cluster {cluster} β {cluster_type} (KMeans, k=2)" | |
elif algorithm == "Hierarchical": | |
new_data = np.vstack([X_scaled, features_scaled]) | |
labels = AgglomerativeClustering(n_clusters=2).fit_predict(new_data) | |
cluster = labels[-1] | |
cluster0_balance = data.loc[labels[:-1] == 0, "BALANCE"].mean() | |
cluster1_balance = data.loc[labels[:-1] == 1, "BALANCE"].mean() | |
high_spend_cluster = 0 if cluster0_balance > cluster1_balance else 1 | |
cluster_type = "High Spend Customer" if cluster == high_spend_cluster else "Low Spend Customer" | |
return f"β Cluster {cluster} β {cluster_type} (Hierarchical Clustering)" | |
elif algorithm == "DBSCAN": | |
dists = euclidean_distances(features_scaled, dbscan_core_samples) | |
nearest_idx = np.argmin(dists) | |
nearest_dist = dists[0, nearest_idx] | |
cluster = dbscan_core_labels[nearest_idx] | |
if nearest_dist <= dbscan_model.eps: | |
valid_clusters = [c for c in np.unique(dbscan_model.labels_) if c != -1] | |
cluster_balances = {} | |
for c in valid_clusters: | |
indices = np.where(dbscan_model.labels_ == c)[0] | |
cluster_balances[c] = data.iloc[indices]["BALANCE"].mean() | |
high_spend_cluster = max(cluster_balances, key=cluster_balances.get) | |
cluster_type = "High Spend Customer" if cluster == high_spend_cluster else "Low Spend Customer" | |
return f"β Cluster {cluster} β {cluster_type} (DBSCAN, dist={nearest_dist:.2f})" | |
else: | |
return "π¨ This data point is considered an OUTLIER (noise) by DBSCAN." | |
else: | |
return "β οΈ Please select a valid clustering algorithm." | |
# Build the single-tab UI: login fields, algorithm choice, one numeric input
# per model feature, and a button that calls predict_cluster.
with gr.Blocks() as demo:
    with gr.Tab("π Login & Predict Cluster"):
        gr.Markdown("## π Login and Select Clustering Method")

        username = gr.Textbox(placeholder="Enter username", label="Username")
        password = gr.Textbox(
            placeholder="Enter password",
            label="Password",
            type="password",
        )

        algorithm = gr.Dropdown(
            ["KMeans", "Hierarchical", "DBSCAN"],
            value="KMeans",
            label="Select Clustering Algorithm",
        )
        k_value = gr.Number(value=2, label="Number of Clusters (only for KMeans)")

        # One number box per feature, pre-filled with that column's median so
        # the form is immediately submittable.
        feature_medians = data[selected_features].median()
        inputs = []
        with gr.Accordion("π§ Enter Feature Values", open=True):
            for feature in selected_features:
                default_val = float(feature_medians[feature])
                inputs.append(gr.Number(label=feature, value=default_val))

        btn = gr.Button("π Predict Cluster")
        output = gr.Textbox(label="Prediction Result", interactive=False)

        # Argument order must match predict_cluster's signature:
        # username, password, algorithm, k, then the feature values.
        btn.click(
            fn=predict_cluster,
            inputs=[username, password, algorithm, k_value, *inputs],
            outputs=output,
        )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()