"""Streamlit app that groups geographic sites into clusters of bounded size.

Two strategies are offered:

* a Hilbert-curve ordering that is cut into equally sized chunks, and
* an iterative KMeans that keeps every cluster at or below the size limit.
"""

from io import BytesIO

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from hilbertcurve.hilbertcurve import HilbertCurve
from sklearn.cluster import KMeans

# Order of the Hilbert curve: each axis is quantised onto 2**16 cells.
HILBERT_ORDER = 16

# Labels shown in the method selector (also used to dispatch).
UNIFORM_METHOD = "Uniform number of sites for each cluster"  # Hilbert Curve
KMEANS_METHOD = "Number of sites Lower than max but not uniform"  # KMeans


def _region_groups(df: pd.DataFrame, region_col: str, mix_regions: bool):
    """Return the ``(label, frame)`` pairs that must be clustered separately."""
    if mix_regions:
        return [("All", df)]
    # dropna=False keeps sites whose region is missing; the default would
    # silently remove them from the result.
    return df.groupby(region_col, dropna=False)


def _require_positive(max_sites: int) -> None:
    """Raise ``ValueError`` unless ``max_sites`` is at least 1."""
    if max_sites < 1:
        raise ValueError("max_sites must be at least 1")


def _tag_cluster(sites: pd.DataFrame, cluster_id: int) -> pd.DataFrame:
    """Return a copy of ``sites`` labelled with the cluster name ``C<id>``."""
    tagged = sites.copy()
    tagged["Cluster"] = f"C{cluster_id}"
    return tagged


def _combine_clusters(clusters: list, df: pd.DataFrame) -> pd.DataFrame:
    """Concatenate clusters, or return an empty labelled frame if none exist."""
    if clusters:
        return pd.concat(clusters)
    # pd.concat([]) raises, so build the empty result explicitly.
    empty = df.iloc[0:0].copy()
    empty["Cluster"] = pd.Series(dtype="object")
    return empty


def _grid_coordinate(values: pd.Series, order: int) -> np.ndarray:
    """Scale ``values`` onto the integer range ``[0, 2**order - 1]``."""
    numeric = values.to_numpy(dtype=float)
    if np.isnan(numeric).any():
        raise ValueError("coordinates must not contain missing values")
    low = numeric.min()
    span = numeric.max() - low
    # The small epsilon avoids a division by zero when all values are equal.
    scaled = ((numeric - low) / (span + 1e-10)) * (2**order - 1)
    return scaled.astype(np.int64)


def cluster_sites_hilbert_curve_same_size(
    df: pd.DataFrame,
    lat_col: str,
    lon_col: str,
    region_col: str,
    max_sites: int = 25,
    mix_regions: bool = False,
) -> pd.DataFrame:
    """Cluster sites into fixed-size chunks along a Hilbert curve.

    Within each region (or over all sites when ``mix_regions`` is true) the
    coordinates are mapped onto a 2-D Hilbert curve, the sites are sorted by
    their distance along the curve, and the sorted sequence is cut into
    consecutive chunks of ``max_sites`` rows. Only the last chunk of a group
    can be smaller.

    Args:
        df: Sites to cluster.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.
        region_col: Name of the region column.
        max_sites: Maximum (and target) number of sites per cluster.
        mix_regions: When true, regions are ignored and all sites are
            clustered together.

    Returns:
        A copy of the rows of ``df`` with an added ``"Cluster"`` column.
        The original index labels are preserved.

    Raises:
        ValueError: If ``max_sites`` is smaller than 1 or a coordinate is
            missing.
    """
    _require_positive(max_sites)
    curve = HilbertCurve(HILBERT_ORDER, 2)  # 2D curve

    clusters = []
    cluster_id = 0
    for _, group in _region_groups(df, region_col, mix_regions):
        if len(group) == 0:
            continue

        # The curve positions are kept outside the frame so that user columns
        # that happen to be called "x", "y" or "hilbert" are left untouched.
        grid_x = _grid_coordinate(group[lat_col], HILBERT_ORDER)
        grid_y = _grid_coordinate(group[lon_col], HILBERT_ORDER)
        distances = [
            curve.distance_from_point([x, y])
            for x, y in zip(grid_x.tolist(), grid_y.tolist())
        ]

        # Stable sort keeps the input order for sites that share a cell.
        ordered = group.iloc[np.argsort(np.array(distances), kind="stable")]

        for start in range(0, len(ordered), max_sites):
            chunk = ordered.iloc[start : start + max_sites]
            clusters.append(_tag_cluster(chunk, cluster_id))
            cluster_id += 1

    return _combine_clusters(clusters, df)


def cluster_sites_kmeans_lower_to_fixed_size(
    df: pd.DataFrame,
    lat_col: str,
    lon_col: str,
    region_col: str,
    max_sites: int = 25,
    mix_regions: bool = False,
) -> pd.DataFrame:
    """Cluster sites with repeated KMeans so no cluster exceeds ``max_sites``.

    For each region (or for all sites when ``mix_regions`` is true) KMeans is
    run on the sites that are not assigned yet. Every resulting cluster that
    is small enough is accepted; the sites of oversized clusters are fed into
    the next round. The loop ends once the leftover sites fit in one cluster.

    Args:
        df: Sites to cluster.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.
        region_col: Name of the region column.
        max_sites: Maximum number of sites per cluster.
        mix_regions: When true, regions are ignored and all sites are
            clustered together.

    Returns:
        A copy of the rows of ``df`` with an added ``"Cluster"`` column.
        The original index labels are preserved.

    Raises:
        ValueError: If ``max_sites`` is smaller than 1.
    """
    _require_positive(max_sites)

    clusters = []
    cluster_id = 0
    for _, group in _region_groups(df, region_col, mix_regions):
        remaining = group

        while len(remaining) > max_sites:
            n_clusters = int(np.ceil(len(remaining) / max_sites))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(remaining[[lat_col, lon_col]].to_numpy())

            # Positional mask of the rows accepted in this round. Working by
            # position (not index label) stays correct for non-unique indexes.
            accepted = np.zeros(len(remaining), dtype=bool)
            for label in range(n_clusters):
                members = labels == label
                size = int(members.sum())
                # Empty labels occur when KMeans finds fewer distinct
                # clusters than requested (e.g. duplicate coordinates).
                if size == 0 or size > max_sites:
                    continue
                clusters.append(_tag_cluster(remaining[members], cluster_id))
                cluster_id += 1
                accepted |= members

            if not accepted.any():
                # Every non-empty cluster is oversized, which happens when
                # many sites share the same coordinates. Peel max_sites rows
                # off one KMeans cluster so the loop is guaranteed to finish.
                positions = np.flatnonzero(labels == labels[0])[:max_sites]
                clusters.append(_tag_cluster(remaining.iloc[positions], cluster_id))
                cluster_id += 1
                accepted[positions] = True

            remaining = remaining[~accepted]

        if len(remaining) > 0:
            # Whatever is left fits in a single cluster.
            clusters.append(_tag_cluster(remaining, cluster_id))
            cluster_id += 1

    return _combine_clusters(clusters, df)


def to_excel(df: pd.DataFrame) -> bytes:
    """Serialise ``df`` to an in-memory ``.xlsx`` file (sheet "Clusters")."""
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Clusters")
    return output.getvalue()


st.title("Automatic Site Clustering App")

# Add description
st.write(
    """This app allows you to cluster sites based on their latitude and longitude.
    **Please choose a file containing the latitude and longitude region and site code columns.**
    """
)

# Download Sample file
clustering_sample_file_path = "samples/Site_Clustering.xlsx"

try:
    # Context manager so the file handle is closed on every script rerun.
    with open(clustering_sample_file_path, "rb") as sample_file:
        sample_bytes = sample_file.read()
except FileNotFoundError:
    # The sample is optional; the app itself must stay usable without it.
    st.warning("Sample file is not available.")
else:
    # Create a download button
    st.download_button(
        label="Download Clustering Sample File",
        data=sample_bytes,
        file_name="Site_Clustering.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

uploaded_file = st.file_uploader("Upload your Excel file ", type=["xlsx"])

if uploaded_file:
    df = pd.read_excel(uploaded_file)
    st.write("Sample of uploaded data:", df.head())

    columns = df.columns.tolist()

    with st.form("clustering_form"):
        lat_col = st.selectbox("Select Latitude column", columns)
        lon_col = st.selectbox("Select Longitude column", columns)
        region_col = st.selectbox("Select Region column", columns)
        code_col = st.selectbox("Select Site Code column", columns)

        max_sites = st.number_input(
            "Max sites per cluster", min_value=5, max_value=100, value=25
        )

        cluster_method = st.selectbox(
            "Select clustering method",
            [UNIFORM_METHOD, KMEANS_METHOD],
        )

        mix_regions = st.checkbox(
            "Allow mixing different regions in clusters", value=False
        )

        submitted = st.form_submit_button("Run Clustering")

    if submitted:
        # All selectors default to the first column, so guard against an
        # unconfigured form before running any numeric code.
        if lat_col == lon_col:
            st.error("Latitude and Longitude must be two different columns.")
            st.stop()
        if not all(
            pd.api.types.is_numeric_dtype(df[col]) for col in (lat_col, lon_col)
        ):
            st.error("Latitude and Longitude columns must contain numeric values.")
            st.stop()

        # Sites without coordinates cannot be placed on the map or clustered.
        missing_coords = df[[lat_col, lon_col]].isna().any(axis=1)
        if missing_coords.any():
            st.warning(
                f"Ignoring {int(missing_coords.sum())} row(s) with missing coordinates."
            )
        sites = df[~missing_coords]
        if sites.empty:
            st.error("No sites with valid coordinates were found.")
            st.stop()

        if cluster_method == UNIFORM_METHOD:
            clustered_df = cluster_sites_hilbert_curve_same_size(
                sites, lat_col, lon_col, region_col, max_sites, mix_regions
            )
        else:
            clustered_df = cluster_sites_kmeans_lower_to_fixed_size(
                sites, lat_col, lon_col, region_col, max_sites, mix_regions
            )

        st.success("Clustering completed!")

        # Show cluster size per cluster plot
        cluster_size = clustered_df["Cluster"].value_counts().sort_index()
        fig = px.bar(cluster_size, x=cluster_size.index, y=cluster_size.values)
        fig.update_layout(title="Cluster Size")
        st.plotly_chart(fig)

        # Show cluster size per region plot
        cluster_size_per_region = (
            clustered_df.groupby([region_col, "Cluster"])
            .size()
            .reset_index(name="count")
        )
        fig = px.bar(cluster_size_per_region, x="Cluster", y="count", color=region_col)
        fig.update_layout(title="Cluster Size per Region")
        st.plotly_chart(fig)

        # Map Plot
        # The helper "size" column lives on a plotting copy only, so it does
        # not end up in the Excel export below.
        map_df = clustered_df.assign(size=10)
        fig = px.scatter_map(
            map_df,
            lat=lat_col,
            lon=lon_col,
            color="Cluster",
            size="size",
            hover_name=code_col,
            hover_data=[region_col],
            zoom=5,
            height=600,
            # scatter_map draws on layout.map, so the style is set through
            # map_style (mapbox_style would configure an unused subplot).
            map_style="open-street-map",
        )
        fig.update_traces(marker=dict(size=15))
        st.plotly_chart(fig)

        # Download button
        st.download_button(
            label="Download clustered Excel file",
            data=to_excel(clustered_df),
            file_name="clustered_sites.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            on_click="ignore",
            type="primary",
        )