import numpy as np


def _make_cosine_clusterer(distance_threshold):
    """Build an average-linkage, cosine-distance agglomerative clusterer."""
    # Imported here so the module (and the trivial-input fast paths of
    # cluster_aspect_terms) work without loading scikit-learn.
    from sklearn.cluster import AgglomerativeClustering

    options = {
        'n_clusters': None,
        'linkage': 'average',
        'distance_threshold': distance_threshold,
    }
    try:
        # scikit-learn >= 1.2 names the distance argument ``metric``.
        return AgglomerativeClustering(metric='cosine', **options)
    except TypeError:
        # Older releases only know the (since removed) ``affinity`` keyword.
        return AgglomerativeClustering(affinity='cosine', **options)


def cluster_aspect_terms(nlp, aspects, distance_threshold=0.2):
    """Map every aspect term to the most frequent term of its cluster.

    Distinct values of ``aspects['aspect']`` are embedded with ``nlp``,
    grouped by average-linkage agglomerative clustering on cosine distance,
    and each term is mapped to the member of its cluster that occurs most
    often in ``aspects``.

    Args:
        nlp: Object whose ``pipe(texts)`` yields one item per text, each
            exposing a ``vector`` attribute (presumably a spaCy pipeline
            with word vectors -- confirm against the caller).
        aspects: DataFrame with an ``'aspect'`` column holding the terms.
        distance_threshold: Linkage distance at or above which clusters are
            no longer merged. Defaults to 0.2.

    Returns:
        dict mapping each distinct, non-missing aspect term to its cluster's
        representative term. Terms that cannot be clustered (fewer than two
        candidates, or an all-zero vector) map to themselves. When several
        terms of a cluster share the highest count, the one that sorts first
        is chosen.
    """
    # Derive terms and their counts from one source so they cannot get out
    # of step; value_counts() skips missing values, and the > 0 filter drops
    # unobserved categorical levels.
    counts = aspects['aspect'].value_counts()
    frequency = counts[counts > 0].to_dict()
    aspect_terms = sorted(frequency)

    # Every term is its own representative unless clustering says otherwise.
    term_replacements = {term: term for term in aspect_terms}

    # Agglomerative clustering needs at least two samples.
    if len(aspect_terms) < 2:
        return term_replacements

    aspect_terms_sizes = np.array([frequency[term] for term in aspect_terms])
    vectors = np.asarray([doc.vector for doc in nlp.pipe(aspect_terms)])

    # Cosine distance is undefined for all-zero vectors, so leave those
    # terms unclustered.
    clusterable = np.flatnonzero(vectors.any(axis=1))
    if clusterable.size < 2:
        return term_replacements

    clusterer = _make_cosine_clusterer(distance_threshold)
    labels = clusterer.fit(vectors[clusterable]).labels_

    for label in np.unique(labels):
        # Positions (into aspect_terms) of this cluster's members, ascending.
        members = clusterable[labels == label]
        # argmax returns the first maximum, i.e. the alphabetically first
        # term among the most frequent ones.
        main_idx = members[np.argmax(aspect_terms_sizes[members])]
        main_term = aspect_terms[main_idx]
        for idx in members:
            term_replacements[aspect_terms[idx]] = main_term

    return term_replacements