What is the code in Python (scikit-learn) that changes k-means (which uses Euclidean distance by default) to use cosine distance as its distance measure?
Expert Answer
from sklearn.cluster import k_means_
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.preprocessing import StandardScaler
def generate_cluster(sparseData, numberOfClusters=10):
    """Cluster the rows of ``sparseData`` with k-means under a cosine geometry.

    Instead of monkeypatching scikit-learn's internal Euclidean distance
    function, every row is rescaled to unit L2 norm before clustering.  For
    unit vectors ``a`` and ``b``::

        ||a - b||^2 == 2 * (1 - cos(a, b))

    so Euclidean distance between rows is a monotonic function of their
    cosine distance, and ordinary k-means on the normalised rows groups them
    by direction rather than magnitude.

    Args:
        sparseData: Feature matrix with one sample per row, in any form
            accepted by ``StandardScaler.fit_transform``.
            (Presumably a SciPy sparse matrix, given the name and
            ``with_mean=False`` - confirm against the caller.)
        numberOfClusters: Number of clusters to form.

    Returns:
        The fitted ``KMeans.labels_`` array: one cluster index per input row.
    """
    # Imported locally so this function relies only on scikit-learn's public
    # API and no longer needs the private ``sklearn.cluster.k_means_`` module.
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import normalize

    # Scale each feature to unit variance without centring it.
    std_scaler = StandardScaler(with_mean=False)
    scaled = std_scaler.fit_transform(sparseData)

    # Project every sample onto the unit sphere; this is what makes the
    # Euclidean k-means below behave like a cosine-based clustering.
    unit_rows = normalize(scaled, norm="l2")

    # ``n_jobs`` is intentionally not passed: newer scikit-learn releases no
    # longer accept it.  The seed is fixed so results are reproducible.
    k_means = KMeans(n_clusters=numberOfClusters, random_state=3425)
    k_means.fit(unit_rows)
    return k_means.labels_