# Question & Answer: I need to run k-means in Python using random initialisation (not k-means++)…

I need to run k-means in Python using random initialisation (not the k-means++ initialisation that is the default in sklearn). How do I change the code to reflect this?

Python code: This code produces five plots, one for each step. The last plot shows the final k-means clustering of the data.

source code:

## Step 1: Initialisation

import pandas as pand
import numpy as npVal
import matplotlib.pyplot as plot

# Sample 2-D points to cluster.
df = pand.DataFrame({
    'x': [12, 20, 28, 18, 29, 33, 24, 45, 45, 52, 51, 52, 55, 53, 55, 61, 64, 69, 72],
    'y': [39, 36, 30, 52, 54, 46, 55, 59, 63, 70, 66, 63, 58, 23, 14, 8, 19, 7, 24],
})

# Fix the seed so the random starting centroids are reproducible.
npVal.random.seed(200)
k = 3
# Random initialisation: every centroid (keyed 1..k) starts at an integer
# point whose coordinates are each drawn from [0, 80).
centriodVal = {
    i + 1: [npVal.random.randint(0, 80), npVal.random.randint(0, 80)]
    for i in range(k)
}

# Plot the raw points in black with the starting centroids drawn on top.
fig = plot.figure(figsize=(5, 5))
plot.scatter(df['x'], df['y'], color='k')
# One colour per centroid key; the same map colours points by cluster later.
colmap = {1: 'r', 2: 'g', 3: 'b'}
for i in centriodVal.keys():
    plot.scatter(*centriodVal[i], color=colmap[i])
plot.xlim(0, 80)
plot.ylim(0, 80)
plot.show()

## Step 2: Assigning values

def assignValues(df, centriodVal, colors=None):
    """Assign every point in ``df`` to its nearest centroid.

    The following columns are added to (or overwritten on) ``df`` in place:

    * ``distance_from_<key>`` for each centroid key: Euclidean distance
      from the point to that centroid.
    * ``closest``: key of the centroid with the smallest distance.
    * ``color``: colour looked up for ``closest``.

    Args:
        df: DataFrame with ``x`` and ``y`` coordinate columns.
        centriodVal: Mapping of centroid key to an ``[x, y]`` pair.
        colors: Optional mapping of centroid key to a colour. When omitted,
            the module-level ``colmap`` is used.

    Returns:
        The same ``df`` object, with the columns above populated.

    Raises:
        KeyError: If a centroid key has no entry in the colour mapping.
    """
    if colors is None:
        colors = colmap

    column_to_key = {}
    for i in centriodVal:
        column = 'distance_from_{}'.format(i)
        column_to_key[column] = i
        df[column] = npVal.sqrt(
            (df['x'] - centriodVal[i][0]) ** 2
            + (df['y'] - centriodVal[i][1]) ** 2
        )

    # idxmin(axis=1) yields the label of the smallest distance column; map
    # that label straight back to its centroid key instead of parsing it
    # with str.lstrip, which strips a character set rather than a prefix.
    nearest_column = df[list(column_to_key)].idxmin(axis=1)
    df['closest'] = nearest_column.map(column_to_key)
    df['color'] = df['closest'].map(lambda key: colors[key])
    return df

# First assignment pass: label each point with its nearest starting centroid.
df = assignValues(df, centriodVal)

# Plot the points coloured by assigned cluster, with the centroids on top.
fig = plot.figure(figsize=(5, 5))
plot.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centriodVal.keys():
    plot.scatter(*centriodVal[i], color=colmap[i])
plot.xlim(0, 80)
plot.ylim(0, 80)
plot.show()

## Step 3: Update the centroid values

import copy

# Snapshot the current centroid positions before they are updated, so the
# old positions remain available for drawing the movement arrows afterwards.
# A deep copy is needed because the inner [x, y] lists are modified in place.
old_centriodVal = copy.deepcopy(centriodVal)

def update(k, data=None):
    """Move each centroid to the mean of the points assigned to it.

    Args:
        k: Mapping of centroid key to a mutable ``[x, y]`` pair. The pairs
            are modified in place.
        data: Optional DataFrame with ``x``, ``y`` and ``closest`` columns.
            When omitted, the module-level ``df`` is used.

    Returns:
        The same mapping ``k``, with updated coordinates.
    """
    points = df if data is None else data
    # Operate on the mapping that was passed in rather than on the global
    # ``centriodVal``, so the function really updates its own argument.
    for i in k:
        members = points[points['closest'] == i]
        if members.empty:
            # A centroid with no assigned points keeps its position; the
            # mean of an empty selection would be NaN.
            continue
        k[i][0] = npVal.mean(members['x'])
        k[i][1] = npVal.mean(members['y'])
    return k

# Move every centroid to the mean of its currently assigned points.
centriodVal = update(centriodVal)

# Plot the clusters with the moved centroids, plus an arrow from each old
# centroid position towards its new one.
fig = plot.figure(figsize=(5, 5))
ax = plot.axes()
plot.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centriodVal.keys():
    plot.scatter(*centriodVal[i], color=colmap[i])
plot.xlim(0, 80)
plot.ylim(0, 80)
for i in old_centriodVal.keys():
    old_x = old_centriodVal[i][0]
    old_y = old_centriodVal[i][1]
    # The arrow shaft covers only 75% of the displacement
    # (presumably to leave room for the arrow head).
    dx = (centriodVal[i][0] - old_centriodVal[i][0]) * 0.75
    dy = (centriodVal[i][1] - old_centriodVal[i][1]) * 0.75
    ax.arrow(old_x, old_y, dx, dy, head_width=2, head_length=3, fc=colmap[i], ec=colmap[i])
plot.show()

## Continue assigning the values.

# Re-assign every point to its nearest centroid now that they have moved.
df = assignValues(df, centriodVal)

# Plotting the results based on the values
fig = plot.figure(figsize=(5, 5))
plot.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centriodVal.keys():
    plot.scatter(*centriodVal[i], color=colmap[i])
plot.xlim(0, 80)
plot.ylim(0, 80)
plot.show()

# Continue until the assigned clusters don't change any more.
while True:
    # Keep the previous assignment so it can be compared after this pass.
    closestCentriodVal = df['closest'].copy(deep=True)
    centriodVal = update(centriodVal)
    df = assignValues(df, centriodVal)
    # Converged: no point switched cluster during this pass.
    if closestCentriodVal.equals(df['closest']):
        break

# Final plot: converged cluster assignment with the final centroids on top.
fig = plot.figure(figsize=(5, 5))
plot.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centriodVal.keys():
    plot.scatter(*centriodVal[i], color=colmap[i])
plot.xlim(0, 80)
plot.ylim(0, 80)
plot.show()

Output screenshot: