For the K-means clustering algorithm, I will be using the Credit Cards Dataset for Clustering from Kaggle.
In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
Data preprocessing
In [136]:
credit_data = pd.read_csv('../data/CC GENERAL.csv')
In [137]:
credit_data.head()
Out[137]:
A. Check for missing data
In [138]:
credit_data.isna().sum()
Out[138]:
We can see that there are some missing values in the MINIMUM_PAYMENTS column. Since this tutorial focuses on the algorithm itself, I will simply remove the entries containing NaN values.
B. Remove NaN entries
In [139]:
credit_data = credit_data.dropna(how='any')
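Dropping the rows is the simplest choice for this tutorial. As an alternative sketch (not what this notebook does), the missing MINIMUM_PAYMENTS values could instead be imputed with the column median, which keeps all rows; this would have to run before the dropna() call above. The variable name credit_imputed is just for illustration:

# alternative sketch (not applied in this notebook): impute instead of dropping.
# Re-read the raw file and fill missing MINIMUM_PAYMENTS with the column median.
credit_imputed = pd.read_csv('../data/CC GENERAL.csv')
credit_imputed['MINIMUM_PAYMENTS'] = credit_imputed['MINIMUM_PAYMENTS'].fillna(
    credit_imputed['MINIMUM_PAYMENTS'].median())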
C. Remove irrelevant column/feature
In [140]:
# Customer ID carries no meaning for clustering, so let's remove it.
credit_data.drop("CUST_ID", axis=1, inplace=True)
In [141]:
credit_data.values.shape
Out[141]:
K-Means Implementation
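Before looking at the code, it helps to recall the two steps that k-means alternates until the centroids stop moving. In the assignment step, every point $x_i$ is attached to its nearest centroid,

$c_i = \arg\min_{k} \lVert x_i - \mu_k \rVert,$

and in the update step each centroid is recomputed as the mean of the points currently assigned to it,

$\mu_k = \frac{1}{|S_k|} \sum_{x_i \in S_k} x_i,$

where $S_k$ is the set of points with label $k$. The class below implements exactly these two steps, starting from K randomly chosen data points as the initial centroids.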
In [142]:
class KMeans:

    def __init__(self, data, K):
        """
        :param data: input data matrix, numpy array
        :param K: number of clusters, int
        """
        self.data = data
        self.K = K
        self.centroids = None
        self.labels = None
        self.input_size = self.data.shape
        self.prepare_random_centroids()

    def prepare_random_centroids(self):
        """
        Randomly pick K distinct data points as the initial centroids. These define the initial clusters.
        """
        self.centroids = self.data[np.random.choice(self.input_size[0], self.K, replace=False)]

    def assign_labels(self):
        """
        Assign each input x_i to its nearest centroid.
        """
        # prepare an empty distance matrix: distance between each input and each centroid
        distance_matrix = np.zeros((self.input_size[0], self.K))
        # measure the Euclidean distance between each input x_i and each centroid
        for i in range(self.K):
            # for each row of self.data, compute the distance to centroid i
            distance_matrix[:, i] = np.linalg.norm(self.data - self.centroids[i], axis=1)
        # assign labels based on the minimum distance,
        # i.e. attach each data point to its closest centroid
        self.labels = np.argmin(distance_matrix, axis=1)

    def update_centroids(self):
        """
        Based on the labels, recompute each centroid as the mean of all inputs assigned to that cluster.
        """
        for i in range(self.K):
            # mean of the points in cluster i, taken column-wise (axis=0)
            self.centroids[i] = np.mean(self.data[self.labels == i], axis=0)

    def train(self):
        """
        Run k-means until convergence, i.e. until the centroids stop moving.
        """
        old_centroids = np.zeros(self.centroids.shape)
        while not self.check_convergence(old_centroids, self.centroids):
            # B[:] = A copies the values, whereas B = A would only create a reference
            old_centroids[:] = self.centroids
            # assignment step
            self.assign_labels()
            # update step
            self.update_centroids()

    @staticmethod
    def check_convergence(old_centroid, new_centroid):
        """
        Checks whether the old and new centroids are identical.
        """
        # exact comparison is fine here: once the assignments stop changing,
        # the recomputed means reproduce the previous centroids exactly
        return (old_centroid == new_centroid).all()
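Before running the class on the credit-card data, a quick sanity check on tiny synthetic data can be helpful. This is a minimal sketch (my addition, not part of the original notebook) using two well-separated Gaussian blobs, where K=2 should recover roughly 50 points per cluster:

# quick sanity check of the KMeans class on synthetic 2-D data (a sketch)
rng = np.random.RandomState(0)
blob_a = rng.randn(50, 2) + np.array([0.0, 0.0])
blob_b = rng.randn(50, 2) + np.array([10.0, 10.0])
toy_data = np.vstack([blob_a, blob_b])

toy_kmeans = KMeans(toy_data, 2)
toy_kmeans.train()
# with two well-separated blobs, each cluster should hold about 50 points
print(np.bincount(toy_kmeans.labels))
print(toy_kmeans.centroids)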
In [143]:
k_means = KMeans(credit_data.values, 10)
In [144]:
# let's visualize the randomly initialized centroids (red) over the first 200 data points, before training
fig = plt.figure(figsize=(12, 12))
plt.xlabel('Balance', fontsize=9)
plt.ylabel('Purchase', fontsize=9)
plt.title('Random clusters')
plt.scatter(k_means.data[:, 0][:200], k_means.data[:, 2][:200])
plt.scatter(k_means.centroids[:, 0], k_means.centroids[:, 2], c='r', s=100)
plt.show()
In [145]:
k_means.train()
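To get a rough sense of how tight the clusters are after training, here is a minimal sketch (my addition, not part of the original notebook) that computes the within-cluster sum of squared distances, often called inertia, using only the trained labels and centroids:

# within-cluster sum of squared distances (inertia), a common k-means quality measure
inertia = 0.0
for i in range(k_means.K):
    members = k_means.data[k_means.labels == i]
    # squared Euclidean distance of each member to its centroid, summed over the cluster
    inertia += np.sum(np.linalg.norm(members - k_means.centroids[i], axis=1) ** 2)
print(inertia)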
In [146]:
# final centroid positions after training (red), plotted over the first 50 data points
fig = plt.figure(figsize=(12, 12))
plt.xlabel('Balance', fontsize=9)
plt.ylabel('Purchase', fontsize=9)
plt.title('Final clusters')
plt.scatter(k_means.data[:, 0][:50], k_means.data[:, 2][:50])
plt.scatter(k_means.centroids[:, 0], k_means.centroids[:, 2], c='r', s=100)
plt.show()
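As a small follow-up sketch (again my addition, not in the original notebook), we can count how many customers ended up in each cluster and redraw the scatter colored by the assigned label, using only the attributes the KMeans class already exposes:

# how many customers ended up in each cluster?
cluster_sizes = np.bincount(k_means.labels, minlength=k_means.K)
print(cluster_sizes)

# the same Balance-vs-Purchases scatter, but colored by the assigned cluster label
plt.figure(figsize=(12, 12))
plt.xlabel('Balance', fontsize=9)
plt.ylabel('Purchase', fontsize=9)
plt.title('Final clusters (colored by label)')
plt.scatter(k_means.data[:200, 0], k_means.data[:200, 2], c=k_means.labels[:200], cmap='viridis', s=10)
plt.scatter(k_means.centroids[:, 0], k_means.centroids[:, 2], c='r', s=100)
plt.show()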