For K means clustering algorithm, I will be using Credit Cards Dataset for Clustering from Kaggle.

In [135]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

Data preprocessing¶

In [136]:

credit_data = pd.read_csv('../data/CC GENERAL.csv')

In [137]:

credit_data.head()

Out[137]:

	CUST_ID	BALANCE	BALANCE_FREQUENCY	PURCHASES	ONEOFF_PURCHASES	INSTALLMENTS_PURCHASES	CASH_ADVANCE	PURCHASES_FREQUENCY	ONEOFF_PURCHASES_FREQUENCY	PURCHASES_INSTALLMENTS_FREQUENCY	CASH_ADVANCE_FREQUENCY	CASH_ADVANCE_TRX	PURCHASES_TRX	CREDIT_LIMIT	PAYMENTS	MINIMUM_PAYMENTS	PRC_FULL_PAYMENT	TENURE
0	C10001	40.900749	0.818182	95.40	0.00	95.4	0.000000	0.166667	0.000000	0.083333	0.000000	0	2	1000.0	201.802084	139.509787	0.000000	12
1	C10002	3202.467416	0.909091	0.00	0.00	0.0	6442.945483	0.000000	0.000000	0.000000	0.250000	4	0	7000.0	4103.032597	1072.340217	0.222222	12
2	C10003	2495.148862	1.000000	773.17	773.17	0.0	0.000000	1.000000	1.000000	0.000000	0.000000	0	12	7500.0	622.066742	627.284787	0.000000	12
3	C10004	1666.670542	0.636364	1499.00	1499.00	0.0	205.788017	0.083333	0.083333	0.000000	0.083333	1	1	7500.0	0.000000	NaN	0.000000	12
4	C10005	817.714335	1.000000	16.00	16.00	0.0	0.000000	0.083333	0.083333	0.000000	0.000000	0	1	1200.0	678.334763	244.791237	0.000000	12

A. Check for missing data¶

In [138]:

credit_data.isna().sum()

Out[138]:

CUST_ID                               0
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64

We can see that some missing values in column MINIMUM_PAYMENTS column. Since we are focusing on algorithm aspect in this tutorial, I will simply remove entries having 'NaN' value.

B. Remove 'NaN' entries¶

In [139]:

credit_data = credit_data.dropna(how='any')

C. Remove nonrelevant column/feature¶

In [140]:

# Customer ID does not bear any meaning to build cluster. So, let's remove it.
credit_data.drop("CUST_ID", axis=1, inplace=True)

In [141]:

credit_data.values.shape

Out[141]:

(8636, 17)

K Means Implementation¶

In [142]:

class KMeans:
    def __init__(self, data, K):
        """
        :data: input data matrix, numpy array
        :K: Number of clusters, int
        """
        self.data = data
        self.K = K
        self.centroids = None
        self.labels = None
        
        self.input_size = self.data.shape
        
        self.prepare_random_centroids()

    def prepare_random_centroids(self):
        """
        Randomly initialize initial K centroids. This defines the clusters.
        """
        self.centroids = self.data[np.random.choice(self.input_size[0], self.K, replace=False)]

    def assign_labels(self):
        """
        Assign each input xi to nearest cluster.
        """
        # prepare empty distance matrix : distance between each input and centroids
        distance_matrix = np.zeros((self.input_size[0], self.K))
        # Measure the distance between each input xi and each centroids
        for i in range(self.K):
            # for each row on matrix ->self.data compute distance to centroid.
            distance_matrix[:, i] = np.linalg.norm(self.data - self.centroids[i], axis=1)
        # Assign labels to input data based on minimum distance value
        # Assigning labels means assigning centroid to each data point
       
        self.labels = np.argmin(distance_matrix, axis=1)
    
    def update_centroids(self):
        """
        Based on labels, we update centroids / compute cluster centers based on mean of all inputs for a given cluster.
        """
        for i in range(self.K):
            # Let's calculate the mean for each cluster and update the centroids
            # operation along vertical in matrix (axis=0)
            self.centroids[i] = np.mean(self.data[self.labels == i], axis=0)
    
    def train(self):
        """
        Training of k-means until convergence.
        """
        old_centroids = np.zeros(self.centroids.shape)
        while not self.check_convergence(old_centroids, self.centroids):
            # B[:]=A makes a copy and B=A creates a reference
            old_centroids[:] = self.centroids
            # label assignments
            self.assign_labels()
            
            # centroids update
            self.update_centroids()
        
    @staticmethod
    def check_convergence(old_centroid, new_centroid):
        """
        Checks if two centroids are same.
        """
        return (old_centroid==new_centroid).all()

In [143]:

k_means = KMeans(credit_data.values, 10)

In [144]:

# let's see the initial random clusters before training 
fig = plt.figure(figsize=(12, 12))
plt.xlabel('Balance', fontsize=9)
plt.ylabel('Purchase', fontsize=9)
plt.title('Random clusters')
plt.scatter(k_means.data[:, 0][:200], k_means.data[:, 2][:200])
plt.scatter(k_means.centroids[:, 0], k_means.centroids[:, 2], c='r', s=100)
plt.show()

No description has been provided for this image

In [145]:

k_means.train()

In [146]:

# Final cluster plot after training
fig = plt.figure(figsize=(12, 12))
plt.xlabel('Balance', fontsize=9)
plt.ylabel('Purchase', fontsize=9)
plt.title('Final clusters')
plt.scatter(k_means.data[:, 0][:50], k_means.data[:, 2][:50])

plt.scatter(k_means.centroids[:, 0], k_means.centroids[:, 2], c='r', s=100)
plt.show()

Menu

Implementation of K means clustering algorithm in Python

Data preprocessing¶

A. Check for missing data¶

B. Remove 'NaN' entries¶

C. Remove nonrelevant column/feature¶

K Means Implementation¶