How to tune hyperparameter in K Nearest Neighbors Classifier?

In this post, we will go through an approach to get optimal/tuned model for the final prediction. First, we will see how to select best 'k' in kNN using simple python example. We will then jump to using sklearn apis to explore different options for hyperparameter tuning.

For previous post, you can follow:

How kNN works ?

K-Nearest Neighbors Algorithm using Python and Scikit-Learn?

Out of sample accuracy estimation using cv in knn

In [1]:
import os
import math
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# making results reproducible
np.random.seed(42)
In [2]:
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None, sep=',')

df.columns = ['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID', 'ASH', 'ALCALINITY','MAGNESIUM', 'PHENOLS', 
              'FLAVANOIDS', 'NON_FLAVANOID_PHENOL', 'PROANTHOCYANINS', 'COLOR_INTENSITY', 
              'HUE', 'OD280/OD315_DILUTED','PROLINE']

# Let us use only two features : 'ALCOHOL_LEVEL', 'MALIC_ACID' for this problem
df = df[['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID']]
df.head()
Out[2]:
CLASS ALCOHOL_LEVEL MALIC_ACID
0 1 14.23 1.71
1 1 13.20 1.78
2 1 13.16 2.36
3 1 14.37 1.95
4 1 13.24 2.59

1. kNN and Cross validation using Python from Scratch

In [3]:
class KNN:
    def __init__(self, K):
        self.K = K
        self.X_train = None
        self.y_train = None
        
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def predict_instance(self, test_instance):
        inputs = self.X_train.copy()
        # calculate L2 norm between all training points and given test_point
        inputs['distance'] = np.linalg.norm(inputs.values-test_instance.values, axis=1)
        
        # concatenate inputs and labels before sorting the distances
        inputs = pd.concat([inputs, self.y_train], axis=1)

        # sort based on distance
        inputs = inputs.sort_values('distance', ascending=True)

        # pick k neighbors
        neighbors = inputs.head(self.K)

        # get list from dataframe column
        classes = neighbors['CLASS'].tolist()

        # create counter of labels
        majority_count = Counter(classes)
        
        return majority_count.most_common(1).pop()[0]
        
        
    def predict(self, X_test):
        predictions = np.zeros(X_test.shape[0])
        # we want out index to be start from 0
        X_test.reset_index(drop=True, inplace=True)
        for index, row in X_test.iterrows():
            predictions[index] = self.predict_instance(row)
        return predictions

def cross_validation(n, k, data, n_neighbors):
    """
    n : # iterations
    k : k-fold size
    data: training data
    n_neighbors: k in knn
    """
    accuracies = []
    
    for _ in range(0, n):
        # data shuffle
        data.sample(frac=1)
        
        fold=int(data.shape[0]/k)
        
        for j in range(k):
            test = data[j*fold:j*fold+fold]
            train = data[~data.index.isin(test.index)]
            X_train, y_train = train.drop('CLASS', axis=1), train['CLASS']
            X_test, y_test = test.drop('CLASS', axis=1), test['CLASS']
            
            knn = KNN(n_neighbors)
            knn.fit(X_train, y_train)
            
            predictions = knn.predict(X_test)
            true_values = y_test.to_numpy()
            accuracy = np.mean(predictions == true_values)
            
            accuracies.append(accuracy)
    return np.array(accuracies).mean()
In [4]:
# We will be using following settings for all the cases below
k_values = np.arange(1, 16)
cross_validation_fold = 10
accuracies = []

2. Finding optimal k value for kNN

In [5]:
for k in k_values:
    # run cross-validation with given neighbor size k
    accuracy = cross_validation(1, cross_validation_fold, df, k)
    accuracies.append(accuracy)
print(accuracies)
[0.6411764705882353, 0.6647058823529411, 0.7470588235294118, 0.7529411764705881, 0.7588235294117647, 0.7529411764705882, 0.7529411764705881, 0.7294117647058822, 0.7823529411764707, 0.7647058823529412, 0.7705882352941177, 0.7764705882352941, 0.7705882352941176, 0.7529411764705881, 0.7411764705882352]
In [6]:
fig = plt.figure()
plt.plot(k_values, accuracies)
plt.xlabel('k in kNN')
plt.ylabel('CV-Accuracy')
fig.suptitle('kNN hyperparameter (k) tuning with python alone', fontsize=20)
Out[6]:
Text(0.5, 0.98, 'kNN hyperparameter (k) tuning with python alone')
No description has been provided for this image

We can see that k=9 seems a good choice for our dataset.

3. Finding optimal k value for kNN using sklearn

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
In [8]:
# I wanted to see how many cpus are available in my machine so that i can run cv in parallel
os.cpu_count()
Out[8]:
8
In [9]:
X, y = df.drop('CLASS', axis=1), df['CLASS']
accuracies = []
for k in k_values:
    # instantiate kNN with given neighbor size k
    knn = KNeighborsClassifier(n_neighbors=k)
    # run cross validation for a given kNN setup
    # I have setup n_jobs=-1 to use all cpus in my env.
    scores = cross_val_score(knn, X, y, cv=cross_validation_fold, scoring='accuracy', n_jobs=-1)
    accuracies.append(scores.mean())
print(accuracies)
[0.7452936876504987, 0.7136416408668731, 0.7789946680426557, 0.778953818369453, 0.7859391124871001, 0.798765909872721, 0.8151057791537667, 0.8151057791537667, 0.7928083075335397, 0.7973770209838321, 0.8105714654282765, 0.8043214654282765, 0.8053083075335398, 0.8108638630890953, 0.8167462160302718]
In [10]:
fig2 = plt.figure()
plt.plot(k_values, accuracies)
plt.xlabel('k in kNN')
plt.ylabel('CV-Accuracy')
fig2.suptitle('kNN hyperparameter (k) tuning with sklearn', fontsize=20)
Out[10]:
Text(0.5, 0.98, 'kNN hyperparameter (k) tuning with sklearn')
No description has been provided for this image

Usually, we have to deal with many hyperparameters for a model. In order to tune all of them at once, sklearn has provided a different API. Next, we will explore GridSearchCV.

4. Tune many hyperparameters together using sklearn GridSearchCV API

We will be using distance metrices and k-neigbors for this case.
In [11]:
from sklearn.model_selection import GridSearchCV
In [12]:
metrics = ['euclidean','manhattan'] 
neighbors = np.arange(1, 16)
param_grid  = dict(metric=metrics, n_neighbors=neighbors)
param_grid
Out[12]:
{'metric': ['euclidean', 'manhattan'],
 'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])}
In [13]:
# here 10-fold cross-validation is being executed for all the combinations
# total combinations will be : 15*2 = 30
# so in total 30 10-fold cross validatin will be run
knn = KNeighborsClassifier()
# when refit=True, it will fits the best hyperparameters to all training data
# and also allow to use GridSearchCV object as an estimator for prediction
grid_search = GridSearchCV(knn, param_grid, cv=cross_validation_fold, scoring='accuracy', refit=True)
grid_search.fit(X, y)
/home/lenovo/workspace/envs/envml/lib/python3.5/site-packages/sklearn/model_selection/_search.py:813: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Out[13]:
GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [14]:
cross_val_df = pd.DataFrame(grid_search.cv_results_)
cross_val_df.head()
Out[14]:
mean_fit_time mean_score_time mean_test_score param_metric param_n_neighbors params rank_test_score split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score split5_test_score split6_test_score split7_test_score split8_test_score split9_test_score std_fit_time std_score_time std_test_score
0 0.002142 0.002116 0.747191 euclidean 1 {'n_neighbors': 1, 'metric': 'euclidean'} 28 0.736842 0.666667 0.833333 0.666667 0.777778 0.833333 0.777778 0.833333 0.764706 0.5625 0.000315 0.000213 0.082817
1 0.002033 0.002119 0.713483 euclidean 2 {'n_neighbors': 2, 'metric': 'euclidean'} 30 0.684211 0.611111 0.777778 0.666667 0.722222 0.777778 0.611111 0.833333 0.764706 0.6875 0.000164 0.000213 0.070995
2 0.001908 0.001944 0.780899 euclidean 3 {'n_neighbors': 3, 'metric': 'euclidean'} 22 0.736842 0.722222 0.888889 0.722222 0.833333 0.833333 0.888889 0.833333 0.705882 0.6250 0.000115 0.000084 0.082573
3 0.001834 0.001916 0.780899 euclidean 4 {'n_neighbors': 4, 'metric': 'euclidean'} 22 0.736842 0.666667 0.833333 0.722222 0.888889 0.833333 0.888889 0.833333 0.823529 0.5625 0.000063 0.000051 0.097615
4 0.001839 0.001939 0.786517 euclidean 5 {'n_neighbors': 5, 'metric': 'euclidean'} 21 0.736842 0.722222 0.888889 0.722222 0.888889 0.888889 0.777778 0.777778 0.705882 0.7500 0.000107 0.000093 0.070958
In [15]:
# since for both metric (manhatton/euclidean), we will have test score
# let's use euclidean for this case
accuracies = cross_val_df[cross_val_df["param_metric"]=='euclidean']["mean_test_score"]
accuracies
Out[15]:
0     0.747191
1     0.713483
2     0.780899
3     0.780899
4     0.786517
5     0.797753
6     0.814607
7     0.814607
8     0.792135
9     0.797753
10    0.808989
11    0.803371
12    0.803371
13    0.808989
14    0.814607
Name: mean_test_score, dtype: float64
In [16]:
fig3 = plt.figure()
plt.plot(k_values, accuracies)
plt.xlabel('k in kNN')
plt.ylabel('CV-Accuracy')
fig3.suptitle('kNN hyperparameter (k) tuning with GridSearchCV', fontsize=20)
Out[16]:
Text(0.5, 0.98, 'kNN hyperparameter (k) tuning with GridSearchCV')
No description has been provided for this image

5. Following the standard Machine Learning pipeline

  • Until now, we are using all the data as our training set

  • Within cross validation, data is sampled for training set and also used for building validation set from this sample

  • Final validation score is used as a performance measure.

  • Now, we will also take out some data for the final testing (this data is not allowed to touch until last testing phase)

In [17]:
from sklearn.model_selection import train_test_split
In [18]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
In [19]:
# TRAINING PHASE
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=cross_validation_fold, scoring='accuracy', refit=True)
grid_search.fit(X_train, y_train)
/home/lenovo/workspace/envs/envml/lib/python3.5/site-packages/sklearn/model_selection/_search.py:813: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Out[19]:
GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [20]:
# since refit=True,we can directly use grid_search object above as our final best model or you can do as follow:
optimal_knn = grid_search.best_estimator_
optimal_knn
Out[20]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')
You can see that the optimal_knn has neighbor size = 9 and metric = 'euclidean'.
In [21]:
# TESTING PHASE
# accuracy on test data
optimal_knn.score(X_test, y_test)
Out[21]:
0.797752808988764
In [ ]:
 

Comments

Comments powered by Disqus