In this post, we will go through an approach for obtaining an optimal (tuned) model for the final prediction. First, we will see how to select the best 'k' in kNN using a simple Python example. We will then move on to the sklearn APIs to explore different options for hyperparameter tuning.
For the previous post, see:
K-Nearest Neighbors Algorithm using Python and Scikit-Learn
import os
import math
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every Python warning for the rest of the session so library
# warnings do not clutter the notebook output.
warnings.filterwarnings('ignore')
# IPython/Jupyter magic (not plain Python): selects matplotlib's inline
# backend so figures are shown in the notebook output.
%matplotlib inline
# Seed NumPy's global random number generator to make results reproducible.
np.random.seed(42)
# Download the UCI wine data set. The raw file has no header row, so the
# column names are supplied directly while parsing.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    sep=',',
    header=None,
    names=[
        'CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID', 'ASH', 'ALCALINITY',
        'MAGNESIUM', 'PHENOLS', 'FLAVANOIDS', 'NON_FLAVANOID_PHENOL',
        'PROANTHOCYANINS', 'COLOR_INTENSITY', 'HUE', 'OD280/OD315_DILUTED',
        'PROLINE',
    ],
)
# Keep the label and only the two features used for this problem:
# 'ALCOHOL_LEVEL' and 'MALIC_ACID'.
df = df[['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID']]
df.head()
1. kNN and Cross validation using Python from Scratch
class KNN:
    """Minimal k-nearest-neighbours classifier using Euclidean (L2) distance.

    The model simply stores the training data; each prediction is a majority
    vote among the ``K`` training points closest to the query point.
    """

    def __init__(self, K):
        """Create a classifier that votes among the ``K`` nearest neighbours."""
        self.K = K
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training features and labels (kNN has no training step).

        ``X_train`` is a 2-D table of numeric features (one row per sample)
        and ``y_train`` holds one label per row, in the same row order.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict_instance(self, test_instance):
        """Return the majority label among the K nearest training points.

        ``test_instance`` is one sample (a pandas Series or any 1-D
        array-like) with the same features, in the same order, as the
        training data.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.fit() must be called before predicting")

        features = np.asarray(self.X_train, dtype=float)
        # Labels are matched to features by row position, so the result does
        # not depend on the label column's name or on index alignment.
        labels = np.asarray(self.y_train)
        query = np.asarray(test_instance, dtype=float)

        # L2 norm between every training point and the query point.
        distances = np.linalg.norm(features - query, axis=1)
        # A stable sort keeps equally distant points in training order, which
        # makes the selected neighbours deterministic.
        nearest = np.argsort(distances, kind='stable')[:self.K]

        # Counter keeps insertion order among equal counts, so a tied vote is
        # won by the label that appears first among the nearest neighbours.
        votes = Counter(labels[nearest].tolist())
        return votes.most_common(1)[0][0]

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Returns a float NumPy array with one prediction per row. ``X_test``
        itself is left untouched (its index is no longer reset in place).
        """
        rows = np.asarray(X_test, dtype=float)
        predictions = np.zeros(rows.shape[0])
        for position, row in enumerate(rows):
            predictions[position] = self.predict_instance(row)
        return predictions
def cross_validation(n, k, data, n_neighbors):
    """Estimate kNN accuracy with repeated k-fold cross-validation.

    n : # iterations (each iteration reshuffles the data and runs all k folds)
    k : number of folds
    data: training data, a DataFrame holding the features plus a 'CLASS'
          label column
    n_neighbors: k in knn

    Returns the mean accuracy over all ``n * k`` folds.

    Raises:
        ValueError: if ``k`` is smaller than 2 or larger than the number of
            rows, since every fold needs a non-empty train and test part.
    """
    n_rows = data.shape[0]
    if not 2 <= k <= n_rows:
        raise ValueError(
            "k must be between 2 and the number of rows (%d), got %r"
            % (n_rows, k)
        )

    accuracies = []
    positions = np.arange(n_rows)
    for _ in range(n):
        # DataFrame.sample returns a NEW shuffled frame; its result must be
        # kept, otherwise the folds follow the original (class-ordered) rows.
        # No random_state is passed, so the shuffle is only repeatable when
        # the caller seeds NumPy's global generator.
        shuffled = data.sample(frac=1)
        # array_split spreads the remainder over the folds, so every row is
        # used for testing exactly once per iteration.
        for test_positions in np.array_split(positions, k):
            # Split by row position so the result does not depend on the
            # index labels being unique.
            is_test = np.zeros(n_rows, dtype=bool)
            is_test[test_positions] = True
            test = shuffled.iloc[is_test]
            train = shuffled.iloc[~is_test]

            X_train, y_train = train.drop('CLASS', axis=1), train['CLASS']
            X_test, y_test = test.drop('CLASS', axis=1), test['CLASS']

            knn = KNN(n_neighbors)
            knn.fit(X_train, y_train)
            predictions = knn.predict(X_test)

            accuracies.append(np.mean(predictions == y_test.to_numpy()))
    return np.array(accuracies).mean()
# Settings shared by all of the experiments below.
cross_validation_fold = 10      # number of folds for every cross-validation
k_values = np.arange(1, 16)     # candidate neighbourhood sizes: 1..15
accuracies = []                 # one mean CV accuracy per candidate k
2. Finding optimal k value for kNN
# Score every candidate neighbourhood size with one round of
# cross-validation on the from-scratch kNN implementation.
for k in k_values:
    accuracies.append(cross_validation(1, cross_validation_fold, df, k))
print(accuracies)

# Plot the mean CV accuracy against k.
fig = plt.figure()
plt.plot(k_values, accuracies)
plt.ylabel('CV-Accuracy')
plt.xlabel('k in kNN')
fig.suptitle('kNN hyperparameter (k) tuning with python alone', fontsize=20)
We can see that k=9 seems to be a good choice for our dataset.
3. Finding optimal k value for kNN using sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Check how many CPUs this machine has, since the cross-validation below is
# run in parallel.
os.cpu_count()

# Separate the features from the label column.
X = df.drop(columns='CLASS')
y = df['CLASS']

accuracies = []
for k in k_values:
    # kNN classifier with the current neighbourhood size
    knn = KNeighborsClassifier(n_neighbors=k)
    # Cross-validate this setup; n_jobs=-1 uses every available CPU.
    scores = cross_val_score(
        knn,
        X,
        y,
        scoring='accuracy',
        cv=cross_validation_fold,
        n_jobs=-1,
    )
    accuracies.append(scores.mean())
print(accuracies)

# Plot the mean CV accuracy against k.
fig2 = plt.figure()
plt.plot(k_values, accuracies)
plt.ylabel('CV-Accuracy')
plt.xlabel('k in kNN')
fig2.suptitle('kNN hyperparameter (k) tuning with sklearn', fontsize=20)
Usually, we have to deal with many hyperparameters for a model. To tune all of them at once, sklearn provides a dedicated API. Next, we will explore GridSearchCV.
4. Tune many hyperparameters together using sklearn GridSearchCV API
We will tune the distance metric and the number of neighbors (k) in this case.
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid: every distance metric is combined with every
# neighbourhood size from 1 to 15.
metrics = ['euclidean', 'manhattan']
neighbors = np.arange(1, 16)
param_grid = {'metric': metrics, 'n_neighbors': neighbors}
param_grid
# The grid holds 15 neighbour sizes x 2 metrics = 30 combinations, and each
# combination is scored with its own 10-fold cross-validation.
knn = KNeighborsClassifier()
# With refit=True the best hyperparameters are fitted again on all of the
# training data, and the GridSearchCV object itself can then be used as an
# estimator for prediction.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cross_validation_fold,
    refit=True,
)
grid_search.fit(X, y)

# Tabulate the per-combination cross-validation results.
cross_val_df = pd.DataFrame(grid_search.cv_results_)
cross_val_df.head()
# The results table has a test score for each metric (manhattan and
# euclidean); only the euclidean rows are plotted here.
accuracies = cross_val_df.loc[
    cross_val_df["param_metric"] == 'euclidean', "mean_test_score"
]
accuracies

# Plot the mean CV accuracy against k.
fig3 = plt.figure()
plt.plot(k_values, accuracies)
plt.ylabel('CV-Accuracy')
plt.xlabel('k in kNN')
fig3.suptitle('kNN hyperparameter (k) tuning with GridSearchCV', fontsize=20)
5. Following the standard Machine Learning pipeline
- Until now, we have been using all of the data as our training set.
- Within cross-validation, this data is sampled to form the training set, and the validation set is also built from the same sample.
- The final validation score is used as the performance measure.
- Now, we will also hold out some data for final testing (this data must not be touched until the last testing phase).
from sklearn.model_selection import train_test_split
# Hold out half of the data for the final test; the other half is used for
# training and hyperparameter search.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# TRAINING PHASE
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cross_validation_fold,
    refit=True,
)
grid_search.fit(X_train, y_train)

# Because refit=True, grid_search itself can already serve as the final
# model; the refitted best estimator can also be pulled out explicitly.
optimal_knn = grid_search.best_estimator_
optimal_knn
You can see that optimal_knn has neighbor size = 9 and metric = 'euclidean'.
# TESTING PHASE
# Score the tuned model on the held-out test split, which was not used for
# fitting or hyperparameter search.
# accuracy on test data
optimal_knn.score(X_test, y_test)