In this post, we will be continuing from our previous post:

K-Nearest Neighbors Algorithm using Python and Scikit-Learn?

Before starting with the implementation, let's discuss few important points in cross validation.

1. Using Cross validation (CV), we splits our dataset into k folds (k generally setup by developer)
2. Once you created k folds, you use each of the folds as test set during run and all remaining folds as train set.
3. With cross validation, one can assess the average model performance (this post) or also for the hyperparameters selection (for example : selecting optimal neighbors size(k) in kNN) or selecting good feature combinations from given data features.
In :
import math
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# making results reproducible
np.random.seed(42)

In :
df = pd.read_csv(

df.columns = ['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID', 'ASH', 'ALCALINITY','MAGNESIUM', 'PHENOLS',
'FLAVANOIDS', 'NON_FLAVANOID_PHENOL', 'PROANTHOCYANINS', 'COLOR_INTENSITY',
'HUE', 'OD280/OD315_DILUTED','PROLINE']

# Let us use only two features : 'ALCOHOL_LEVEL', 'MALIC_ACID' for this problem
df = df[['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID']]

Out:
CLASS ALCOHOL_LEVEL MALIC_ACID
0 1 14.23 1.71
1 1 13.20 1.78
2 1 13.16 2.36
3 1 14.37 1.95
4 1 13.24 2.59

#### 1. Cross validation using Python from Scratch¶

In :
class KNN:
def __init__(self, K):
self.K = K
self.X_train = None
self.y_train = None

def fit(self, X_train, y_train):
self.X_train = X_train
self.y_train = y_train

def predict_instance(self, test_instance):
inputs = self.X_train.copy()
# calculate L2 norm between all training points and given test_point
inputs['distance'] = np.linalg.norm(inputs.values-test_instance.values, axis=1)

# concatenate inputs and labels before sorting the distances
inputs = pd.concat([inputs, self.y_train], axis=1)

# sort based on distance
inputs = inputs.sort_values('distance', ascending=True)

# pick k neighbors

# get list from dataframe column
classes = neighbors['CLASS'].tolist()

# create counter of labels
majority_count = Counter(classes)

return majority_count.most_common(1).pop()

def predict(self, X_test):
predictions = np.zeros(X_test.shape)
# we want out index to be start from 0
X_test.reset_index(drop=True, inplace=True)
for index, row in X_test.iterrows():
predictions[index] = self.predict_instance(row)
return predictions

def cross_validation(n, k, data, n_neighbors):
"""
n : # iterations
k : k-fold size
data: training data
n_neighbors: k in knn
"""
accuracies = []

for _ in range(0, n):
# data shuffle
data.sample(frac=1)

fold=int(data.shape/k)

for j in range(k):
test = data[j*fold:j*fold+fold]
train = data[~data.index.isin(test.index)]
X_train, y_train = train.drop('CLASS', axis=1), train['CLASS']
X_test, y_test = test.drop('CLASS', axis=1), test['CLASS']

knn = KNN(n_neighbors)
knn.fit(X_train, y_train)

predictions = knn.predict(X_test)
true_values = y_test.to_numpy()
accuracy = np.mean(predictions == true_values)

accuracies.append(accuracy)
return sum(accuracies)/len(accuracies)

In :
cross_validation(1, 10, df, 5)

Out:
0.7588235294117647

#### 2 Cross validation using Scikit-Learn¶

In :
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In :
knn_sklearn = KNeighborsClassifier(n_neighbors=5)

X, y = df.drop('CLASS', axis=1), df['CLASS']

scores = cross_val_score(knn_sklearn, X, y, cv=10, scoring='accuracy')
scores

Out:
array([0.73684211, 0.72222222, 0.88888889, 0.72222222, 0.88888889,
0.88888889, 0.77777778, 0.77777778, 0.70588235, 0.75      ])
In :
# We use average accuracy as an estimate of out-of-sample accuracy
scores.mean()

Out:
0.7859391124871001

#### 3. Mix of python and sklearn¶

In :
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In :
kf = KFold(10, shuffle=True, random_state=1)
X, y = df.drop('CLASS', axis=1), df['CLASS']
accuracies = []
for train_idx, test_idx in kf.split(X, y):
X_train, X_test  = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

knn = KNN(5)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

true_values = y_test.to_numpy()
accuracies.append(accuracy_score(true_values, predictions))
sum(accuracies)/len(accuracies)

Out:
0.7983660130718955
In [ ]: