In this post, we continue from our previous post: K-Nearest Neighbors Algorithm using Python and Scikit-Learn.
Before starting with the implementation, let's discuss a few important points about cross validation.
- With cross validation (CV), we split our dataset into k folds (k is usually chosen by the developer).
- Once the k folds are created, each fold is used as the test set in one run, while all remaining folds are used as the training set.
- Cross validation can be used to assess average model performance (this post), to select hyperparameters (for example, the optimal number of neighbors k in kNN), or to choose good feature combinations from the given data features. A toy sketch of the fold-splitting idea follows this list.
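Before looking at the real dataset, here is a minimal sketch of the splitting idea only: ten made-up sample indices are divided into five folds, and each fold takes one turn as the test set while the others form the training set. This is purely illustrative and is not the implementation used later in the post.

import numpy as np

indices = np.arange(10)                 # pretend we have 10 samples
folds = np.array_split(indices, 5)      # 5 folds of 2 indices each

for i, test_idx in enumerate(folds):
    # training indices are everything outside the current test fold
    train_idx = np.concatenate([f for j, f in enumerate(folds) if j != i])
    print(f"fold {i}: test={test_idx.tolist()}, train={train_idx.tolist()}")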
In [1]:
import math
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# making results reproducible
np.random.seed(42)
In [2]:
df = pd.read_csv(
'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None, sep=',')
df.columns = ['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID', 'ASH', 'ALCALINITY','MAGNESIUM', 'PHENOLS',
'FLAVANOIDS', 'NON_FLAVANOID_PHENOL', 'PROANTHOCYANINS', 'COLOR_INTENSITY',
'HUE', 'OD280/OD315_DILUTED','PROLINE']
# Let us use only two features : 'ALCOHOL_LEVEL', 'MALIC_ACID' for this problem
df = df[['CLASS', 'ALCOHOL_LEVEL', 'MALIC_ACID']]
df.head()
Out[2]:
1. Cross Validation using Python from Scratch
In [3]:
class KNN:
    def __init__(self, K):
        self.K = K
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict_instance(self, test_instance):
        inputs = self.X_train.copy()
        # calculate L2 norm between all training points and the given test point
        inputs['distance'] = np.linalg.norm(inputs.values - test_instance.values, axis=1)
        # concatenate inputs and labels before sorting the distances
        inputs = pd.concat([inputs, self.y_train], axis=1)
        # sort based on distance
        inputs = inputs.sort_values('distance', ascending=True)
        # pick the K nearest neighbors
        neighbors = inputs.head(self.K)
        # get list from dataframe column
        classes = neighbors['CLASS'].tolist()
        # create counter of labels and return the majority class
        majority_count = Counter(classes)
        return majority_count.most_common(1).pop()[0]

    def predict(self, X_test):
        predictions = np.zeros(X_test.shape[0])
        # we want our index to start from 0
        X_test.reset_index(drop=True, inplace=True)
        for index, row in X_test.iterrows():
            predictions[index] = self.predict_instance(row)
        return predictions

def cross_validation(n, k, data, n_neighbors):
    """
    n : number of iterations (re-shuffles of the data)
    k : number of folds (k-fold CV)
    data : training data
    n_neighbors : k in kNN
    """
    accuracies = []
    for _ in range(n):
        # shuffle the data; sample() is not in-place, so reassign the result
        data = data.sample(frac=1)
        fold = int(data.shape[0] / k)
        for j in range(k):
            test = data[j*fold : j*fold + fold]
            train = data[~data.index.isin(test.index)]
            X_train, y_train = train.drop('CLASS', axis=1), train['CLASS']
            X_test, y_test = test.drop('CLASS', axis=1), test['CLASS']
            knn = KNN(n_neighbors)
            knn.fit(X_train, y_train)
            predictions = knn.predict(X_test)
            true_values = y_test.to_numpy()
            accuracy = np.mean(predictions == true_values)
            accuracies.append(accuracy)
    return sum(accuracies) / len(accuracies)
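Inside predict_instance, the majority vote over the K nearest labels is done with collections.Counter. As a quick standalone illustration of that voting step (the label list below is made up):

from collections import Counter

classes = [1, 3, 1, 2, 1]                        # hypothetical labels of the 5 nearest neighbors
majority_count = Counter(classes)                # Counter({1: 3, 3: 1, 2: 1})
print(majority_count.most_common(1).pop()[0])    # 1, the majority class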
In [4]:
cross_validation(1, 10, df, 5)
Out[4]:
2. Cross Validation using Scikit-Learn
In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
In [15]:
knn_sklearn = KNeighborsClassifier(n_neighbors=5)
X, y = df.drop('CLASS', axis=1), df['CLASS']
scores = cross_val_score(knn_sklearn, X, y, cv=10, scoring='accuracy')
scores
Out[15]:
In [17]:
# We use average accuracy as an estimate of out-of-sample accuracy
scores.mean()
Out[17]:
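As noted in the introduction, the same cross-validation score can also guide hyperparameter selection. Below is a rough sketch of picking the neighbor size with cross_val_score; the candidate values of n_neighbors are arbitrary, and X, y are the features and labels defined above.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# compare a few arbitrary candidate values of k by their mean 10-fold CV accuracy
candidate_ks = [1, 3, 5, 7, 9, 11]
cv_means = {k: cross_val_score(KNeighborsClassifier(n_neighbors=k),
                               X, y, cv=10, scoring='accuracy').mean()
            for k in candidate_ks}
best_k = max(cv_means, key=cv_means.get)
print(cv_means)
print('best k:', best_k)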
3. Mix of Python and Scikit-Learn
In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
In [20]:
kf = KFold(10, shuffle=True, random_state=1)
X, y = df.drop('CLASS', axis=1), df['CLASS']
accuracies = []
for train_idx, test_idx in kf.split(X, y):
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
knn = KNN(5)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
true_values = y_test.to_numpy()
accuracies.append(accuracy_score(true_values, predictions))
sum(accuracies)/len(accuracies)
Out[20]: