Nearest Neighbors using L2 and L1 Distance

  20 Jul 2019
  python, machine learning


Preliminaries

import numpy as np

# Load data set and code labels as 0 = 'NO', 1 = 'DH', 2 = 'SL'
labels = [b'NO', b'DH', b'SL']


def _label_to_code(raw):
    """Return the position in ``labels`` of a raw class token from column 6.

    ``np.loadtxt`` passes converters either ``bytes`` or ``str`` depending on
    the NumPy version and its ``encoding`` argument, so the token is
    normalised to ``bytes`` before the lookup.

    Raises:
        ValueError: if the token is not one of the entries in ``labels``.
    """
    token = raw if isinstance(raw, bytes) else raw.encode()
    return labels.index(token)


# Column 6 holds the class name; it is converted to a numeric code so the
# whole file fits in one numeric array.
data = np.loadtxt('column_3C.dat', converters={6: _label_to_code})

# Separate features from labels
x = data[:, 0:6]  # first six columns: features
y = data[:, 6]    # last column: numeric class code

# Divide into training and test set.
# Rows 20-39 and 188-229 (62 rows) are held out for testing; the other
# listed rows (248) are used for training.
# NOTE(review): the index ranges assume the file has at least 310 rows - confirm.
training_indices = list(range(0, 20)) + list(range(40, 188)) + list(range(230, 310))
test_indices = list(range(20, 40)) + list(range(188, 230))

trainx = x[training_indices, :]
trainy = y[training_indices]
testx = x[test_indices, :]
testy = y[test_indices]

Distance Metrics

  • L2 norm

# Example vectors; defined before the formula so the snippet runs top to bottom
a = np.array([1,2,3,4])
b = np.array([5,6,7,8])

# L2 (Euclidean) distance: square root of the sum of squared differences
distance_l2norm = np.sqrt(np.sum(np.square(a-b)))

# Intermediate steps of the computation (the last line is the sum of squares,
# i.e. the value before the square root is taken)
print(a-b)
print(np.square(a-b))
print(np.sum(np.square(a-b)))
Output:
[-4 -4 -4 -4]
[16 16 16 16]
64

  • L1 norm

# L1 (Manhattan) distance: sum of the absolute element-wise differences
distance_l1norm = np.absolute(a - b).sum()

# Intermediate steps: raw differences, their absolute values, and the total
print(np.subtract(a, b))
print(np.absolute(a - b))
print(np.absolute(a - b).sum())
Output:
[-4 -4 -4 -4]
[4 4 4 4]
16

Nearest Neighbors

  • L2 norm as distance metric (the square root is omitted in the code below; it does not change which training point is nearest)
def NN_L2(trainx, trainy, testx):
    """Label each test point with the label of its nearest training point (L2).

    The squared Euclidean distance is used for the comparison; skipping the
    square root does not change which training point is closest.

    Args:
        trainx: array-like of training points, one point per entry along the
            first axis.
        trainy: array-like of training labels, aligned with ``trainx``.
        testx: array-like of points to classify, laid out like ``trainx``.

    Returns:
        NumPy array with one predicted label per entry of ``testx``. When
        several training points are equally close, the first one wins.

    Raises:
        ValueError: if ``testx`` is non-empty and ``trainx`` is empty.
    """
    train_points = np.asarray(trainx)
    train_labels = np.asarray(trainy)
    # Sum over every axis except the first, so each training point collapses
    # to a single distance (also covers 1-D inputs, where nothing is summed).
    feature_axes = tuple(range(1, train_points.ndim))

    predictions = []
    for query in np.asarray(testx):
        squared_dist = np.sum(np.square(train_points - query), axis=feature_axes)
        predictions.append(train_labels[np.argmin(squared_dist)])
    return np.asarray(predictions)
    
# Predicted labels for the test set using the L2-based nearest neighbor
testy_L2 = NN_L2(trainx, trainy, testx)

  • L1 norm as distance metric
def NN_L1(trainx, trainy, testx):
    """Label each test point with the label of its nearest training point (L1).

    Distance is the sum of absolute coordinate differences (Manhattan
    distance).

    Args:
        trainx: array-like of training points, one point per entry along the
            first axis.
        trainy: array-like of training labels, aligned with ``trainx``.
        testx: array-like of points to classify, laid out like ``trainx``.

    Returns:
        NumPy array with one predicted label per entry of ``testx``. When
        several training points are equally close, the first one wins.

    Raises:
        ValueError: if ``testx`` is non-empty and ``trainx`` is empty.
    """
    train_points = np.asarray(trainx)
    train_labels = np.asarray(trainy)
    # Sum over every axis except the first, so each training point collapses
    # to a single distance (also covers 1-D inputs, where nothing is summed).
    feature_axes = tuple(range(1, train_points.ndim))

    predictions = []
    for query in np.asarray(testx):
        abs_dist = np.sum(np.abs(train_points - query), axis=feature_axes)
        predictions.append(train_labels[np.argmin(abs_dist)])
    return np.asarray(predictions)
    
# Predicted labels for the test set using the L1-based nearest neighbor
testy_L1 = NN_L1(trainx, trainy, testx)

Predictions

# Predictions of both classifiers on the test set; these are the same calls
# that follow each function definition in the Nearest Neighbors section.
testy_L1 = NN_L1(trainx, trainy, testx)
testy_L2 = NN_L2(trainx, trainy, testx)

Error Rate

def error_rate(testy, testy_fit):
    """Return the fraction of predictions that differ from the true labels.

    Args:
        testy: array-like of true labels.
        testy_fit: array-like of predicted labels, aligned with ``testy``.

    Returns:
        Python float: number of mismatching positions divided by
        ``len(testy)``.

    Raises:
        ZeroDivisionError: if ``testy`` is empty.
    """
    actual = np.asarray(testy)
    predicted = np.asarray(testy_fit)
    # Coercing to arrays makes the comparison element-wise for plain lists
    # too; count_nonzero counts the True entries without a Python-level loop.
    mismatches = np.count_nonzero(actual != predicted)
    return float(mismatches) / len(testy)

# Report the fraction of misclassified test samples for each distance metric
print("Error rate of NN_L1: ", error_rate(testy,testy_L1) )
print("Error rate of NN_L2: ", error_rate(testy,testy_L2) )
Output:
Error rate of NN_L1:  0.22580645161290322
Error rate of NN_L2:  0.20967741935483872

Confusion Matrix

  • Using Pandas
import pandas as pd
def confusion_usingpandas(testy,testy_fit):
    """Build a confusion matrix with ``pandas.crosstab``.

    Rows correspond to the labels in ``testy`` and columns to the labels in
    ``testy_fit``. Both axes are aligned to the sorted union of all labels,
    so the result stays square even when a class never occurs on one side
    (plain ``crosstab`` would drop that row or column).

    Args:
        testy: array-like of true labels.
        testy_fit: array-like of predicted labels, aligned with ``testy``.

    Returns:
        2-D NumPy array of counts.
    """
    actual = np.asarray(testy)
    predicted = np.asarray(testy_fit)
    classes = np.union1d(actual, predicted)
    table = pd.crosstab(actual, predicted)
    # Insert all-zero rows/columns for classes missing from either axis.
    table = table.reindex(index=classes, columns=classes, fill_value=0)
    return np.asarray(table)
# Confusion matrix for the L2 predictions (rows: labels in testy, columns: predicted labels)
confusion_usingpandas(testy,testy_L2)
Output:
array([[17,  1,  2],
       [10, 10,  0],
       [ 0,  0, 22]], dtype=int64)
# Confusion matrix for the L1 predictions (rows: labels in testy, columns: predicted labels)
confusion_usingpandas(testy,testy_L1)
Output:
array([[16,  2,  2],
       [10, 10,  0],
       [ 0,  0, 22]], dtype=int64)

  • From Scratch
def confusion(testy,testy_fit):
    """Build a confusion matrix from integer-coded labels without pandas.

    Entry ``[i][j]`` counts the positions where the true label is ``i`` and
    the predicted label is ``j``. Labels are truncated to ``int`` and used
    directly as indices.

    The matrix side is at least the number of distinct values in ``testy``
    and is enlarged when needed so that every true and predicted label has
    its own row and column (for example when a predicted class never occurs
    in ``testy``).

    Args:
        testy: array-like of true labels.
        testy_fit: array-like of predicted labels, same length as ``testy``.

    Returns:
        Square 2-D NumPy float array of counts; shape ``(0, 0)`` when
        ``testy`` is empty.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    true_raw = np.asarray(testy)
    if true_raw.size == 0:
        return np.zeros((0, 0))

    true_codes = true_raw.astype(int)
    predicted_codes = np.asarray(testy_fit).astype(int)
    if len(true_codes) != len(predicted_codes):
        raise ValueError("testy and testy_fit must have the same length")

    size = max(
        len(np.unique(true_raw)),
        int(true_codes.max()) + 1,
        int(predicted_codes.max()) + 1,
    )
    counts = np.zeros((size, size))
    # add.at accumulates repeated (row, column) pairs, unlike plain fancy
    # indexing with +=, which would count each distinct pair only once.
    np.add.at(counts, (true_codes, predicted_codes), 1)
    return counts
# Confusion matrix for the L2 predictions (rows: labels in testy, columns: predicted labels)
confusion(testy,testy_L2)
Output:
array([[17.,  1.,  2.],
       [10., 10.,  0.],
       [ 0.,  0., 22.]])
# Confusion matrix for the L1 predictions (rows: labels in testy, columns: predicted labels)
confusion(testy,testy_L1)
Output:
array([[16.,  2.,  2.],
       [10., 10.,  0.],
       [ 0.,  0., 22.]])

Jupyter Notebook Link - Nearest neighbor for spine injury classification