
Commit 537fcec
Author: louwill

Add files via upload
k nearest neighbor

1 parent 25b5b52 commit 537fcec

File tree: 2 files changed, +1057 −0 lines changed
Lines changed: 178 additions & 0 deletions
import numpy as np
from collections import Counter


class KNearestNeighbor(object):
    """ A kNN classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training labels,
          where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
          of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth
          training point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                ##############################################################
                # TODO:
                # Compute the l2 distance between the ith test point and the
                # jth training point, and store the result in dists[i, j].
                # You should not use a loop over dimension.
                ##############################################################
                dists[i, j] = np.linalg.norm(self.X_train[j, :] - X[i, :])
                ##############################################################
                #                     END OF YOUR CODE
                ##############################################################
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            ##################################################################
            # TODO:
            # Compute the l2 distance between the ith test point and all
            # training points, and store the result in dists[i, :].
            ##################################################################
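            # self.X_train - X[i, :] broadcasts the ith test point against every
            # training row; axis=1 then takes the norm of each row difference.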
            dists[i, :] = np.linalg.norm(self.X_train - X[i, :], axis=1)
            ##################################################################
            #                     END OF YOUR CODE
            ##################################################################
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        ######################################################################
        # TODO:
        # Compute the l2 distance between all test points and all training
        # points without using any explicit loops, and store the result in
        # dists.
        #
        # You should implement this function using only basic array
        # operations; in particular you should not use functions from scipy.
        #
        # HINT: Try to formulate the l2 distance using matrix multiplication
        # and two broadcast sums.
        ######################################################################
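        # Expand the squared distance: ||x_i - y_j||^2 = ||x_i||^2 - 2 x_i.y_j + ||y_j||^2.
        # M holds the cross terms, te and tr the squared norms of the test and
        # training points; broadcasting adds them into a (num_test, num_train) grid.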
        M = np.dot(X, self.X_train.T)
        te = np.square(X).sum(axis=1)
        tr = np.square(self.X_train).sum(axis=1)
        # Broadcast te as a column so the sum has shape (num_test, num_train);
        # clamp tiny negative values from floating-point error before the sqrt.
        dists = np.sqrt(np.maximum(-2 * M + tr + te[:, np.newaxis], 0))
        ######################################################################
        #                         END OF YOUR CODE
        ######################################################################
        return dists

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # A list of length k storing the labels of the k nearest neighbors to
            # the ith test point.
            closest_y = []
            ##################################################################
            # TODO:
            # Use the distance matrix to find the k nearest neighbors of the
            # ith testing point, and use self.y_train to find the labels of
            # these neighbors. Store these labels in closest_y.
            # Hint: Look up the function numpy.argsort.
            ##################################################################
            labels = self.y_train[np.argsort(dists[i, :])].flatten()
            closest_y = labels[0:k]
            ##################################################################
            # TODO:
            # Now that you have found the labels of the k nearest neighbors,
            # you need to find the most common label in the list closest_y
            # of labels. Store this label in y_pred[i]. Break ties by
            # choosing the smaller label.
            ##################################################################
            counts = Counter(closest_y)
            max_count = max(counts.values())
            # Majority vote; ties go to the smaller label, as the spec above asks.
            y_pred[i] = min(label for label, count in counts.items()
                            if count == max_count)
            ##################################################################
            #                     END OF YOUR CODE
            ##################################################################

        return y_pred
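
A minimal usage sketch (not part of the commit; the toy arrays and labels below are made up for illustration): memorize the training set with train, then classify test points with predict using the class defined above.

import numpy as np

X_train = np.array([[0., 0.], [0., 1.], [1., 0.],
                    [5., 5.], [5., 6.], [6., 5.]])   # 6 points in 2-D
y_train = np.array([0, 0, 0, 1, 1, 1])               # integer class labels
X_test = np.array([[0.5, 0.5], [5.5, 5.5]])

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
# num_loops=0 selects the fully vectorized distance computation.
y_pred = classifier.predict(X_test, k=3, num_loops=0)
print(y_pred)  # expected: [0. 1.]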

k nearest neighbor/knn.ipynb

Lines changed: 879 additions & 0 deletions
Large diffs are not rendered by default.
