import pandas as pd
import numpy as np
import cv2
import random
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
数据集是 MNIST,每张图片为 28×28 像素。这里选择提取 HOG 特征,即方向梯度直方图(Histogram of Oriented Gradients, HOG):
raw_data = pd.read_csv('../data/train.csv',header=0)
raw_data.head()
label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 785 columns
raw_data.shape
(42000, 785)
两个冒号的切片语法是 seq[start:end:step]。原来的写法是 imgs = data[0::, 1::]、labels = data[::, 0],其中 step 被省略了,第二个冒号是多余的,没必要这样写,直接写成下面的形式即可:
# Split the raw table: column 0 is the digit label, every remaining column
# is one pixel value of the flattened image.
data = raw_data.values
labels, imgs = data[:, 0], data[:, 1:]
imgs.shape
(42000, 784)
np.unique(labels)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# 利用opencv获取图像hog特征
def get_hog_features(trainset):
    """Compute a HOG feature vector for every image in *trainset*.

    Each element of *trainset* is reshaped to a 28x28 image and cast to
    ``uint8`` before being passed to an OpenCV ``HOGDescriptor`` whose
    parameters are loaded from ``'../hog.xml'``.

    Returns:
        A NumPy array reshaped to ``(-1, 324)``, i.e. one 324-value
        descriptor row per input image.
    """
    descriptor = cv2.HOGDescriptor('../hog.xml')
    per_image = [
        descriptor.compute(np.reshape(row, (28, 28)).astype(np.uint8))
        for row in trainset
    ]
    # Flatten whatever per-image shape compute() returned into 324-wide rows.
    return np.reshape(np.array(per_image), (-1, 324))
features = get_hog_features(imgs)
features.shape
(42000, 324)
labels.shape
(42000,)
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=23323)
因为 kNN 不需要训练,我们可以直接进行预测。不过因为 4 万个数据即使只是预测也非常花时间,这里只取前 100 个样本做训练集,取前 30 个样本做测试集:
# Work on a small subset so the brute-force search stays fast:
# the first 30 test samples and the first 100 training samples.
testset = test_features[:30]
trainset = train_features[:100]
train_labels = train_labels[:100]
k = 10  # number of nearest neighbours that vote
predict = []  # predicted labels, filled in by the prediction loop
count = 0  # number of test samples processed so far
# 计算两个点的欧氏距离
np.linalg.norm(np.array([0, 3]) - np.array([4, 0]))
5.0
time_1 = time.time()
class_total = 10  # number of label classes (digits 0-9)
for test_vec in testset:
    # Progress indicator: report every 5000 processed test samples.
    count += 1
    if count % 5000 == 0:
        print(count)

    # Seed the candidate list with the first k training points, one
    # [distance, label] row each.  Start from an empty (0, 2) array: seeding
    # with np.zeros((1, 2)) left a phantom neighbour (distance 0, label 0)
    # that could never be evicted and always cast an extra vote for class 0.
    knn_list = np.empty((0, 2))
    for i in range(k):
        label = train_labels[i]
        train_vec = trainset[i]
        dist = np.linalg.norm(train_vec - test_vec)  # Euclidean distance
        knn_list = np.append(knn_list, [[dist, label]], axis=0)

    # Scan the remaining training points, replacing the current farthest
    # candidate whenever a strictly closer point is found.
    for i in range(k, len(train_labels)):
        label = train_labels[i]
        train_vec = trainset[i]
        dist = np.linalg.norm(train_vec - test_vec)  # Euclidean distance
        max_index = np.argmax(knn_list[:, 0])
        max_dist = knn_list[max_index, 0]
        if dist < max_dist:
            knn_list[max_index] = [dist, label]

    # knn_list now holds the k training points nearest to test_vec.
    # Majority vote; on a tie the smallest class index wins.
    class_count = np.bincount(knn_list[:, 1].astype(int), minlength=class_total)
    predict.append(int(np.argmax(class_count)))

time_2 = time.time()
# NOTE: this measures the prediction loop above; the message text is kept as-is.
print('train time is %s' % (time_2 - time_1))
train time is 0.07612895965576172
print('train time is %s' % (5-2))
train time is 3
knn_list
array([[ 0. , 0. ], [ 1.10036302, 3. ], [ 1.09803486, 3. ], [ 1.09235775, 3. ], [ 1.03992426, 3. ], [ 1.04467952, 3. ], [ 1.06501627, 3. ], [ 0.93764162, 3. ], [ 1.05351973, 3. ], [ 1.04691565, 3. ], [ 0.9816038 , 3. ]])
# Seed the k-nearest-neighbour list for the current test_vec with the first k
# training points, one [distance, label] row each.
# Start from an empty (0, 2) array: np.array([]) is 1-D, so appending a 2-D
# [[dist, label]] row along axis=0 raises a dimension-mismatch ValueError.
knn_list = np.empty((0, 2))
for i in range(k):
    label = train_labels[i]
    train_vec = trainset[i]
    dist = np.linalg.norm(train_vec - test_vec)  # Euclidean distance
    knn_list = np.append(knn_list, [[dist, label]], axis=0)
array([], dtype=float64)
下面自己写一个寻找 10 个邻近点中距离最远的点的例子:
knn_list = np.zeros((1, 2)) # 当前k个最近邻居
knn_list
array([[ 0., 0.]])
np.append(knn_list, [[8.5, 9]], axis=0)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-94-9db999664314> in <module>() ----> 1 np.append(knn_list, [[8.5, 9]], axis=0) /Users/xu/anaconda/envs/py35/lib/python3.5/site-packages/numpy/lib/function_base.py in append(arr, values, axis) 5145 values = ravel(values) 5146 axis = arr.ndim-1 -> 5147 return concatenate((arr, values), axis=axis) ValueError: all the input array dimensions except for the concatenation axis must match exactly
knn_list_test = np.array([[2.3, 1], [3.5, 1], [1.5, 4], [6.5, 2], [5.5, 8]])
# 每个元组里,第一个是距离,第二个是对应标签
knn_list_test
array([[ 2.3, 1. ], [ 3.5, 1. ], [ 1.5, 4. ], [ 6.5, 2. ], [ 5.5, 8. ]])
knn_list_test[:, 0]
array([ 2.3, 3.5, 1.5, 6.5, 5.5])
knn_list_test[2] = [9.5, 5]
knn_list_test
array([[ 2.3, 1. ], [ 3.5, 1. ], [ 9.5, 5. ], [ 6.5, 2. ], [ 5.5, 8. ]])
要想给一个二维 ndarray 添加一行,新元素必须与原数组的维度一致,即必须是 [[8.5, 9]],不能是 [8.5, 9],而且必须用 axis=0 指定追加方向(不指定 axis 时 np.append 会先把两个数组展平)。
np.append(knn_list_test, [[8.5, 9]], axis=0)
array([[ 2.3, 1. ], [ 3.5, 1. ], [ 9.5, 5. ], [ 6.5, 2. ], [ 5.5, 8. ], [ 8.5, 9. ], [ 8.5, 9. ]])
knn_list_test
array([[ 2.3, 1. ], [ 3.5, 1. ], [ 9.5, 5. ], [ 6.5, 2. ], [ 5.5, 8. ], [ 8.5, 9. ]])
knn_list_test[:, 0].argmax()
3
np.array([])
array([], dtype=float64)
统计结束后,得到predict
len(predict)
30
# Compare the predictions with the true labels of the first 30 test samples.
test_predict = np.array(predict)
score = accuracy_score(y_true=test_labels[:30], y_pred=test_predict)
score
0.6333333333333333