#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import cv2
import random
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# The dataset is MNIST (28\*28 grayscale images). Here we extract HOG features (Histogram of Oriented Gradients, HOG):

# In[2]:

raw_data = pd.read_csv('../data/train.csv', header=0)


# In[4]:

raw_data.head()


# In[7]:

raw_data.shape


# Slicing syntax with two colons:
# seq[start:end:step]
# The original code was
# imgs = data[0::, 1::]
# labels = data[::, 0]
# but there is no need to write it that way:

# In[14]:

data = raw_data.values
imgs = data[:, 1:]
labels = data[:, 0]


# In[15]:

imgs.shape


# In[29]:

np.unique(labels)


# In[18]:

# Extract HOG features from the images with OpenCV.
def get_hog_features(trainset):
    features = []
    # The descriptor parameters are loaded from ../hog.xml; for a 28x28 input it
    # produces a 324-dimensional feature vector per image.
    hog = cv2.HOGDescriptor('../hog.xml')

    for img in trainset:
        img = np.reshape(img, (28, 28))
        cv_img = img.astype(np.uint8)

        hog_feature = hog.compute(cv_img)
        # hog_feature = np.transpose(hog_feature)
        features.append(hog_feature)

    features = np.array(features)
    features = np.reshape(features, (-1, 324))

    return features


# In[19]:

features = get_hog_features(imgs)


# In[21]:

features.shape


# In[112]:

labels.shape


# In[22]:

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=23323)


# # Prediction
# 
# Since kNN has no training step, we can predict directly. But with roughly 40,000 samples even prediction takes a long time, so here we take only the first 100 samples as the training set and 30 samples as the test set:

# In[113]:

testset, trainset, train_labels = test_features[:30], train_features[:100], train_labels[:100]


# In[121]:

k = 10        # use the 10 nearest points
predict = []
count = 0


# In[122]:

# Euclidean distance between two points (a 3-4-5 triangle, so the result is 5.0)
np.linalg.norm(np.array([0, 3]) - np.array([4, 0]))


# In[123]:

time_1 = time.time()

for test_vec in testset:
    # Progress counter, for debugging (only prints on large test sets).
    count += 1
    if count % 5000 == 0:
        print(count)

    # The current k nearest neighbours, one [dist, label] row each. Start empty:
    # a (1, 2) zero row would add a fake neighbour at distance 0.
    knn_list = np.zeros((0, 2))

    # First fill knn_list with the first k training points.
    for i in range(k):
        label = train_labels[i]
        train_vec = trainset[i]
        dist = np.linalg.norm(train_vec - test_vec)   # Euclidean distance between the two points
        knn_list = np.append(knn_list, [[dist, label]], axis=0)

    # The remaining training points.
    for i in range(k, len(train_labels)):
        label = train_labels[i]
        train_vec = trainset[i]
        dist = np.linalg.norm(train_vec - test_vec)   # Euclidean distance between the two points

        # Find the farthest of the current k neighbours.
        max_index = np.argmax(knn_list[:, 0])
        max_dist = np.max(knn_list[:, 0])

        # If the current point is closer than the farthest neighbour, replace it.
        if dist < max_dist:
            knn_list[max_index] = [dist, label]

    # At this point knn_list holds the k training vectors closest to test_vec.
    # Count the votes.
    class_total = 10
    class_count = [0 for i in range(class_total)]
    for dist, label in knn_list:
        class_count[int(label)] += 1

    # The largest vote count...
    label_max = max(class_count)

    # ...and the class it belongs to.
    predict.append(class_count.index(label_max))

time_2 = time.time()


# In[124]:

print('predict time is %s' % (time_2 - time_1))


# In[109]:

print('train time is %s' % (5 - 2))


# In[106]:

knn_list


# In[90]:

knn_list = np.zeros((0, 2))   # the current k nearest neighbours

# Fill knn_list with the first k training points (same step as in the loop above).
for i in range(k):
    label = train_labels[i]
    train_vec = trainset[i]
    dist = np.linalg.norm(train_vec - test_vec)   # Euclidean distance between the two points
    knn_list = np.append(knn_list, [[dist, label]], axis=0)


# # Scratch cells
# 
# Below I work out by hand how to find the farthest of the 10 nearest points:

# In[96]:

knn_list = np.zeros((1, 2))   # the current k nearest neighbours
knn_list


# In[94]:

np.append(knn_list, [[8.5, 9]], axis=0)


# In[78]:

knn_list_test = np.array([[2.3, 1], [3.5, 1], [1.5, 4], [6.5, 2], [5.5, 8]])
# In each row the first entry is the distance and the second is the corresponding label.
knn_list_test


# In[79]:

knn_list_test[:, 0]


# In[80]:

knn_list_test[2] = [9.5, 5]


# In[81]:

knn_list_test


# To append a row to an ndarray it must have the same shape, i.e. it has to be `[[8.5, 9]]`, not `[8.5, 9]`, and `axis` must be specified.

# In[86]:

np.append(knn_list_test, [[8.5, 9]], axis=0)


# In[87]:

knn_list_test


# In[37]:

knn_list_test[:, 0].argmax()


# In[41]:

np.array([])
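# As a side note (not part of the original loop above), the same k-NN vote can be written with
# vectorised NumPy: compute all test-to-train distances at once, keep the k smallest with
# np.argsort, and vote with np.bincount. This is only a sketch and assumes the trainset,
# testset, train_labels and k defined in the cells above.

# In[ ]:

dists = np.linalg.norm(testset[:, None, :] - trainset[None, :, :], axis=2)  # (30, 100) distance matrix
nearest = np.argsort(dists, axis=1)[:, :k]                                  # indices of the k closest training points
votes = train_labels[nearest].astype(int)                                   # their labels, shape (30, k)
predict_vec = np.array([np.bincount(row, minlength=10).argmax() for row in votes])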
# # Output the score
# 
# Once the voting is finished we have `predict`:

# In[125]:

len(predict)


# In[127]:

test_predict = np.array(predict)


# In[128]:

score = accuracy_score(test_labels[:30], test_predict)


# In[129]:

score
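# As a sanity check (not part of the original notebook), scikit-learn's KNeighborsClassifier
# can be run on the same 100/30 subset; it should give a comparable score.

# In[ ]:

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(trainset, train_labels)
accuracy_score(test_labels[:30], clf.predict(testset))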