1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Aug 28 10:31:10 2018
4
+
5
+ @author: wzy
6
+ """
7
+ # 二维数据集决策边界可视化
8
+ from matplotlib .colors import ListedColormap
9
+ import matplotlib .pyplot as plt
10
+ import pandas as pd
11
+ import numpy as np
12
+
13
+ """
14
+ 类说明:构建感知机
15
+
16
+ Parameters:
17
+ eta - 学习率(0,1]
18
+ n_iter - 迭代次数
19
+ w_ - 训练后的权重数组
20
+ errors_ - 每轮训练后的误差
21
+
22
+ Returns:
23
+ None
24
+
25
+ Modify:
26
+ 2018-08-28
27
+ """
28
class Perceptron(object):
    """
    Perceptron binary classifier trained with the classic perceptron rule.

    Parameters:
        eta - learning rate, intended range (0, 1]
        n_iter - number of passes (epochs) over the training set

    Attributes (set by fit):
        w_ - weight array of length 1 + n_features; w_[0] is the bias (threshold) term
        errors_ - list with the number of weight updates (misclassified samples) per epoch
    """

    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        """
        Fit the perceptron to the training data.

        Parameters:
            X - array-like, shape = [n_samples, n_features]; training vectors
            y - array-like, shape = [n_samples]; target values (predict emits 1 / -1)

        Returns:
            self - the fitted estimator, so calls can be chained
        """
        # Coerce to ndarrays so plain Python lists work as the documented
        # "array-like" contract promises (lists have no .shape attribute).
        X = np.asarray(X)
        y = np.asarray(y)
        # Weights start as a zero vector of length m + 1, where m is the number
        # of features; the extra leading entry is the bias (threshold).
        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                # The update is zero whenever the sample is already classified correctly.
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                # Count the samples that triggered a weight change in this epoch.
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        """Calculate the net input: dot product of X with the weights, plus the bias."""
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        """Return the class label after the unit step: 1 if net input >= 0, else -1."""
        return np.where(self.net_input(X) >= 0.0, 1, -1)
67
+
68
+
69
+ """
70
+ 函数说明:导入数据集
71
+
72
+ Parameters:
73
+ None
74
+
75
+ Returns:
76
+ X - 特征矩阵
77
+ y - label列向量
78
+
79
+ Modify:
80
+ 2018-08-28
81
+ """
82
def DataSet(url='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', show=True):
    """
    Load the Iris data and build a two-class, two-feature training set.

    Parameters:
        url - location of the header-less Iris CSV (anything pandas.read_csv
              accepts: URL or local path); defaults to the UCI repository copy
        show - when True (default), draw and display a scatter plot of the two classes

    Returns:
        X - feature matrix built from columns 0 and 2 of the first 100 rows
        y - label vector for the same rows: -1 for 'Iris-setosa', 1 otherwise
    """
    # Read the CSV straight into a DataFrame; the file has no header row.
    df = pd.read_csv(url, header=None)
    # Column 4 holds the class name. Only the first 100 rows are used
    # (plotted below as 50 setosa followed by 50 versicolor).
    # iloc selects by integer position.
    y = df.iloc[0:100, 4].values
    # Encode the labels: -1 for setosa, 1 for everything else.
    y = np.where(y == 'Iris-setosa', -1, 1)
    # Keep columns 0 and 2 as the two features.
    X = df.iloc[0:100, [0, 2]].values
    if show:
        plt.scatter(X[:50, 0], X[:50, 1], color='red', marker='o', label='setosa')
        plt.scatter(X[50:100, 0], X[50:100, 1], color='blue', marker='x', label='versicolor')
        # Column 0 is sepal length and column 2 is petal length, matching the
        # axis labels used by the decision-region plot in the script section.
        plt.xlabel('sepal length')
        plt.ylabel('petal length')
        plt.legend(loc='upper left')
        plt.show()
    return X, y
106
+
107
+
108
+ """
109
+ 函数说明:绘制迭代次数与误分点个数之间的关系
110
+
111
+ Parameters:
112
+ None
113
+
114
+ Returns:
115
+ None
116
+
117
+ Modify:
118
+ 2018-08-28
119
+ """
120
def NumOfErrors():
    """
    Train a perceptron on the data returned by DataSet() and plot the number
    of misclassifications recorded in each epoch.

    Parameters:
        None

    Returns:
        None
    """
    features, labels = DataSet()
    model = Perceptron(eta=0.1, n_iter=10)
    model.fit(features, labels)
    # One x-axis tick per epoch, numbered from 1.
    epoch_numbers = range(1, len(model.errors_) + 1)
    plt.plot(epoch_numbers, model.errors_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Number of misclassifications')
    plt.show()
132
+
133
+
134
+ """
135
+ 函数说明:绘制决策区域图像
136
+
137
+ Parameters:
138
+ X - 特征矩阵
139
+ y - label列向量
140
+ classifier - 分类器
141
+ resolution - 采样间隔为0.02
142
+
143
+ Returns:
144
+ None
145
+
146
+ Modify:
147
+ 2018-08-28
148
+ """
149
def plot_decision_regions(X, y, classifier, resolution=0.02):
    """
    Draw the decision regions of a fitted classifier over a 2-D feature space
    and overlay the samples. The figure is not shown; the caller adds labels
    and calls plt.show().

    Parameters:
        X - feature matrix; columns 0 and 1 are used as the plot axes
        y - label vector aligned with the rows of X
        classifier - object exposing predict(points)
        resolution - grid step used to sample the plane (default 0.02)

    Returns:
        None
    """
    # Marker and colour per class, in sorted-label order (at most five classes).
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    # np.unique returns the distinct labels, sorted.
    classes = np.unique(y)
    # ListedColormap builds a discrete (non-gradient) colour map.
    cmap = ListedColormap(colors[:len(classes)])
    # Axis ranges, padded by one unit on each side of the data.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # Grid of sample points covering the padded ranges.
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    # Classify every grid point, then fold the predictions back into grid shape.
    z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    z = z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    for idx, cl in enumerate(classes):
        # Use `color`, not `c`: a bare RGBA tuple passed as `c` is ambiguous and
        # is read as values to colour-map when a class has exactly four points.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    color=cmap(idx), marker=markers[idx], label=cl)
172
+
173
+
174
if __name__ == '__main__':
    # Load the feature matrix and the label vector.
    X, y = DataSet()
    # Build the perceptron and train it in one step; fit() returns the estimator itself.
    ppn = Perceptron(eta=0.1, n_iter=10).fit(X, y)
    # Draw the learned decision regions with the samples on top.
    plot_decision_regions(X, y, ppn)
    # Horizontal axis: sepal length; vertical axis: petal length.
    plt.xlabel('sepal length [cm]')
    plt.ylabel('petal length [cm]')
    plt.legend(loc='upper left')
    plt.show()
188
+
0 commit comments