Skip to content

Commit 1c264ab

Browse files
authored
Add files via upload
1 parent 95d0dc6 commit 1c264ab

File tree

18 files changed

+10979
-0
lines changed

18 files changed

+10979
-0
lines changed

PCA_Project2/PCA.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Aug 7 09:50:58 2018
4+
5+
@author: wzy
6+
"""
7+
import numpy as np
8+
import matplotlib.pyplot as plt
9+
10+
"""
11+
函数说明:解析文本数据
12+
13+
Parameters:
14+
filename - 文件名
15+
delim - 每一行不同特征数据之间的分隔方式,默认是tab键'\t'
16+
17+
Returns:
18+
将float型数据值列表转化为矩阵返回
19+
20+
Modify:
21+
2018-08-07
22+
"""
23+
def loadDataSet(filename, delim='\t'):
    """Parse a delimited text file of numeric values into a matrix.

    Parameters:
        filename - path of the text file to read
        delim - separator between the values on one line (default: tab)

    Returns:
        numpy matrix with one row per non-blank line and one column per value

    Raises:
        ValueError - if a field cannot be converted to float
    """
    # The context manager closes the handle even when parsing raises.
    with open(filename) as fr:
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            # Blank lines (e.g. a trailing newline) carry no sample.
            if line.strip()
        ]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    return np.asmatrix(rows)
28+
29+
30+
"""
31+
函数说明:PCA特征维度压缩函数
32+
33+
Parameters:
34+
dataMat - 数据集数据
35+
topNfeat - 需要保留的特征维度,即要压缩成的维度数,默认4096
36+
37+
Returns:
38+
lowDDataMat - 压缩后的数据矩阵
39+
reconMat - 压缩后的数据矩阵反构出原始数据矩阵
40+
41+
Modify:
42+
2018-08-07
43+
"""
44+
def pca(dataMat, topNfeat=4096):
    """Compress the feature dimension of a data set with PCA.

    Parameters:
        dataMat - data set, one sample per row and one feature per column
        topNfeat - number of principal components to keep (default 4096);
                   a value larger than the feature count keeps all of them

    Returns:
        lowDDataMat - the data projected onto the kept components
        reconMat - the data rebuilt in the original space from lowDDataMat
    """
    data = np.asmatrix(dataMat)
    # Centre every feature (column) on zero.
    meanVals = np.mean(data, axis=0)
    meanRemoved = data - meanVals
    # Covariance with features in columns; np.cov divides by n-1 by default.
    # atleast_2d keeps the single-feature case (0-d result) usable below.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=0))
    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and orthonormal eigenvectors, whereas the general eig
    # solver may return complex values because of round-off.
    eigVals, eigVects = np.linalg.eigh(covMat)
    # argsort is ascending; walk it backwards to take the indices of the
    # topNfeat largest eigenvalues, largest first.
    eigValInd = np.argsort(eigVals)[: -(topNfeat + 1): -1]
    # Columns of the projection matrix are the selected eigenvectors.
    redEigVects = np.asmatrix(eigVects[:, eigValInd])
    # Project the centred data into the reduced space.
    lowDDataMat = meanRemoved * redEigVects
    # Map back to the original space; the columns are orthonormal, so the
    # transpose serves as the inverse of the projection.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
71+
72+
73+
"""
74+
函数说明:缺失值处理函数
75+
76+
Parameters:
77+
None
78+
79+
Returns:
80+
datMat - 处理后的数据集
81+
82+
Modify:
83+
2018-08-07
84+
"""
85+
def replaceNaNWithMean():
    """Load 'secom.data' and replace every NaN with its column mean.

    Parameters:
        None

    Returns:
        datMat - the loaded matrix with NaN entries replaced in place
    """
    # Values in the file are separated by single spaces.
    datMat = loadDataSet('secom.data', ' ')
    numFeat = np.shape(datMat)[1]
    for i in range(numFeat):
        # Mask of the NaN entries of THIS column (the mask must follow i;
        # a fixed column would pick the wrong rows for every other feature).
        nanMask = np.isnan(datMat[:, i].A)
        # Mean over the rows of column i that hold a real value.
        meanVal = np.mean(datMat[np.nonzero(~nanMask)[0], i])
        # A column with no valid value yields NaN; fall back to 0.
        if np.isnan(meanVal):
            meanVal = 0
        # Overwrite the NaN entries of column i with the mean.
        datMat[np.nonzero(nanMask)[0], i] = meanVal
    return datMat
99+
100+
101+
if __name__ == '__main__':
    # Load the data set with its missing values filled in.
    dataMat = replaceNaNWithMean()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Centre the data and build its covariance matrix.
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    covMat = np.cov(meanRemoved, rowvar=0)
    # Only the eigenvalues are plotted. eigvalsh (symmetric input) returns
    # them real and in ascending order, so reverse to get largest first;
    # taking the first entries of an unsorted result would not show the
    # leading components.
    eigVals = np.linalg.eigvalsh(covMat)[::-1]
    # Number of components to plot and to keep.
    i = 6
    ax.scatter(range(i), eigVals[:i], marker='^', s=50, c='red')
    ax.plot(range(i), eigVals[:i])
    lowDmat, reconMat = pca(dataMat, topNfeat=i)
    # The data reduced to the 6 extracted features.
    print(lowDmat)
    # Display the eigenvalue plot built above.
    plt.show()

PCA_Project2/secom.data

Lines changed: 1567 additions & 0 deletions
Large diffs are not rendered by default.

Perceptron_Project1/DataSet.png

18.4 KB
Loading

Perceptron_Project1/Perceptron.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Aug 28 10:31:10 2018
4+
5+
@author: wzy
6+
"""
7+
# 二维数据集决策边界可视化
8+
from matplotlib.colors import ListedColormap
9+
import matplotlib.pyplot as plt
10+
import pandas as pd
11+
import numpy as np
12+
13+
"""
14+
类说明:构建感知机
15+
16+
Parameters:
17+
eta - 学习率(0,1]
18+
n_iter - 迭代次数
19+
w_ - 训练后的权重数组
20+
errors_ - 每轮训练后的误差
21+
22+
Returns:
23+
None
24+
25+
Modify:
26+
2018-08-28
27+
"""
28+
class Perceptron(object):
    """Perceptron classifier for two classes labelled -1 and 1.

    Parameters:
        eta - learning rate, in (0, 1]
        n_iter - number of passes (epochs) over the training set

    Attributes set by fit:
        w_ - weight array after training; w_[0] is the bias (threshold) term
        errors_ - number of weight updates (misclassified samples) per epoch
    """

    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        """Fit the weights to the training data.

        Parameters:
            X - array-like, shape [n_samples, n_features], training vectors
            y - array-like, shape [n_samples], target values (-1 or 1)

        Returns:
            self
        """
        # Accept any array-like (lists included), as documented.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        # One weight per feature plus one extra slot for the bias, all zero.
        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                # The update is zero when the sample is classified correctly.
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                # Count the samples misclassified in this epoch.
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        """Return the dot product of X and the weights, plus the bias."""
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        """Return the class label (1 or -1) after the unit step."""
        return np.where(self.net_input(X) >= 0.0, 1, -1)
67+
68+
69+
"""
70+
函数说明:导入数据集
71+
72+
Parameters:
73+
None
74+
75+
Returns:
76+
X - 特征矩阵
77+
y - label列向量
78+
79+
Modify:
80+
2018-08-28
81+
"""
82+
def DataSet():
    """Load the first 100 iris samples and show them in a scatter plot.

    Parameters:
        None

    Returns:
        X - feature matrix built from columns 0 and 2 of the data file
        y - label vector: -1 for 'Iris-setosa', 1 otherwise
    """
    # Read the iris data straight from the UCI repository into a DataFrame.
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
    # Labels of the first 100 rows; iloc indexes by integer position.
    y = df.iloc[0:100, 4].values
    # Encode the labels: -1 for setosa, 1 for the other class present.
    y = np.where(y == 'Iris-setosa', -1, 1)
    # Features 0 and 2 (sepal length and petal length in the UCI column order).
    X = df.iloc[0:100, [0, 2]].values
    # Rows 0-49 are plotted as setosa, rows 50-99 as versicolor.
    plt.scatter(X[:50, 0], X[:50, 1], color='red', marker='o', label='setosa')
    plt.scatter(X[50:100, 0], X[50:100, 1], color='blue', marker='x', label='versicolor')
    # The x axis carries column 0 and the y axis column 2, so label them in
    # that order.
    plt.xlabel('sepal length')
    plt.ylabel('petal length')
    plt.legend(loc='upper left')
    plt.show()
    return X, y
106+
107+
108+
"""
109+
函数说明:绘制迭代次数与误分点个数之间的关系
110+
111+
Parameters:
112+
None
113+
114+
Returns:
115+
None
116+
117+
Modify:
118+
2018-08-28
119+
"""
120+
def NumOfErrors():
    """Plot the number of misclassified samples against the training epoch.

    Parameters:
        None

    Returns:
        None
    """
    # Load the data and train a fresh perceptron on it.
    features, labels = DataSet()
    model = Perceptron(eta=0.1, n_iter=10)
    model.fit(features, labels)
    # One error count per epoch; epochs are numbered from 1 on the x axis.
    error_counts = model.errors_
    epochs = range(1, len(error_counts) + 1)
    plt.plot(epochs, error_counts, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Number of misclassifications')
    plt.show()
132+
133+
134+
"""
135+
函数说明:绘制决策区域图像
136+
137+
Parameters:
138+
X - 特征矩阵
139+
y - label列向量
140+
classifier - 分类器
141+
resolution - 采样间隔为0.02
142+
143+
Returns:
144+
None
145+
146+
Modify:
147+
2018-08-28
148+
"""
149+
def plot_decision_regions(X, y, classifier, resolution=0.02):
    """Draw a classifier's decision regions together with the samples.

    Parameters:
        X - feature matrix; column 0 goes on the x axis, column 1 on the y axis
        y - label vector, one entry per row of X
        classifier - object whose predict() takes an [n_points, 2] array
        resolution - step of the sampling grid (default 0.02)

    Returns:
        None
    """
    # Marker and colour per class, in the order of the sorted unique labels.
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    # np.unique returns the distinct labels sorted; ListedColormap builds a
    # discrete (non-gradient) colour map with one colour per class.
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])
    # Axis ranges: the data range widened by 1 on each side.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # Grid of points covering both ranges.
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    # Classify every grid point, then restore the grid shape for contourf.
    z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    z = z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    for idx, cl in enumerate(classes):
        # Pass the RGBA tuple through `color`, not `c`: a bare 4-tuple given
        # as `c` is ambiguous and is read as four values to colour-map when a
        # class happens to contain exactly four points.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    color=cmap(idx), marker=markers[idx], label=cl)
172+
173+
174+
if __name__ == '__main__':
    # Load the samples (this also shows the raw scatter plot).
    X, y = DataSet()
    # Build and train the perceptron; fit returns the instance itself.
    ppn = Perceptron(eta=0.1, n_iter=10).fit(X, y)
    # Draw the learned decision regions over the samples.
    plot_decision_regions(X, y, classifier=ppn)
    # x axis: sepal length; y axis: petal length.
    plt.xlabel('sepal length [cm]')
    plt.ylabel('petal length [cm]')
    plt.legend(loc='upper left')
    plt.show()
188+
17.5 KB
Loading
Loading

0 commit comments

Comments
 (0)