kings186
diff --git a/‎PCA_Project2/PCA.py
Lines changed: 117 additions & 0 deletions b/‎PCA_Project2/PCA.py
Lines changed: 117 additions & 0 deletions
diff --git a/‎PCA_Project2/secom.data
Lines changed: 1567 additions & 0 deletions b/‎PCA_Project2/secom.data
Lines changed: 1567 additions & 0 deletions
diff --git a/‎Perceptron_Project1/DataSet.png
18.4 KB b/‎Perceptron_Project1/DataSet.png
18.4 KB
diff --git a/‎Perceptron_Project1/Perceptron.py
Lines changed: 188 additions & 0 deletions b/‎Perceptron_Project1/Perceptron.py
Lines changed: 188 additions & 0 deletions
diff --git a/‎Perceptron_Project1/决策区域图像.png
17.5 KB b/‎Perceptron_Project1/决策区域图像.png
17.5 KB
diff --git a/‎Perceptron_Project1/每次迭代错误分类数量.png
21.1 KB b/‎Perceptron_Project1/每次迭代错误分类数量.png
21.1 KB
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  7 09:50:58 2018
+
+@author: wzy
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+
+"""
+函数说明：解析文本数据
+
+Parameters:
+    filename - 文件名
+    delim - 每一行不同特征数据之间的分隔方式，默认是tab键‘\t’
+    
+Returns:
+    将float型数据值列表转化为矩阵返回
+
+Modify:
+    2018-08-07
+"""
+def loadDataSet(filename, delim='\t'):
+    """Parse a delimited text file of numbers into a NumPy matrix.
+
+    Parameters:
+        filename - path of the text file to read
+        delim - separator between the values of one line, tab by default
+
+    Returns:
+        np.matrix with one row per non-blank line and one float per field
+
+    Raises:
+        ValueError - if a field cannot be converted by float()
+    """
+    # The context manager closes the handle even when parsing fails.
+    with open(filename) as fr:
+        # Blank lines (e.g. a trailing newline at end of file) hold no data
+        # and would otherwise make float('') raise.
+        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
+    datArr = [list(map(float, fields)) for fields in stringArr]
+    # np.mat was removed in NumPy 2.0; np.asmatrix returns the same np.matrix.
+    return np.asmatrix(datArr)
+
+
+"""
+函数说明：PCA特征维度压缩函数
+
+Parameters:
+    dataMat - 数据集数据
+    topNfeat - 需要保留的特征维度，即要压缩成的维度数，默认4096
+    
+Returns:
+    lowDDataMat - 压缩后的数据矩阵
+    reconMat - 压缩后的数据矩阵反构出原始数据矩阵
+
+Modify:
+    2018-08-07
+"""
+def pca(dataMat, topNfeat=4096):
+    """Project a data set onto its leading principal components.
+
+    Parameters:
+        dataMat - data set, one sample per row and one feature per column
+        topNfeat - number of components to keep, 4096 by default; a value
+                   above the feature count keeps every component
+
+    Returns:
+        lowDDataMat - the centred data expressed in the kept components
+        reconMat - the data rebuilt from lowDDataMat in the original space
+    """
+    dataMat = np.asmatrix(dataMat)
+    # Centre every feature (column) on zero.
+    meanVals = np.mean(dataMat, axis=0)
+    meanRemoved = dataMat - meanVals
+    # rowvar=0 treats columns as variables; by default np.cov divides by
+    # n - 1 (unbiased estimate). atleast_2d covers the single-feature case.
+    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=0))
+    # The covariance matrix is symmetric, so use eigh: it returns real
+    # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
+    # eig may return complex values caused purely by rounding.
+    eigVals, eigVects = np.linalg.eigh(covMat)
+    eigVects = np.asmatrix(eigVects)
+    # Indices of the topNfeat largest eigenvalues, largest first.
+    eigValInd = np.argsort(eigVals)[: -(topNfeat + 1): -1]
+    # Columns are the eigenvectors spanning the reduced space.
+    redEigVects = eigVects[:, eigValInd]
+    # Matrix product: project the centred data into the reduced space.
+    lowDDataMat = meanRemoved * redEigVects
+    # The kept eigenvectors are orthonormal, so the transpose maps the
+    # projection back; adding the means undoes the centring.
+    reconMat = (lowDDataMat * redEigVects.T) + meanVals
+    return lowDDataMat, reconMat
+
+
+"""
+函数说明：缺失值处理函数
+
+Parameters:
+    None
+    
+Returns:
+    datMat - 处理后的数据集
+
+Modify:
+    2018-08-07
+"""
+def replaceNaNWithMean():
+    """Load 'secom.data' and replace every NaN with its column's mean.
+
+    Parameters:
+        None
+
+    Returns:
+        datMat - the loaded matrix with each NaN replaced by the mean of
+                 the non-NaN values of the same column, or by 0 when the
+                 column holds no valid value at all
+    """
+    # The file is space-separated.
+    datMat = loadDataSet('secom.data', ' ')
+    numFeat = np.shape(datMat)[1]
+    for i in range(numFeat):
+        # The mask must come from column i itself: masking with a fixed
+        # column lets this column's NaNs leak into the mean and turn it NaN.
+        nanMask = np.isnan(datMat[:, i].A[:, 0])
+        validRows = np.nonzero(~nanMask)[0]
+        # A column without any valid value has no mean; fall back to 0.
+        meanVal = np.mean(datMat[validRows, i]) if validRows.size else 0
+        datMat[np.nonzero(nanMask)[0], i] = meanVal
+    return datMat
+
+
+if __name__ == '__main__':
+    dataMat = replaceNaNWithMean()
+    # Number of leading components to plot and to keep.
+    i = 6
+    # Eigenvalues of the covariance matrix of the centred data. The matrix
+    # is symmetric, so eigvalsh applies: it returns real values in ascending
+    # order, reversed here so the largest eigenvalues come first.
+    meanVals = np.mean(dataMat, axis=0)
+    meanRemoved = dataMat - meanVals
+    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=0))
+    eigVals = np.linalg.eigvalsh(covMat)[::-1]
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(range(i), eigVals[:i], marker='^', s=50, c='red')
+    ax.plot(range(i), eigVals[:i])
+    lowDmat, reconMat = pca(dataMat, topNfeat=i)
+    # The data projected onto the i kept components.
+    print(lowDmat)
+    # Display the eigenvalue plot when run as a plain script.
+    plt.show()
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 28 10:31:10 2018
+
+@author: wzy
+"""
+# 二维数据集决策边界可视化
+from matplotlib.colors import ListedColormap
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+
+"""
+类说明：构建感知机
+
+Parameters:
+    eta - 学习率(0,1]
+    n_iter - 迭代次数
+    w_ - 训练后的权重数组
+    errors_ - 每轮训练后的误差
+    
+Returns:
+    None
+
+Modify:
+    2018-08-28
+"""
+class Perceptron(object):
+    """Perceptron classifier trained with per-sample weight updates.
+
+    Parameters:
+        eta - learning rate
+        n_iter - number of passes over the training set
+
+    Attributes set by fit():
+        w_ - weight array; w_[0] is the bias term, w_[1:] the feature weights
+        errors_ - number of weight updates made in each pass
+    """
+
+    def __init__(self, eta=0.01, n_iter=10):
+        self.n_iter = n_iter
+        self.eta = eta
+
+    def fit(self, X, y):
+        """Fit the weights to the training data.
+
+        Parameters:
+            X - array of shape [n_samples, n_features], the training vectors
+            y - array-like of shape [n_samples], the target values
+
+        Returns:
+            self
+        """
+        # One weight per feature plus a leading bias term, all starting at 0.
+        n_features = X.shape[1]
+        self.w_ = np.zeros(n_features + 1)
+        self.errors_ = []
+        for _epoch in range(self.n_iter):
+            misclassified = 0
+            for sample, label in zip(X, y):
+                # Zero when the sample is already classified correctly.
+                step = self.eta * (label - self.predict(sample))
+                self.w_[0] += step
+                self.w_[1:] += step * sample
+                if step != 0.0:
+                    misclassified += 1
+            self.errors_.append(misclassified)
+        return self
+
+    def net_input(self, X):
+        """Return the dot product of X and the feature weights, plus the bias."""
+        bias, weights = self.w_[0], self.w_[1:]
+        return np.dot(X, weights) + bias
+
+    def predict(self, X):
+        """Return 1 where the net input is >= 0.0 and -1 elsewhere."""
+        activation = self.net_input(X)
+        return np.where(activation >= 0.0, 1, -1)
+
+
+"""
+函数说明：导入数据集
+
+Parameters:
+    None
+    
+Returns:
+    X - 特征矩阵
+    y - label列向量
+
+Modify:
+    2018-08-28
+"""
+def DataSet():
+    """Download the Iris data, plot two of its features and return them.
+
+    Only the first 100 rows of the file are used.
+
+    Parameters:
+        None
+
+    Returns:
+        X - feature matrix built from columns 0 and 2 of those rows
+        y - label vector: -1 for 'Iris-setosa', 1 for every other label
+    """
+    # Load the Iris data set from the UCI repository; the file has no header.
+    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
+    # Column 4 holds the class label; iloc selects by integer position.
+    y = df.iloc[0:100, 4].values
+    y = np.where(y == 'Iris-setosa', -1, 1)
+    # Features 0 and 2.
+    X = df.iloc[0:100, [0, 2]].values
+    # Rows 0-49 are plotted as setosa, rows 50-99 as versicolor.
+    plt.scatter(X[:50, 0], X[:50, 1], color='red', marker='o', label='setosa')
+    plt.scatter(X[50:100, 0], X[50:100, 1], color='blue', marker='x', label='versicolor')
+    # The horizontal axis shows column 0 (sepal length) and the vertical axis
+    # column 2 (petal length), matching the labels the main script uses for
+    # the same feature matrix.
+    plt.xlabel('sepal length')
+    plt.ylabel('petal length')
+    plt.legend(loc='upper left')
+    plt.show()
+    return X, y
+
+
+"""
+函数说明：绘制迭代次数与误分点个数之间的关系
+
+Parameters:
+    None
+    
+Returns:
+    None
+
+Modify:
+    2018-08-28
+"""
+def NumOfErrors():
+    """Train a perceptron on the DataSet() samples and plot errors per epoch.
+
+    Parameters:
+        None
+
+    Returns:
+        None
+    """
+    features, labels = DataSet()
+    # fit() returns the estimator itself, so construction and training chain.
+    model = Perceptron(eta=0.1, n_iter=10).fit(features, labels)
+    # Epochs are numbered from 1 on the horizontal axis.
+    epochs = range(1, len(model.errors_) + 1)
+    plt.plot(epochs, model.errors_, marker='o')
+    plt.xlabel('Epochs')
+    plt.ylabel('Number of misclassifications')
+    plt.show()
+    
+
+"""
+函数说明：绘制决策区域图像
+
+Parameters:
+    X - 特征矩阵
+    y - label列向量
+    classifier - 分类器
+    resolution - 采样间隔为0.02
+    
+Returns:
+    None
+
+Modify:
+    2018-08-28
+"""
+def plot_decision_regions(X, y, classifier, resolution=0.02):
+    """Draw a classifier's decision regions together with the samples.
+
+    Parameters:
+        X - feature matrix; columns 0 and 1 become the two plot axes
+        y - label vector, one entry per row of X
+        classifier - object whose predict() labels an array of 2-D points
+        resolution - grid spacing used to sample the plane, 0.02 by default
+
+    Returns:
+        None
+    """
+    markers = ('s', 'x', 'o', '^', 'v')
+    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
+    # np.unique returns the distinct labels in sorted order.
+    classes = np.unique(y)
+    # One discrete colour per class.
+    cmap = ListedColormap(colors[:len(classes)])
+    # Pad both axes by 1 so no sample sits on the plot border.
+    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
+    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
+    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
+                           np.arange(x2_min, x2_max, resolution))
+    # Classify every grid point, then restore the grid shape for contourf.
+    z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
+    z = z.reshape(xx1.shape)
+    plt.contourf(xx1, xx2, z, alpha=0.4, cmap=cmap)
+    plt.xlim(xx1.min(), xx1.max())
+    plt.ylim(xx2.min(), xx2.max())
+    for idx, cl in enumerate(classes):
+        # The colour is wrapped in a list (a single-row 2-D array) so
+        # matplotlib cannot mistake the RGBA tuple for four data values when
+        # a class has exactly four samples. Markers cycle so that more
+        # classes than markers does not raise IndexError.
+        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
+                    c=[cmap(idx)], marker=markers[idx % len(markers)], label=cl)
+        
+
+if __name__ == '__main__':
+    # Load the two-feature Iris subset (DataSet also shows a scatter plot).
+    X, y = DataSet()
+    # Build and train the perceptron in one step; fit() returns the estimator.
+    ppn = Perceptron(eta=0.1, n_iter=10).fit(X, y)
+    plot_decision_regions(X, y, ppn)
+    # Horizontal axis: sepal length; vertical axis: petal length.
+    plt.xlabel('sepal length [cm]')
+    plt.ylabel('petal length [cm]')
+    plt.legend(loc='upper left')
+    plt.show()
+