Skip to content

Commit f108a70

Browse files
Merge pull request apachecn#230 from apachecn/dev
定期合并 - Dev
2 parents 8ed0400 + 79e1124 commit f108a70

File tree

3 files changed

+25
-34
lines changed

3 files changed

+25
-34
lines changed

competitions/getting-started/house-price/README.md

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1929,6 +1929,10 @@ mode_br.fit(x_train, y_train)
19291929
y_test = np.expm1(mode_br.predict(x_test))
19301930
```
19311931

1932+
## 四 建立模型
1933+
1934+
> 模型融合 voting
1935+
19321936
```python
19331937
# 模型融合
19341938
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
@@ -1989,15 +1993,3 @@ result['SalePrice'] = ensemble
19891993
# index=False 是用来除去行编号
19901994
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
19911995
```
1992-
1993-
Id SalePrice
1994-
0 1461 110469.586157
1995-
1 1462 148368.953437
1996-
2 1463 172697.673678
1997-
3 1464 189844.587562
1998-
4 1465 207009.716532
1999-
5 1466 188820.407208
2000-
6 1467 163107.556014
2001-
7 1468 180732.346459
2002-
8 1469 194841.804925
2003-
9 1470 110570.281362

src/python/getting-started/digit-recognizer/knn-python3.6.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
import pandas as pd
1515
from sklearn.decomposition import PCA
1616
from sklearn.neighbors import KNeighborsClassifier
17-
import sys
1817

19-
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
18+
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
19+
2020

2121
# 加载数据
2222
def opencsv():
@@ -31,18 +31,15 @@ def opencsv():
3131

3232

3333
def saveResult(result, csvName):
34-
with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
34+
with open(csvName, 'w') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
3535
# python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
36-
myWriter = csv.writer(myFile) # 对文件执行写入
37-
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名
36+
myWriter = csv.writer(myFile)
37+
myWriter.writerow(["ImageId", "Label"])
3838
index = 0
39-
for i in result:
40-
tmp = []
41-
index = index + 1
42-
tmp.append(index)
43-
# tmp.append(i)
44-
tmp.append(int(i)) # 测试集的标签值
45-
myWriter.writerow(tmp)
39+
for r in result:
40+
index += 1
41+
myWriter.writerow([index, int(r)])
42+
print('Saved successfully...') # 保存预测结果
4643

4744

4845
def knnClassify(trainData, trainLabel):
@@ -95,7 +92,7 @@ def dRecognition_knn():
9592

9693
# 结果预测
9794
testLabel = knnClf.predict(testData)
98-
95+
9996
# 结果的输出
10097
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv'))
10198
print("finish!")

src/python/getting-started/digit-recognizer/svm-python3.6.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
Update on 2017-10-26
77
Author: 片刻
88
Github: https://github.com/apachecn/kaggle
9-
PCA主成成分分析
109
'''
1110

1211
import os.path
@@ -21,7 +20,8 @@
2120
from sklearn.model_selection import train_test_split
2221

2322
# 数据路径
24-
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
23+
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
24+
2525

2626
# 加载数据
2727
def opencsv():
@@ -61,7 +61,6 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
6161
return pcaTrainData, pcaTestData, pcaPreData
6262

6363

64-
6564
# 训练模型
6665
def trainModel(trainData, trainLabel):
6766
print('Train SVM...')
@@ -85,20 +84,20 @@ def saveResult(result, csvName):
8584
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
8685
def analyse_data(dataMat):
8786
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
88-
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
87+
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
8988
# 计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
9089
# cov(X,0) = cov(X) 除数是n-1(n为样本个数)
9190
# cov(X,1) 除数是n
92-
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
91+
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
9392
# np.mat 是用来生成一个矩阵的
9493
# 保存特征值(eigvals)和对应的特征向量(eigVects)
95-
eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中
96-
eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值
94+
eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中
95+
eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值
9796

98-
topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数
97+
topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数
9998

10099
# 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值,返回其对应的索引
101-
eigValInd = eigValInd[:-(topNfeat+1):-1]
100+
eigValInd = eigValInd[:-(topNfeat+1):-1]
102101

103102
# 计算特征值的总和
104103
cov_all_score = float(sum(eigvals))
@@ -184,6 +183,7 @@ def getModel(filename):
184183
fr = open(filename, 'rb')
185184
return pickle.load(fr)
186185

186+
187187
def trainDRSVM():
188188
startTime = time.time()
189189

@@ -215,6 +215,7 @@ def preDRSVM():
215215
stopTime = time.time()
216216
print('PreModel load time used:%f s' % (stopTime - startTime))
217217

218+
218219
# 数据可视化
219220
def dataVisulization(data, labels):
220221
pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维
@@ -230,6 +231,7 @@ def dataVisulization(data, labels):
230231
plt.title('MNIST visualization')
231232
plt.show()
232233

234+
233235
if __name__ == '__main__':
234236
trainData, trainLabel, preData = opencsv()
235237
dataVisulization(trainData, trainLabel)

0 commit comments

Comments
 (0)