Skip to content

Commit 20b675a

Browse files
Create linear_regression_train.py
1 parent bfd9ecd commit 20b675a

File tree

1 file changed

+143
-0
lines changed

1 file changed

+143
-0
lines changed
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# coding:UTF-8
2+
3+
import numpy as np
4+
from math import pow
5+
6+
def load_data(file_path):
    '''Load training data from a tab-separated text file.

    Each line holds the feature columns followed by the label in the
    last column; a constant 1 is prepended to every feature row as the
    bias term x0.

    input:  file_path(string): path to the training data file
    output: feature(mat): m x n feature matrix (first column all ones)
            label(mat):   m x 1 label column vector
    '''
    feature = []
    label = []
    # 'with' guarantees the handle is closed even if parsing raises
    # (original opened the file without a context manager).
    with open(file_path) as f:
        for line in f:
            lines = line.strip().split("\t")
            feature_tmp = [1]  # x0: bias term
            # range() replaces Python 2 xrange()
            for i in range(len(lines) - 1):
                feature_tmp.append(float(lines[i]))
            feature.append(feature_tmp)
            label.append(float(lines[-1]))
    return np.mat(feature), np.mat(label).T
25+
26+
def least_square(feature, label):
    '''Ordinary least squares via the normal equations.

    Solves w = (X^T X)^(-1) X^T y in closed form.

    input:  feature(mat): m x n feature matrix
            label(mat):   m x 1 label vector
    output: w(mat): n x 1 regression coefficients
    '''
    gram = feature.T * feature        # X^T X
    w = gram.I * feature.T * label    # (X^T X)^(-1) X^T y
    return w
34+
35+
def first_derivativ(feature, label, w):
    '''Gradient (first derivative) of the squared-error loss at w.

    For the loss (y - Xw)^T (y - Xw) / 2 the gradient is
    -X^T (y - Xw).  This replaces the original Python 2 element-wise
    loops (xrange, which is a NameError on Python 3) with a single
    vectorized matrix product.

    input:  feature(mat): m x n feature matrix
            label(mat):   m x 1 label vector
            w(mat):       n x 1 current coefficients
    output: g(mat): n x 1 gradient vector
    '''
    err = label - feature * w  # m x 1 residuals
    return -feature.T * err
48+
49+
def second_derivative(feature):
    '''Hessian (second derivative) of the squared-error loss.

    For least squares the Hessian is simply X^T X and does not depend
    on w; computed as one matrix product instead of the original
    Python 2 row-by-row accumulation (xrange is a NameError on
    Python 3).

    input:  feature(mat): m x n feature matrix
    output: G(mat): n x n Hessian matrix
    '''
    return feature.T * feature
61+
62+
def get_error(feature, label, w):
    '''Squared-error loss of the linear model at w.

    input:  feature(mat): m x n feature matrix
            label(mat):   m x 1 label vector
            w(mat):       n x 1 model coefficients
    output: (mat) 1 x 1 matrix holding the loss value
    '''
    residual = label - feature * w
    return residual.T * residual / 2
70+
71+
def get_min_m(feature, label, sigma, delta, d, w, g):
    '''Backtracking line search: find the smallest integer m such that
    the step sigma^m along direction d gives sufficient decrease.

    Fix over the original: the base error get_error(..., w) and the
    directional term g.T * d are loop-invariant, so they are computed
    once instead of on every iteration of the while loop.

    input:  feature(mat): feature matrix
            label(mat):   label vector
            sigma(float), delta(float): global Newton parameters
            d(mat): search direction (-G^-1 * g)
            w(mat): current coefficients
            g(mat): gradient at w
    output: m(int): smallest exponent m satisfying the decrease test
    '''
    base_error = get_error(feature, label, w)  # loop-invariant
    slope = delta * g.T * d                    # loop-invariant
    m = 0
    while True:
        step = pow(sigma, m)
        w_new = w + step * d
        left = get_error(feature, label, w_new)
        right = base_error + step * slope
        if left <= right:
            break
        m += 1
    return m
90+
91+
def newton(feature, label, iterMax, sigma, delta):
    '''Damped (global) Newton's method for least-squares regression.

    Fixes over the original: the Python 2 print statement (a syntax
    error on Python 3) is now the print() function, the "itration"
    typo in the progress message is corrected, and the Hessian X^T X
    — which does not depend on w — is computed and inverted once
    outside the loop instead of on every iteration.

    input:  feature(mat): m x n feature matrix
            label(mat):   m x 1 label vector
            iterMax(int): maximum number of iterations
            sigma(float), delta(float): line-search parameters
    output: w(mat): n x 1 fitted regression coefficients
    '''
    n = np.shape(feature)[1]
    w = np.mat(np.zeros((n, 1)))
    G_inv = second_derivative(feature).I  # Hessian inverse, w-independent
    it = 0
    while it <= iterMax:
        g = first_derivativ(feature, label, w)  # gradient at w
        d = -G_inv * g                          # Newton direction
        m = get_min_m(feature, label, sigma, delta, d, w, g)  # step exponent
        w = w + pow(sigma, m) * d
        if it % 10 == 0:
            print("\t---- iteration: ", it, " , error: ",
                  get_error(feature, label, w)[0, 0])
        it += 1
    return w
113+
114+
def save_model(file_name, w):
    '''Save the trained regression weights to a text file, one matrix
    row per line, values tab-separated.

    Fixes over the original: 'with' closes the file even on error,
    range() replaces Python 2 xrange(), and the inner append loop is
    a comprehension.

    input:  file_name(string): name of the output file
            w(mat): trained linear-regression weight matrix
    '''
    m, n = np.shape(w)
    with open(file_name, "w") as f_result:
        for i in range(m):
            row = [str(w[i, j]) for j in range(n)]
            f_result.write("\t".join(row) + "\n")
127+
128+
129+
if __name__ == "__main__":
    # 1. Load the training data set (Python 3 print() replaces the
    #    original Python 2 print statements throughout this script).
    print("----------- 1.load data ----------")
    feature, label = load_data("data.txt")
    # 2. Train the model with Newton's method; the closed-form
    #    least-squares solve is kept below as a reference alternative.
    print("----------- 2.training ----------")
    # print("\t ---------- least_square ----------")
    # w_ls = least_square(feature, label)
    print("\t ---------- newton ----------")
    w_newton = newton(feature, label, 50, 0.1, 0.5)
    # 3. Save the fitted weights.
    print("----------- 3.save result ----------")
    save_model("weights", w_newton)
143+

0 commit comments

Comments
 (0)