
Commit ea4473e

finished recursion!
1 parent 84311cc commit ea4473e

10 files changed: +223 -80 lines changed

MDP.m

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
+function Policy = MDP(state, noise, S, A, dt)
+% Returns the optimal policy for the inverted pendulum,
+% given a set of states S and a set of possible actions A.
+% O(A * S^2)
+
+numStates = state.numStates;
+discount = 0.9;
+
+% Generate all possible state vectors
+[Thetas, ThetaDots] = meshgrid(S(1,:), S(2,:));
+vS = [reshape(Thetas, 1, numel(Thetas)); reshape(ThetaDots, 1, numel(ThetaDots))];
+
+for i = 1:length(vS)
+  s = vS(:,i);
+  Policy(:,i) = s;
+  bestActions(:,i) = VStar(discount, state, noise, S, vS, A, dt, s);
+end
+
+% Rows 1-2 hold the state, row 3 the best action for that state
+Policy = [Policy; bestActions];
+
+end
+
+function a = VStar(discount, state, noise, S, vS, A, dt, s)
+% Given a state s, compute the expected value of every action over every
+% possible future state. Return the action with the highest expected value.
+
+for i = 1:length(A)
+
+  % Commit to action a
+  a = A(1, i);
+  R(1,i) = a;
+  R(2,i) = 0;
+  depth = 0;
+
+  % Compute the next state for the chosen action
+  sPrime = simulateOneStep(s(1,1), s(2,1), dt, a);
+  sPrime = mapToDiscreteValue(S, sPrime);
+  T = transitionProbabilities(S, sPrime, state, noise);
+  for j = 1:length(vS)
+    sPrime = vS(:,j);
+    psPrime = getTransitionProbability(T, vS, sPrime);
+    % Bellman equation: expected sum of future rewards
+    if abs(sPrime(1,1)) >= 45
+      R(2,i) += psPrime * getReward(sPrime);
+    else
+      R(2,i) += psPrime * (getReward(sPrime) + ...
+        discount * QStar(depth + 1, discount, state, noise, S, vS, A, dt, sPrime));
+    end
+  end
+end
+
+% Pick the action with the largest expected value
+a = R(1,1);
+maxIndex = 1;
+for i = 2:length(R)
+  if R(2,i) > R(2, maxIndex)
+    a = R(1,i);
+    maxIndex = i;
+  end
+end
+
+end
+
+
+function r = QStar(depth, discount, state, noise, S, vS, A, dt, s)
+% Given a state, accumulate the expected discounted rewards over every
+% action and every possible future state, up to a fixed recursion depth.
+r = 0;
+
+% Cut off the recursion at a fixed depth
+if depth >= 5
+  return;
+end
+
+for i = 1:length(A)
+  a = A(1, i);
+  sPrime = simulateOneStep(s(1,1), s(2,1), dt, a);
+  sPrime = mapToDiscreteValue(S, sPrime);
+  T = transitionProbabilities(S, sPrime, state, noise);
+
+  for j = 1:length(vS)
+    sPrime = vS(:,j);
+    psPrime = getTransitionProbability(T, vS, sPrime);
+    % Bellman equation: expected sum of future rewards
+    if abs(sPrime(1,1)) >= 45
+      r += psPrime * getReward(sPrime);
+    else
+      r += psPrime * (getReward(sPrime) + ...
+        discount * QStar(depth + 1, discount, state, noise, S, vS, A, dt, sPrime));
+    end
+  end
+end
+end
+
+function ps = getTransitionProbability(T, S, sPrime)
+% Given a state and transition matrix, return the transition probability
+% for that state
+ps = 0;
+
+for i = 1:length(S)
+  if sPrime(1,1) == S(1,i) && sPrime(2,1) == S(2,i)
+    ps = T(1,i) * T(2,i);
+  end
+end
+
+end
+
+function sPrime = mapToDiscreteValue(S, sPrime)
+% Map the real value given to us by the pendulum dynamics function
+% to a discrete state
+
+theta = sPrime(1,1);
+thetaDot = sPrime(2,1);
+
+if theta <= S(1,1)
+  theta = S(1,1);
+elseif theta >= S(1, length(S))
+  theta = S(1,length(S));
+else
+
+  for i = 1:length(S) - 1
+    if theta >= S(1,i) && theta <= S(1,i + 1)
+      left = theta - S(1,i);
+      right = S(1,i + 1) - theta;
+
+      if (left < right)
+        theta = S(1,i);
+      else
+        theta = S(1,i+1);
+      end
+      break;
+    end
+  end
+end
+
+if thetaDot <= S(2,1)
+  thetaDot = S(2,1);
+elseif thetaDot >= S(2, length(S))
+  thetaDot = S(2,length(S));
+else
+
+  for i = 1:length(S) - 1
+    if thetaDot >= S(2,i) && thetaDot <= S(2,i + 1)
+      left = thetaDot - S(2,i);
+      right = S(2,i + 1) - thetaDot;
+
+      if (left < right)
+        thetaDot = S(2,i);
+      else
+        thetaDot = S(2,i+1);
+      end
+
+      break;
+    end
+  end
+end
+
+sPrime = [theta;thetaDot];
+
+end
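
For reference, VStar and QStar above form a depth-limited approximation of the Bellman optimality equation, with rewards from getReward.m, transition probabilities from transitionProbabilities.m, and the discount factor of 0.9 set in MDP (the recursion is cut off at depth 5, and QStar accumulates over all actions rather than maximizing). In LaTeX notation, the equation being approximated is

    V^*(s) = \max_{a \in A} \sum_{s' \in vS} P(s' \mid s, a)\,\bigl[ R(s') + \gamma\, V^*(s') \bigr], \qquad \gamma = 0.9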

VStar.m

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+function bestAction = vStar(s)
+  bestAction = 1
+end

addNoise.m

Lines changed: 0 additions & 10 deletions
This file was deleted.

getReward.m

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+function r = getReward(s)
+theta = s(1,1);
+thetaDot = s(2,1);
+r = 0;
+
+if theta == 0
+  r = 2;
+elseif theta > 0 && thetaDot < 0
+  r = 1;
+elseif theta < 0 && thetaDot > 0
+  r = 1;
+elseif theta > 45 || theta < -45
+  r = -10;
+end
+
+end
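
A quick sanity check of the reward shaping above (theta appears to be treated in degrees, matching the ±45 cutoff); the expected values follow directly from the branches:

getReward([0; 0])      % upright: 2
getReward([10; -1])    % tilted right, swinging back toward upright: 1
getReward([-10; 1])    % tilted left, swinging back toward upright: 1
getReward([10; 1])     % tilted and still moving away: 0
getReward([50; 0])     % past the +/-45 failure bound: -10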

initializeMDP.m

Lines changed: 0 additions & 34 deletions
This file was deleted.

main.m

Lines changed: 38 additions & 27 deletions
@@ -1,53 +1,64 @@
-deltaT = 0.1; % Seconds
-maxIterations = 500;
+% Time step size
+dt = 0.1; % Seconds
+maxIterations = 100;
 
+% Number of dimensions the state vector is in
 dimensions = 2;
 
+% Noise contains all noise parameters
 noise.mu = zeros(dimensions, 1);
 noise.covariance = eye(dimensions) * 0.1;
 
+% State is a struct containing all state parameters
 state.stateBounds = [-pi/4, pi/4; -1, 1];
-state.numStates = 3;
+state.numStates = 10;
 
+% Calculate the step size between each upper and lower bound
 for dimension = 1:dimensions
   % Step size is (outer bound - inner bound) / number of states
   state.stepSize(dimension, 1) = (...
     (state.stateBounds(dimension, 2) - state.stateBounds(dimension, 1))/...
     state.numStates);
 end
 
+% Set of actions
+A = [-2, -1, 0, 1, 2];
 
-A = [-2, -1, 0, 1, 2]; % set of actions
+% Set of states
+S = [linspace(-pi/4, pi/4, state.numStates); linspace(-1, 1, state.numStates)];
 
+% Policy has one column per discrete state and holds the optimal action a
+% for the state s in that column (rows 1-2: state, row 3: action)
+Policy = MDP(state, noise, S, A, dt)
 
-S = [linspace(-pi/4, pi/4, state.numStates); linspace(-1, 1, state.numStates)]
-sPrime = [pi/4; 1]
-
-transitionProbabilities(S, sPrime, state, noise)
-
-%[S, A, T, R] = initializeMDP(state, A, deltaT);
-
-discountFactor = 0.9; % discount
-convergance = 0.001; % stop after gamma * Q*(s) == 0.001
-
-
-%theta = 0.1;
-%thetaDot = 0;
-%data(1,1) = theta;
-%data(1,2) = thetaDot;
+theta = 30;
+thetaDot = 0;
+data(1,1) = theta;
+data(1,2) = thetaDot;
+data(1,3) = getReward([theta;thetaDot]);
 
 %u = 6.6345;
-%deltaT = 0.1;
+
+% Look up the action for the current discrete state in the policy
+for i = 1:length(Policy)
+  if theta == Policy(1,i) && thetaDot == Policy(2,i)
+    u = Policy(3,i);
+  end
+end
+deltaT = 0.1;
 
 %for i = 2:maxIterations
-%[theta, thetaDot] = simulateOneStep(theta, thetaDot, deltaT, u);
-%data(i,1) = theta;
-%data(i,2) = thetaDot;
+%data(i, 3) = getReward([theta;thetaDot]);
+%sPrime = simulateOneStep(theta, thetaDot, deltaT, u);
+%theta = sPrime(1,1);
+%thetaDot = sPrime(2,1);
+%data(i,1) = theta;
+%data(i,2) = thetaDot;
 %end
 
-%%data
+%data
 
-%setenv("GNUTERM","qt")
+%setenv("GNUTERM","qt");
 %figure('Position',[0,0,1300,700]);
-%plot(data)
-%pause()
+%h = plot(data, 'linewidth', 2);
+%set(gca, "linewidth", 4, "fontsize", 12)
+%title("Inverted Pendulum controlled with MDP");
+%legend('Theta', 'ThetaDot', 'Reward');
+%pause();
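
For the parameters above (stateBounds of ±pi/4 and ±1, numStates = 10), the step-size loop works out to concrete values; a quick check at the Octave prompt:

(pi/4 - (-pi/4)) / 10    % theta step:    pi/20 ≈ 0.1571 rad
(1 - (-1)) / 10          % thetaDot step: 0.2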

octave-workspace

11 Bytes
Binary file not shown.

simulateOneStep.m

Lines changed: 5 additions & 1 deletion
@@ -1,9 +1,10 @@
-function [thetaN,thetadotN] = simulateOneStep(theta,thetaDot,deltaT,u)
+function sPrime = simulateOneStep(theta,thetaDot,deltaT,u)
 J = m = l = 1;
 gamma = 0.1;
 g = 9.81;
 Jt = J + m * l ^ 2;
 
+% TODO remove this final result
 if(theta > 0)
   u = -u;
 end
@@ -13,6 +14,7 @@
 thetadotN = thetaDot + deltaT * angularAcceleration; % v = v_0 + angularAcceleration * t
 thetaN = theta + thetadotN * deltaT;
 
+% Make sure theta stays within the allowable range
 while thetaN > 3.14
   thetaN -= 2 * 3.14;
 end
@@ -21,4 +23,6 @@
   thetaN += 2 * 3.14;
 end
 
+sPrime = [thetaN; thetadotN];
+
 end
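
The new column-vector return is what the callers added in MDP.m expect; a minimal call sketch (the argument values are illustrative, and the dynamics line itself is outside these hunks):

s = [0.1; 0];                                       % [theta; thetaDot]
sPrime = simulateOneStep(s(1,1), s(2,1), 0.1, 1);   % returns [thetaN; thetadotN]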

simulateSequence.m

Lines changed: 0 additions & 8 deletions
This file was deleted.

transitionProbabilities.m

Lines changed: 4 additions & 0 deletions
@@ -55,6 +55,10 @@
   sum(d, 1) += 1 - sum(d, 1);
 end
 
+[ThetaTransitions, ThetaDotTransitions] = meshgrid(T(1,:), T(2,:));
+T = [reshape(ThetaTransitions, 1, numel(ThetaTransitions));...
+     reshape(ThetaDotTransitions, 1, numel(ThetaDotTransitions))];
+
 end
 
 function p = getLeftProbability(x, mu, variance)
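
The added lines flatten the per-dimension transition probabilities with the same meshgrid/reshape pattern used to build vS in MDP.m, so column i of the reshaped T lines up with the state vS(:,i) and getTransitionProbability can take T(1,i) * T(2,i) as the joint probability (assuming the two dimensions are independent). A small illustration with made-up values on a 2-point grid:

T = [0.7, 0.3;    % P(theta'    = S(1,k)), k = 1,2
     0.4, 0.6];   % P(thetaDot' = S(2,k)), k = 1,2
[Tt, Td] = meshgrid(T(1,:), T(2,:));
T = [reshape(Tt, 1, numel(Tt)); reshape(Td, 1, numel(Td))]
% -> [0.7 0.7 0.3 0.3; 0.4 0.6 0.4 0.6]; each column's product is a joint probability and the products sum to 1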
