
Commit ea4473e

finished recursion!
1 parent 84311cc commit ea4473e

10 files changed: +223 -80 lines changed

MDP.m

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
+function Policy = MDP(state, noise, S, A, dt)
+% Returns the optimal policy for the inverted pendulum,
+% given a set of states S and a set of possible actions A.
+% O(A * S^2)
+
+numStates = state.numStates;
+discount = 0.9;
+
+% Generate all possible state vectors
+[Thetas, ThetaDots] = meshgrid(S(1,:), S(2,:));
+vS = [reshape(Thetas, 1, numel(Thetas)); reshape(ThetaDots, 1, numel(ThetaDots))];
+
+for i = 1:length(vS)
+  s = vS(:,i);
+  Policy(:,i) = s;
+  bestActions(:,i) = VStar(discount, state, noise, S, vS, A, dt, s);
+end
+
+% Rows 1-2 hold the state, row 3 the best action for that state
+Policy = [Policy; bestActions];
+
+end
+
+function a = VStar(discount, state, noise, S, vS, A, dt, s)
+% Given a state s, compute the expected value of every action over every
+% possible future state. Return the action with the highest expected value.
+
+for i = 1:length(A)
+
+  % Commit to action a
+  a = A(1, i);
+  R(1,i) = a;
+  R(2,i) = 0;
+  depth = 0;
+
+  % Compute the next state for the chosen action
+  sPrime = simulateOneStep(s(1,1), s(2,1), dt, a);
+  sPrime = mapToDiscreteValue(S, sPrime);
+  T = transitionProbabilities(S, sPrime, state, noise);
+  for j = 1:length(vS)
+    sPrime = vS(:,j);
+    psPrime = getTransitionProbability(T, vS, sPrime);
+    % Bellman equation: expected sum of future rewards
+    if abs(sPrime(1,1)) >= 45
+      R(2,i) += psPrime * getReward(sPrime);
+    else
+      R(2,i) += psPrime * (getReward(sPrime) + ...
+        discount * QStar(depth + 1, discount, state, noise, S, vS, A, dt, sPrime));
+    end
+  end
+end
+
+% Pick the action with the largest expected value
+a = R(1,1);
+maxIndex = 1;
+for i = 2:length(R)
+  if R(2,i) > R(2, maxIndex)
+    a = R(1,i);
+    maxIndex = i;
+  end
+end
+
+end
+
+
+function r = QStar(depth, discount, state, noise, S, vS, A, dt, s)
+% Given a state, accumulate the expected discounted rewards over every
+% action and every possible future state, up to a fixed recursion depth.
+r = 0;
+
+% Cut off the recursion at a fixed depth
+if depth >= 5
+  return;
+end
+
+for i = 1:length(A)
+  a = A(1, i);
+  sPrime = simulateOneStep(s(1,1), s(2,1), dt, a);
+  sPrime = mapToDiscreteValue(S, sPrime);
+  T = transitionProbabilities(S, sPrime, state, noise);
+
+  for j = 1:length(vS)
+    sPrime = vS(:,j);
+    psPrime = getTransitionProbability(T, vS, sPrime);
+    % Bellman equation: expected sum of future rewards
+    if abs(sPrime(1,1)) >= 45
+      r += psPrime * getReward(sPrime);
+    else
+      r += psPrime * (getReward(sPrime) + ...
+        discount * QStar(depth + 1, discount, state, noise, S, vS, A, dt, sPrime));
+    end
+  end
+end
+end
+
+function ps = getTransitionProbability(T, S, sPrime)
+% Given a state and transition matrix, return the transition probability
+% for that state
+ps = 0;
+
+for i = 1:length(S)
+  if sPrime(1,1) == S(1,i) && sPrime(2,1) == S(2,i)
+    ps = T(1,i) * T(2,i);
+  end
+end
+
+end
+
+function sPrime = mapToDiscreteValue(S, sPrime)
+% Map the real value given to us by the pendulum dynamics function
+% to a discrete state
+
+theta = sPrime(1,1);
+thetaDot = sPrime(2,1);
+
+if theta <= S(1,1)
+  theta = S(1,1);
+elseif theta >= S(1, length(S))
+  theta = S(1,length(S));
+else
+
+  for i = 1:length(S) - 1
+    if theta >= S(1,i) && theta <= S(1,i + 1)
+      left = theta - S(1,i);
+      right = S(1,i + 1) - theta;
+
+      if (left < right)
+        theta = S(1,i);
+      else
+        theta = S(1,i+1);
+      end
+      break;
+    end
+  end
+end
+
+if thetaDot <= S(2,1)
+  thetaDot = S(2,1);
+elseif thetaDot >= S(2, length(S))
+  thetaDot = S(2,length(S));
+else
+
+  for i = 1:length(S) - 1
+    if thetaDot >= S(2,i) && thetaDot <= S(2,i + 1)
+      left = thetaDot - S(2,i);
+      right = S(2,i + 1) - thetaDot;
+
+      if (left < right)
+        thetaDot = S(2,i);
+      else
+        thetaDot = S(2,i+1);
+      end
+
+      break;
+    end
+  end
+end
+
+sPrime = [theta;thetaDot];
+
+end
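
For reference, VStar and QStar above form a depth-limited approximation of the Bellman optimality equation, with rewards from getReward.m, transition probabilities from transitionProbabilities.m, and the discount factor of 0.9 set in MDP (the recursion is cut off at depth 5, and QStar accumulates over all actions rather than maximizing). In LaTeX notation, the equation being approximated is

    V^*(s) = \max_{a \in A} \sum_{s' \in vS} P(s' \mid s, a)\,\bigl[ R(s') + \gamma\, V^*(s') \bigr], \qquad \gamma = 0.9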

VStar.m

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+function bestAction = vStar(s)
+  bestAction = 1
+end

addNoise.m

Lines changed: 0 additions & 10 deletions
This file was deleted.

getReward.m

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+function r = getReward(s)
+theta = s(1,1);
+thetaDot = s(2,1);
+r = 0;
+
+if theta == 0
+  r = 2;
+elseif theta > 0 && thetaDot < 0
+  r = 1;
+elseif theta < 0 && thetaDot > 0
+  r = 1;
+elseif theta > 45 || theta < -45
+  r = -10;
+end
+
+end
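
A quick sanity check of the reward shaping above (theta appears to be treated in degrees, matching the ±45 cutoff); the expected values follow directly from the branches:

getReward([0; 0])      % upright: 2
getReward([10; -1])    % tilted right, swinging back toward upright: 1
getReward([-10; 1])    % tilted left, swinging back toward upright: 1
getReward([10; 1])     % tilted and still moving away: 0
getReward([50; 0])     % past the +/-45 failure bound: -10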

initializeMDP.m

Lines changed: 0 additions & 34 deletions
This file was deleted.

main.m

Lines changed: 38 additions & 27 deletions
@@ -1,53 +1,64 @@
-deltaT = 0.1; % Seconds
-maxIterations = 500;
+% Time step size
+dt = 0.1; % Seconds
+maxIterations = 100;
 
+% Number of dimensions the state vector is in
 dimensions = 2;
 
+% Noise contains all noise parameters
 noise.mu = zeros(dimensions, 1);
 noise.covariance = eye(dimensions) * 0.1;
 
+% State is a struct containing all state parameters
 state.stateBounds = [-pi/4, pi/4; -1, 1];
-state.numStates = 3;
+state.numStates = 10;
 
+% Calculate the step size between each upper and lower bound
 for dimension = 1:dimensions
   % Step size is (outer bound - inner bound) / number of states
   state.stepSize(dimension, 1) = (...
     (state.stateBounds(dimension, 2) - state.stateBounds(dimension, 1))/...
     state.numStates);
 end
 
+% Set of actions
+A = [-2, -1, 0, 1, 2];
 
-A = [-2, -1, 0, 1, 2]; % set of actions
+% Set of states
+S = [linspace(-pi/4, pi/4, state.numStates); linspace(-1, 1, state.numStates)];
 
+% Policy has one column per discrete state and holds the optimal action a
+% for the state s in that column (rows 1-2: state, row 3: action)
+Policy = MDP(state, noise, S, A, dt)
 
-S = [linspace(-pi/4, pi/4, state.numStates); linspace(-1, 1, state.numStates)]
-sPrime = [pi/4; 1]
-
-transitionProbabilities(S, sPrime, state, noise)
-
-%[S, A, T, R] = initializeMDP(state, A, deltaT);
-
-discountFactor = 0.9; % discount
-convergance = 0.001; % stop after gamma * Q*(s) == 0.001
-
-
-%theta = 0.1;
-%thetaDot = 0;
-%data(1,1) = theta;
-%data(1,2) = thetaDot;
+theta = 30;
+thetaDot = 0;
+data(1,1) = theta;
+data(1,2) = thetaDot;
+data(1,3) = getReward([theta;thetaDot]);
 
 %u = 6.6345;
-%deltaT = 0.1;
+
+% Look up the action for the current discrete state in the policy
+for i = 1:length(Policy)
+  if theta == Policy(1,i) && thetaDot == Policy(2,i)
+    u = Policy(3,i);
+  end
+end
+deltaT = 0.1;
 
 %for i = 2:maxIterations
-%[theta, thetaDot] = simulateOneStep(theta, thetaDot, deltaT, u);
-%data(i,1) = theta;
-%data(i,2) = thetaDot;
+%data(i, 3) = getReward([theta;thetaDot]);
+%sPrime = simulateOneStep(theta, thetaDot, deltaT, u);
+%theta = sPrime(1,1);
+%thetaDot = sPrime(2,1);
+%data(i,1) = theta;
+%data(i,2) = thetaDot;
 %end
 
-%%data
+%data
 
-%setenv("GNUTERM","qt")
+%setenv("GNUTERM","qt");
 %figure('Position',[0,0,1300,700]);
-%plot(data)
-%pause()
+%h = plot(data, 'linewidth', 2);
+%set(gca, "linewidth", 4, "fontsize", 12)
+%title("Inverted Pendulum controlled with MDP");
+%legend('Theta', 'ThetaDot', 'Reward');
+%pause();
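
For the parameters above (stateBounds of ±pi/4 and ±1, numStates = 10), the step-size loop works out to concrete values; a quick check at the Octave prompt:

(pi/4 - (-pi/4)) / 10    % theta step:    pi/20 ≈ 0.1571 rad
(1 - (-1)) / 10          % thetaDot step: 0.2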

octave-workspace

11 Bytes
Binary file not shown.

simulateOneStep.m

Lines changed: 5 additions & 1 deletion
@@ -1,9 +1,10 @@
-function [thetaN,thetadotN] = simulateOneStep(theta,thetaDot,deltaT,u)
+function sPrime = simulateOneStep(theta,thetaDot,deltaT,u)
 J = m = l = 1;
 gamma = 0.1;
 g = 9.81;
 Jt = J + m * l ^ 2;
 
+% TODO remove this final result
 if(theta > 0)
   u = -u;
 end
@@ -13,6 +14,7 @@
 thetadotN = thetaDot + deltaT * angularAcceleration; % v = v_0 + angularAcceleration * t
 thetaN = theta + thetadotN * deltaT;
 
+% Make sure theta stays within the allowable range
 while thetaN > 3.14
   thetaN -= 2 * 3.14;
 end
@@ -21,4 +23,6 @@
   thetaN += 2 * 3.14;
 end
 
+sPrime = [thetaN; thetadotN];
+
 end
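
The new column-vector return is what the callers added in MDP.m expect; a minimal call sketch (the argument values are illustrative, and the dynamics line itself is outside these hunks):

s = [0.1; 0];                                       % [theta; thetaDot]
sPrime = simulateOneStep(s(1,1), s(2,1), 0.1, 1);   % returns [thetaN; thetadotN]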

simulateSequence.m

Lines changed: 0 additions & 8 deletions
This file was deleted.

transitionProbabilities.m

Lines changed: 4 additions & 0 deletions
@@ -55,6 +55,10 @@
   sum(d, 1) += 1 - sum(d, 1);
 end
 
+[ThetaTransitions, ThetaDotTransitions] = meshgrid(T(1,:), T(2,:));
+T = [reshape(ThetaTransitions, 1, numel(ThetaTransitions));...
+     reshape(ThetaDotTransitions, 1, numel(ThetaDotTransitions))];
+
 end
 
 function p = getLeftProbability(x, mu, variance)
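
The added lines flatten the per-dimension transition probabilities with the same meshgrid/reshape pattern used to build vS in MDP.m, so column i of the reshaped T lines up with the state vS(:,i) and getTransitionProbability can take T(1,i) * T(2,i) as the joint probability (assuming the two dimensions are independent). A small illustration with made-up values on a 2-point grid:

T = [0.7, 0.3;    % P(theta'    = S(1,k)), k = 1,2
     0.4, 0.6];   % P(thetaDot' = S(2,k)), k = 1,2
[Tt, Td] = meshgrid(T(1,:), T(2,:));
T = [reshape(Tt, 1, numel(Tt)); reshape(Td, 1, numel(Td))]
% -> [0.7 0.7 0.3 0.3; 0.4 0.6 0.4 0.6]; each column's product is a joint probability and the products sum to 1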
