#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang([email protected])                        #
# 2016 Kenta Shimada([email protected])                               #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm

# world height
WORLD_HEIGHT = 4

# world width
WORLD_WIDTH = 12

# probability for exploration
EPSILON = 0.1

# step size
ALPHA = 0.5

# discount factor for Sarsa, Expected Sarsa and Q-Learning
GAMMA = 1

# all possible actions
ACTION_UP = 0
ACTION_DOWN = 1
ACTION_LEFT = 2
ACTION_RIGHT = 3
ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT]

# start and goal states
START = [3, 0]
GOAL = [3, 11]

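# take @action in @state and return the next state and reward
# moving into the cliff (row 3, columns 1-10) gives a reward of -100
# and sends the agent back to START, e.g. step([2, 5], ACTION_DOWN)
# returns (START, -100), while step([0, 0], ACTION_UP) returns ([0, 0], -1)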
def step(state, action):
    i, j = state
    if action == ACTION_UP:
        next_state = [max(i - 1, 0), j]
    elif action == ACTION_LEFT:
        next_state = [i, max(j - 1, 0)]
    elif action == ACTION_RIGHT:
        next_state = [i, min(j + 1, WORLD_WIDTH - 1)]
    elif action == ACTION_DOWN:
        next_state = [min(i + 1, WORLD_HEIGHT - 1), j]
    else:
        assert False

    reward = -1
    if (action == ACTION_DOWN and i == 2 and 1 <= j <= 10) or (
            action == ACTION_RIGHT and state == START):
        reward = -100
        next_state = START

    return next_state, reward

# reward for each action in each state
# actionRewards = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
# actionRewards[:, :, :] = -1.0
# actionRewards[2, 1:11, ACTION_DOWN] = -100.0
# actionRewards[3, 0, ACTION_RIGHT] = -100.0

# set up destinations for each action in each state
# actionDestination = []
# for i in range(0, WORLD_HEIGHT):
#     actionDestination.append([])
#     for j in range(0, WORLD_WIDTH):
#         destination = dict()
#         destination[ACTION_UP] = [max(i - 1, 0), j]
#         destination[ACTION_LEFT] = [i, max(j - 1, 0)]
#         destination[ACTION_RIGHT] = [i, min(j + 1, WORLD_WIDTH - 1)]
#         if i == 2 and 1 <= j <= 10:
#             destination[ACTION_DOWN] = START
#         else:
#             destination[ACTION_DOWN] = [min(i + 1, WORLD_HEIGHT - 1), j]
#         actionDestination[-1].append(destination)
# actionDestination[3][0][ACTION_RIGHT] = START

# choose an action based on the epsilon-greedy algorithm
def choose_action(state, q_value):
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(ACTIONS)
    else:
        values_ = q_value[state[0], state[1], :]
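        # break ties among equally valued greedy actions uniformly at random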
        return np.random.choice([action_ for action_, value_ in enumerate(values_) if value_ == np.max(values_)])

# an episode with Sarsa
# @q_value: values for state action pair, will be updated
# @expected: if True, will use expected Sarsa algorithm
# @step_size: step size for updating
# @return: total rewards within this episode
def sarsa(q_value, expected=False, step_size=ALPHA):
    state = START
    action = choose_action(state, q_value)
    rewards = 0.0
    while state != GOAL:
        next_state, reward = step(state, action)
        next_action = choose_action(next_state, q_value)
        rewards += reward
        if not expected:
            target = q_value[next_state[0], next_state[1], next_action]
        else:
            # calculate the expected value of the new state
            target = 0.0
            q_next = q_value[next_state[0], next_state[1], :]
            best_actions = np.argwhere(q_next == np.max(q_next))
            for action_ in ACTIONS:
                if action_ in best_actions:
                    target += ((1.0 - EPSILON) / len(best_actions) + EPSILON / len(ACTIONS)) * q_value[next_state[0], next_state[1], action_]
                else:
                    target += EPSILON / len(ACTIONS) * q_value[next_state[0], next_state[1], action_]
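            # an equivalent vectorized way to form the same expectation
            # (a sketch kept as a comment, not used by the loop above):
            # pi = np.full(len(ACTIONS), EPSILON / len(ACTIONS))
            # pi[best_actions.flatten()] += (1.0 - EPSILON) / len(best_actions)
            # target = np.dot(pi, q_next)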
        target *= GAMMA
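        # Sarsa / Expected Sarsa update: move Q(S, A) toward reward + discounted target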
        q_value[state[0], state[1], action] += step_size * (
                reward + target - q_value[state[0], state[1], action])
        state = next_state
        action = next_action
    return rewards

# an episode with Q-Learning
# @q_value: values for state action pair, will be updated
# @step_size: step size for updating
# @return: total rewards within this episode
def q_learning(q_value, step_size=ALPHA):
    state = START
    rewards = 0.0
    while state != GOAL:
        action = choose_action(state, q_value)
        next_state, reward = step(state, action)
        rewards += reward
        # Q-Learning update
        q_value[state[0], state[1], action] += step_size * (
                reward + GAMMA * np.max(q_value[next_state[0], next_state[1], :]) -
                q_value[state[0], state[1], action])
        state = next_state
    return rewards

# print optimal policy
def print_optimal_policy(q_value):
    optimal_policy = []
    for i in range(0, WORLD_HEIGHT):
        optimal_policy.append([])
        for j in range(0, WORLD_WIDTH):
            if [i, j] == GOAL:
                optimal_policy[-1].append('G')
                continue
            best_action = np.argmax(q_value[i, j, :])
            if best_action == ACTION_UP:
                optimal_policy[-1].append('U')
            elif best_action == ACTION_DOWN:
                optimal_policy[-1].append('D')
            elif best_action == ACTION_LEFT:
                optimal_policy[-1].append('L')
            elif best_action == ACTION_RIGHT:
                optimal_policy[-1].append('R')
    for row in optimal_policy:
        print(row)

# Use multiple runs instead of a single run and a sliding window
# A single run does not produce a smooth reward curve, although the
# optimal policy still converges well within a single run
# Sarsa converges to the safe path, while Q-Learning converges to the optimal path
def figure_6_4():
    # episodes of each run
    episodes = 500

    # perform 50 independent runs
    runs = 50

    rewards_sarsa = np.zeros(episodes)
    rewards_q_learning = np.zeros(episodes)
    for r in tqdm(range(runs)):
        q_sarsa = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
        q_q_learning = np.copy(q_sarsa)
        for i in range(0, episodes):
            # cut off the value by -100 to draw the figure more elegantly
            # rewards_sarsa[i] += max(sarsa(q_sarsa), -100)
            # rewards_q_learning[i] += max(q_learning(q_q_learning), -100)
            rewards_sarsa[i] += sarsa(q_sarsa)
            rewards_q_learning[i] += q_learning(q_q_learning)

    # averaging over independent runs
    rewards_sarsa /= runs
    rewards_q_learning /= runs

    # draw reward curves
    plt.plot(rewards_sarsa, label='Sarsa')
    plt.plot(rewards_q_learning, label='Q-Learning')
    plt.xlabel('Episodes')
    plt.ylabel('Sum of rewards during episode')
    plt.ylim([-100, 0])
    plt.legend()

    plt.savefig('../images/figure_6_4.png')
    plt.close()

    # display optimal policy
    print('Sarsa Optimal Policy:')
    print_optimal_policy(q_sarsa)
    print('Q-Learning Optimal Policy:')
    print_optimal_policy(q_q_learning)

# Due to the limited compute capacity of my machine, I can't run this experiment
# with 100,000 episodes and 50,000 runs to get the fully averaged performance
# However, even with only 1,000 episodes and 10 runs, the curves still look good
def figure_6_6():
    step_sizes = np.arange(0.1, 1.1, 0.1)
    episodes = 1000
    runs = 10

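    # indices into the performance array: the ASY_* rows hold asymptotic
    # performance (averaged over all episodes), the INT_* rows hold interim
    # performance (averaged over the first 100 episodes)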
    ASY_SARSA = 0
    ASY_EXPECTED_SARSA = 1
    ASY_QLEARNING = 2
    INT_SARSA = 3
    INT_EXPECTED_SARSA = 4
    INT_QLEARNING = 5
    methods = range(0, 6)

    performance = np.zeros((6, len(step_sizes)))
    for run in range(runs):
        for ind, step_size in tqdm(list(zip(range(0, len(step_sizes)), step_sizes))):
            q_sarsa = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
            q_expected_sarsa = np.copy(q_sarsa)
            q_q_learning = np.copy(q_sarsa)
            for ep in range(episodes):
                sarsa_reward = sarsa(q_sarsa, expected=False, step_size=step_size)
                expected_sarsa_reward = sarsa(q_expected_sarsa, expected=True, step_size=step_size)
                q_learning_reward = q_learning(q_q_learning, step_size=step_size)
                performance[ASY_SARSA, ind] += sarsa_reward
                performance[ASY_EXPECTED_SARSA, ind] += expected_sarsa_reward
                performance[ASY_QLEARNING, ind] += q_learning_reward

                if ep < 100:
                    performance[INT_SARSA, ind] += sarsa_reward
                    performance[INT_EXPECTED_SARSA, ind] += expected_sarsa_reward
                    performance[INT_QLEARNING, ind] += q_learning_reward

    performance[:3, :] /= episodes * runs
    performance[3:, :] /= 100 * runs
    labels = ['Asymptotic Sarsa', 'Asymptotic Expected Sarsa', 'Asymptotic Q-Learning',
              'Interim Sarsa', 'Interim Expected Sarsa', 'Interim Q-Learning']

    for method, label in zip(methods, labels):
        plt.plot(step_sizes, performance[method, :], label=label)
    plt.xlabel('alpha')
    plt.ylabel('reward per episode')
    plt.legend()

    plt.savefig('../images/figure_6_6.png')
    plt.close()

if __name__ == '__main__':
    figure_6_4()
    figure_6_6()