#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang([email protected])                        #
# 2016 Kenta Shimada([email protected])                               #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm

# world height
WORLD_HEIGHT = 4

# world width
WORLD_WIDTH = 12

# probability for exploration
EPSILON = 0.1

# step size
ALPHA = 0.5

# discount factor for Sarsa, Expected Sarsa and Q-Learning
GAMMA = 1

# all possible actions
ACTION_UP = 0
ACTION_DOWN = 1
ACTION_LEFT = 2
ACTION_RIGHT = 3
ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT]

# start and goal states
START = [3, 0]
GOAL = [3, 11]

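# take @action in @state and return the next state and reward
# moving into the cliff (row 3, columns 1-10) gives a reward of -100
# and sends the agent back to START, e.g. step([2, 5], ACTION_DOWN)
# returns (START, -100), while step([0, 0], ACTION_UP) returns ([0, 0], -1)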
def step(state, action):
    i, j = state
    if action == ACTION_UP:
        next_state = [max(i - 1, 0), j]
    elif action == ACTION_LEFT:
        next_state = [i, max(j - 1, 0)]
    elif action == ACTION_RIGHT:
        next_state = [i, min(j + 1, WORLD_WIDTH - 1)]
    elif action == ACTION_DOWN:
        next_state = [min(i + 1, WORLD_HEIGHT - 1), j]
    else:
        assert False

    reward = -1
    if (action == ACTION_DOWN and i == 2 and 1 <= j <= 10) or (
            action == ACTION_RIGHT and state == START):
        reward = -100
        next_state = START

    return next_state, reward

# reward for each action in each state
# actionRewards = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
# actionRewards[:, :, :] = -1.0
# actionRewards[2, 1:11, ACTION_DOWN] = -100.0
# actionRewards[3, 0, ACTION_RIGHT] = -100.0

# set up destinations for each action in each state
# actionDestination = []
# for i in range(0, WORLD_HEIGHT):
#     actionDestination.append([])
#     for j in range(0, WORLD_WIDTH):
#         destination = dict()
#         destination[ACTION_UP] = [max(i - 1, 0), j]
#         destination[ACTION_LEFT] = [i, max(j - 1, 0)]
#         destination[ACTION_RIGHT] = [i, min(j + 1, WORLD_WIDTH - 1)]
#         if i == 2 and 1 <= j <= 10:
#             destination[ACTION_DOWN] = START
#         else:
#             destination[ACTION_DOWN] = [min(i + 1, WORLD_HEIGHT - 1), j]
#         actionDestination[-1].append(destination)
# actionDestination[3][0][ACTION_RIGHT] = START

# choose an action based on the epsilon-greedy algorithm
def choose_action(state, q_value):
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(ACTIONS)
    else:
        values_ = q_value[state[0], state[1], :]
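        # break ties among equally valued greedy actions uniformly at random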
        return np.random.choice([action_ for action_, value_ in enumerate(values_) if value_ == np.max(values_)])

# an episode with Sarsa
# @q_value: values for state action pair, will be updated
# @expected: if True, will use expected Sarsa algorithm
# @step_size: step size for updating
# @return: total rewards within this episode
def sarsa(q_value, expected=False, step_size=ALPHA):
    state = START
    action = choose_action(state, q_value)
    rewards = 0.0
    while state != GOAL:
        next_state, reward = step(state, action)
        next_action = choose_action(next_state, q_value)
        rewards += reward
        if not expected:
            target = q_value[next_state[0], next_state[1], next_action]
        else:
            # calculate the expected value of the new state
            target = 0.0
            q_next = q_value[next_state[0], next_state[1], :]
            best_actions = np.argwhere(q_next == np.max(q_next))
            for action_ in ACTIONS:
                if action_ in best_actions:
                    target += ((1.0 - EPSILON) / len(best_actions) + EPSILON / len(ACTIONS)) * q_value[next_state[0], next_state[1], action_]
                else:
                    target += EPSILON / len(ACTIONS) * q_value[next_state[0], next_state[1], action_]
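            # an equivalent vectorized way to form the same expectation
            # (a sketch kept as a comment, not used by the loop above):
            # pi = np.full(len(ACTIONS), EPSILON / len(ACTIONS))
            # pi[best_actions.flatten()] += (1.0 - EPSILON) / len(best_actions)
            # target = np.dot(pi, q_next)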
        target *= GAMMA
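        # Sarsa / Expected Sarsa update: move Q(S, A) toward reward + discounted target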
        q_value[state[0], state[1], action] += step_size * (
                reward + target - q_value[state[0], state[1], action])
        state = next_state
        action = next_action
    return rewards

# an episode with Q-Learning
# @q_value: values for state action pair, will be updated
# @step_size: step size for updating
# @return: total rewards within this episode
def q_learning(q_value, step_size=ALPHA):
    state = START
    rewards = 0.0
    while state != GOAL:
        action = choose_action(state, q_value)
        next_state, reward = step(state, action)
        rewards += reward
        # Q-Learning update
        q_value[state[0], state[1], action] += step_size * (
                reward + GAMMA * np.max(q_value[next_state[0], next_state[1], :]) -
                q_value[state[0], state[1], action])
        state = next_state
    return rewards

# print optimal policy
def print_optimal_policy(q_value):
    optimal_policy = []
    for i in range(0, WORLD_HEIGHT):
        optimal_policy.append([])
        for j in range(0, WORLD_WIDTH):
            if [i, j] == GOAL:
                optimal_policy[-1].append('G')
                continue
            best_action = np.argmax(q_value[i, j, :])
            if best_action == ACTION_UP:
                optimal_policy[-1].append('U')
            elif best_action == ACTION_DOWN:
                optimal_policy[-1].append('D')
            elif best_action == ACTION_LEFT:
                optimal_policy[-1].append('L')
            elif best_action == ACTION_RIGHT:
                optimal_policy[-1].append('R')
    for row in optimal_policy:
        print(row)

# Use multiple runs instead of a single run and a sliding window
# A single run does not produce a smooth reward curve, although the
# optimal policy still converges well within a single run
# Sarsa converges to the safe path, while Q-Learning converges to the optimal path
def figure_6_4():
    # episodes of each run
    episodes = 500

    # perform 50 independent runs
    runs = 50

    rewards_sarsa = np.zeros(episodes)
    rewards_q_learning = np.zeros(episodes)
    for r in tqdm(range(runs)):
        q_sarsa = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
        q_q_learning = np.copy(q_sarsa)
        for i in range(0, episodes):
            # cut off the value by -100 to draw the figure more elegantly
            # rewards_sarsa[i] += max(sarsa(q_sarsa), -100)
            # rewards_q_learning[i] += max(q_learning(q_q_learning), -100)
            rewards_sarsa[i] += sarsa(q_sarsa)
            rewards_q_learning[i] += q_learning(q_q_learning)

    # averaging over independent runs
    rewards_sarsa /= runs
    rewards_q_learning /= runs

    # draw reward curves
    plt.plot(rewards_sarsa, label='Sarsa')
    plt.plot(rewards_q_learning, label='Q-Learning')
    plt.xlabel('Episodes')
    plt.ylabel('Sum of rewards during episode')
    plt.ylim([-100, 0])
    plt.legend()

    plt.savefig('../images/figure_6_4.png')
    plt.close()

    # display optimal policy
    print('Sarsa Optimal Policy:')
    print_optimal_policy(q_sarsa)
    print('Q-Learning Optimal Policy:')
    print_optimal_policy(q_q_learning)

# Due to the limited compute capacity of my machine, I can't run this experiment
# with 100,000 episodes and 50,000 runs to get the fully averaged performance
# However, even with only 1,000 episodes and 10 runs, the curves still look good
def figure_6_6():
    step_sizes = np.arange(0.1, 1.1, 0.1)
    episodes = 1000
    runs = 10

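    # indices into the performance array: the ASY_* rows hold asymptotic
    # performance (averaged over all episodes), the INT_* rows hold interim
    # performance (averaged over the first 100 episodes)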
    ASY_SARSA = 0
    ASY_EXPECTED_SARSA = 1
    ASY_QLEARNING = 2
    INT_SARSA = 3
    INT_EXPECTED_SARSA = 4
    INT_QLEARNING = 5
    methods = range(0, 6)

    performance = np.zeros((6, len(step_sizes)))
    for run in range(runs):
        for ind, step_size in tqdm(list(zip(range(0, len(step_sizes)), step_sizes))):
            q_sarsa = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
            q_expected_sarsa = np.copy(q_sarsa)
            q_q_learning = np.copy(q_sarsa)
            for ep in range(episodes):
                sarsa_reward = sarsa(q_sarsa, expected=False, step_size=step_size)
                expected_sarsa_reward = sarsa(q_expected_sarsa, expected=True, step_size=step_size)
                q_learning_reward = q_learning(q_q_learning, step_size=step_size)
                performance[ASY_SARSA, ind] += sarsa_reward
                performance[ASY_EXPECTED_SARSA, ind] += expected_sarsa_reward
                performance[ASY_QLEARNING, ind] += q_learning_reward

                if ep < 100:
                    performance[INT_SARSA, ind] += sarsa_reward
                    performance[INT_EXPECTED_SARSA, ind] += expected_sarsa_reward
                    performance[INT_QLEARNING, ind] += q_learning_reward

    performance[:3, :] /= episodes * runs
    performance[3:, :] /= 100 * runs
    labels = ['Asymptotic Sarsa', 'Asymptotic Expected Sarsa', 'Asymptotic Q-Learning',
              'Interim Sarsa', 'Interim Expected Sarsa', 'Interim Q-Learning']

    for method, label in zip(methods, labels):
        plt.plot(step_sizes, performance[method, :], label=label)
    plt.xlabel('alpha')
    plt.ylabel('reward per episode')
    plt.legend()

    plt.savefig('../images/figure_6_6.png')
    plt.close()

if __name__ == '__main__':
    figure_6_4()
    figure_6_6()