import numpy as np
import math
import random
import gym  # classic Gym API: reset() returns an observation, step() returns a 4-tuple

env = gym.make("CartPole-v0")
# Discretization buckets per observation dimension:
# (cart position, cart velocity, pole angle, pole angular velocity).
# Position and cart velocity are effectively ignored (one bucket each).
buckets = (1, 1, 6, 12)
Q_Table = np.zeros(buckets + (env.action_space.n,))  # one entry per (state, action)

min_epsilon = 0.02    # exploration floor
epsilon = 1.0         # start fully exploratory
lr = 0.01             # learning rate (alpha)
gamma = 0.99          # discount factor
episodes = 100000
max_iterations = 250  # per-episode step cap (CartPole-v0 itself stops at 200)
rewards = []          # per-episode returns, appended in the loop below
def discretize_state(state):
    """Map a continuous 4-d observation to a tuple of bucket indices."""
    # The velocity bounds in observation_space are infinite, so clamp them
    # to hand-picked ranges: +/-0.5 for cart velocity and +/-50 deg/s for
    # the pole's angular velocity.
    upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
    # Normalize each dimension to [0, 1], scale to its bucket count, and
    # clip so out-of-range observations still map to a valid bucket.
    ratios = [(state[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i]) for i in range(len(state))]
    state_ = [int(round((buckets[i] - 1) * ratios[i])) for i in range(len(state))]
    state_ = [min(buckets[i] - 1, max(0, state_[i])) for i in range(len(state))]
    return tuple(state_)
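
# Sanity check (illustrative, not required for training): every discretized
# state must be a valid index into Q_Table.
sample = discretize_state(env.reset())
assert all(0 <= s < b for s, b in zip(sample, buckets))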
for i in range(episodes):
    state = discretize_state(env.reset())
    total_reward = 0
    render = i % 2000 == 0  # render occasionally to watch progress

    for j in range(max_iterations):
        if render:
            env.render()

        # Epsilon-greedy action selection: explore with probability
        # epsilon, otherwise exploit the current Q estimates.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_Table[state])

        new_state, reward, done, _ = env.step(action)
        new_state = discretize_state(new_state)

        # Tabular Q-learning update; do not bootstrap from terminal states.
        target = reward if done else reward + gamma * np.max(Q_Table[new_state])
        Q_Table[state][action] += lr * (target - Q_Table[state][action])

        total_reward += reward
        if done:
            break
        state = new_state

    rewards.append(total_reward)
    # Logarithmic epsilon decay: stays at 1 for the first 100 episodes,
    # then decays, hitting min_epsilon around episode 950.
    epsilon = max(min_epsilon, min(1, 1 - np.log10((i + 1) / 100)))
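
# A minimal post-training sketch, assuming Gym's documented success criterion
# for CartPole-v0: an average return of 195 over 100 consecutive episodes.
# `rewards` was filled once per episode in the loop above.
solved_at = next(
    (k for k in range(100, len(rewards) + 1)
     if sum(rewards[k - 100:k]) / 100 >= 195.0),
    None,
)
if solved_at is not None:
    print(f"Reached the 195 average at episode {solved_at}")
else:
    print("Never reached the 195 average over 100 episodes")

env.close()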