# CartPole
#
# Pasted from pastecode.io (author: unknown, language: python, size: 1.5 kB).
import numpy as np
import math
import random
import gym

# Environment handle shared by the discretiser and the training loop.
env = gym.make("CartPole-v0")

# Number of discrete bins per observation component.  A component with a
# single bin always maps to index 0, so it is effectively ignored.
buckets = (1, 1, 6, 12)

# One value per (discretised state, action); the trailing axis of size 2
# holds the action values for a state.
Q_Table = np.zeros((*buckets, 2))

# Exploration rate: starts at `epsilon` and is never allowed below
# `min_epsilon`.
epsilon = 1
min_epsilon = 0.02

# Step size and discount factor used in the Q-value update.
lr = 0.01
gamma = 0.99

# Training length: number of episodes, and the step cap per episode.
episodes = 100_000
max_iterations = 250

# Container for per-episode results (presumably the episode returns --
# verify against the code that fills it).
rewards = []

def discretize_state(state):
    """Map a continuous observation onto a tuple of integer bucket indices.

    Each component is expressed as a fraction of its ``[low, high]`` range,
    scaled by the highest bucket index of that component, rounded to the
    nearest integer and clamped into ``0 .. buckets[k] - 1``.

    Components 0 and 2 take their range from ``env.observation_space``;
    components 1 and 3 use the fixed ranges +/-0.5 and +/-radians(50).

    Args:
        state: Indexable observation with one value per entry of ``buckets``.

    Returns:
        Tuple of ints, one bucket index per observation component.
    """
    space = env.observation_space
    spin_limit = math.radians(50) / 1.
    highs = [space.high[0], 0.5, space.high[2], spin_limit]
    lows = [space.low[0], -0.5, space.low[2], -spin_limit]

    # Pass 1: position of every component inside its range, as a fraction.
    fractions = []
    for k, value in enumerate(state):
        offset = value + abs(lows[k])
        fractions.append(offset / (highs[k] - lows[k]))

    # Pass 2: scale to a bucket index and clamp out-of-range observations
    # into the first or last bucket.
    indices = []
    for k, fraction in enumerate(fractions):
        top = buckets[k] - 1
        nearest = int(round(top * fraction))
        indices.append(min(top, max(0, nearest)))
    return tuple(indices)

def _reset_env(environment):
    """Reset *environment* and return only the initial observation.

    Older Gym releases return the bare observation from ``reset()``;
    Gym >= 0.26 and Gymnasium return ``(observation, info)``.
    """
    result = environment.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(environment, action):
    """Advance one step; return ``(observation, reward, terminated, truncated)``.

    Accepts both the 5-tuple step result of Gym >= 0.26 / Gymnasium and the
    older 4-tuple ``(observation, reward, done, info)``.  For the old shape
    a time-limit cut-off is recognised through the ``"TimeLimit.truncated"``
    entry of ``info``; when that entry is absent ``done`` is treated as a
    true termination.
    """
    result = environment.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated), bool(truncated)
    observation, reward, done, info = result
    truncated = bool(done) and bool(info.get("TimeLimit.truncated", False))
    return observation, reward, bool(done) and not truncated, truncated


# Tabular Q-learning with an epsilon-greedy behaviour policy.
for i in range(episodes):
    state = discretize_state(_reset_env(env))
    total_reward = 0
    render = i % 2000 == 0  # only draw every 2000th episode
    for _ in range(max_iterations):
        if render:
            env.render()

        # Explore with probability epsilon, otherwise act greedily.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q_Table[state]))

        observation, reward, terminated, truncated = _step_env(env, action)
        new_state = discretize_state(observation)

        # A terminal state has no future return, so do not bootstrap from it.
        # A time-limit truncation is not a real terminal state and still
        # bootstraps from the next state's best value.
        future = 0.0 if terminated else np.max(Q_Table[new_state])
        Q_Table[state][action] += lr * (reward + gamma * future - Q_Table[state][action])

        total_reward += reward
        if terminated or truncated:
            break
        state = new_state

    # Record the episode return so training progress can be inspected.
    rewards.append(total_reward)

    # Keep epsilon at 1 for the first 100 episodes, then decay it
    # logarithmically down to the min_epsilon floor.
    epsilon = max(min_epsilon, min(1, 1 - np.log10((i + 1) / 100)))