import numpy as np
import gymnasium as gym
import torch
from torch import nn
import matplotlib.pyplot as plt

env = gym.make("FrozenLake-v1")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

learning_rate = 0.0001
episodes = 100000
gamma = 0.99


def discount_rewards(reward, gamma=0.99):
    # Element-wise discounting: gamma^t * reward_t for t = 0..len(reward)-1.
    return torch.pow(gamma, torch.arange(len(reward))) * reward


def normalize_rewards(disc_reward):
    # Scale rewards by their maximum; the small epsilon guards against
    # division by zero when the maximum is 0.
    if disc_reward.max() == 0:
        return disc_reward / (disc_reward.max() + 0.00001)
    return disc_reward / disc_reward.max()


class NeuralNetwork(nn.Module):
    # Simple policy network: state -> action probabilities.
    def __init__(self, state_size, action_size):
        super(NeuralNetwork, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(state_size, 10),
            nn.ReLU(),
            nn.Linear(10, action_size),
            nn.Softmax(dim=-1),  # explicit dim avoids the implicit-dim warning
        )

    def forward(self, x):
        return self.linear_relu_stack(x)


model = NeuralNetwork(1, env.action_space.n).to(device)
opt = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

score = []
for i in range(episodes):
    print("i = ", i)
    state = np.array([env.reset()[0]])
    done = False
    transitions = []
    tot_rewards = 0
    steps = 0
    while not done:
        # Sample an action from the current policy's probabilities.
        act_proba = model(torch.from_numpy(state).float().to(device))
        action = np.random.choice(np.array([0, 1, 2, 3]),
                                  p=act_proba.detach().cpu().numpy())
        # Gymnasium returns (obs, reward, terminated, truncated, info);
        # treating truncation as episode end respects the time limit.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        tot_rewards += np.power(gamma, steps) * reward
        transitions.append((state, action, tot_rewards))
        state = np.array([next_state])
        steps += 1

    if i % 50 == 0:
        print("i = ", i, ", steps = ", steps)
    score.append(steps)

    # Cumulative discounted rewards per step, reversed in time, then
    # normalized by their maximum.
    reward_batch = torch.Tensor([r for (s, a, r) in transitions]).flip(dims=(0,))
    nrml_disc_rewards = normalize_rewards(reward_batch).to(device)
    state_batch = torch.Tensor(np.array([s for (s, a, r) in transitions]))
    action_batch = torch.Tensor([a for (s, a, r) in transitions]).to(device)

    # Probability the policy assigned to the action actually taken at each step.
    pred_batch = model(state_batch.to(device))
    # print("pred_batch ", pred_batch)
    prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
    # print("prob_batch = ", prob_batch)

    # REINFORCE-style loss: negative log-probabilities of the taken actions,
    # weighted by the normalized rewards.
    loss = -(torch.sum(torch.log(prob_batch) * nrml_disc_rewards))
    opt.zero_grad()
    loss.backward()
    opt.step()

plt.scatter(np.arange(len(score)), score)
plt.show()
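
# ----------------------------------------------------------------------------
# Optional sketch (not part of the original paste): the network above takes the
# raw state index as a single float input. A common alternative for a discrete
# observation space like FrozenLake's is a one-hot encoding. The helper name
# one_hot_state is illustrative, and this assumes the policy network would
# instead be built as NeuralNetwork(env.observation_space.n, env.action_space.n).
# It reuses the numpy/torch imports and the env/device objects defined above.
# ----------------------------------------------------------------------------
def one_hot_state(state_index, n_states):
    # Return a length-n_states vector that is 1.0 at state_index, 0 elsewhere.
    vec = np.zeros(n_states, dtype=np.float32)
    vec[state_index] = 1.0
    return vec

# Example usage (commented out, since the model above expects state_size == 1):
# obs, _ = env.reset()
# x = torch.from_numpy(one_hot_state(obs, env.observation_space.n)).to(device)
# act_proba = model(x)  # only valid if state_size == env.observation_space.n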