Untitled
unknown
plain_text
a year ago
2.0 kB
3
Indexable
# Random-policy rollout script for a Gymnasium tabular environment (env1 is
# created earlier in the file, e.g. FrozenLake). Runs several episodes with
# random actions, records per-episode total reward and the action sequence
# ("policy"), then reports the best episode.

# Inspect the environment: spaces and one transition-table entry P[state][action].
print(env1.observation_space)
print(env1.action_space)
print(env1.P[14][2])
env1.reset()

t_s = 10   # maximum time steps per episode
eps = 10   # number of episodes to run

total_rewards = []      # total reward collected in each episode
episode_policies = []   # action sequence taken in each episode

# Q1: Generate multiple episodes and find the total reward in each episode
for j in range(eps):
    env1.reset()
    print("\nEpisode: ", j + 1)
    total_reward = 0    # accumulated reward for the current episode
    episode_policy = []  # actions taken in the current episode
    for i in range(t_s):
        print("Time step: ", i + 1)
        random_action = env1.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The original mislabeled the last three as (done, info, trans_prob),
        # which made the printed output misleading.
        n_s, reward, terminated, truncated, info = env1.step(random_action)
        env1.render()
        print(n_s, reward, terminated, truncated, info)
        # Q2: Record the action taken at this step.
        episode_policy.append(random_action)
        total_reward += reward
        # End the episode on either natural termination or truncation
        # (the original only checked the terminated flag).
        if terminated or truncated:
            break
    # Record this episode's results.
    total_rewards.append(total_reward)
    episode_policies.append(episode_policy)
    print("Total Reward for Episode {}: {}".format(j + 1, total_reward))

# Q1: Print the total reward for each episode in the specified format
print("\n[Episode - Total reward]")
for episode, reward in enumerate(total_rewards, 1):
    print("[{:8} - {:8}]".format(episode, reward))

# Q2: Find the episode with the highest reward and print its policy
best_episode_index = total_rewards.index(max(total_rewards))
best_episode_policy = episode_policies[best_episode_index]

print("\nPolicy for Episode with Highest Reward (Episode {}):".format(best_episode_index + 1))
for step, action in enumerate(best_episode_policy):
    print("Step {}: {}".format(step + 1, action))
Editor is loading...
Leave a Comment