Untitled
unknown
plain_text
a year ago
2.0 kB
3
Indexable
# Random-policy rollout script for a Gymnasium tabular environment (env1 is
# created earlier in the file, e.g. FrozenLake). Runs several episodes with
# random actions, records per-episode total reward and the action sequence
# ("policy"), then reports the best episode.

# Inspect the environment: spaces and one transition-table entry P[state][action].
print(env1.observation_space)
print(env1.action_space)
print(env1.P[14][2])
env1.reset()

t_s = 10   # maximum time steps per episode
eps = 10   # number of episodes to run

total_rewards = []      # total reward collected in each episode
episode_policies = []   # action sequence taken in each episode

# Q1: Generate multiple episodes and find the total reward in each episode
for j in range(eps):
    env1.reset()
    print("\nEpisode: ", j + 1)
    total_reward = 0    # accumulated reward for the current episode
    episode_policy = []  # actions taken in the current episode
    for i in range(t_s):
        print("Time step: ", i + 1)
        random_action = env1.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The original mislabeled the last three as (done, info, trans_prob),
        # which made the printed output misleading.
        n_s, reward, terminated, truncated, info = env1.step(random_action)
        env1.render()
        print(n_s, reward, terminated, truncated, info)
        # Q2: Record the action taken at this step.
        episode_policy.append(random_action)
        total_reward += reward
        # End the episode on either natural termination or truncation
        # (the original only checked the terminated flag).
        if terminated or truncated:
            break
    # Record this episode's results.
    total_rewards.append(total_reward)
    episode_policies.append(episode_policy)
    print("Total Reward for Episode {}: {}".format(j + 1, total_reward))

# Q1: Print the total reward for each episode in the specified format
print("\n[Episode - Total reward]")
for episode, reward in enumerate(total_rewards, 1):
    print("[{:8} - {:8}]".format(episode, reward))

# Q2: Find the episode with the highest reward and print its policy
best_episode_index = total_rewards.index(max(total_rewards))
best_episode_policy = episode_policies[best_episode_index]

print("\nPolicy for Episode with Highest Reward (Episode {}):".format(best_episode_index + 1))
for step, action in enumerate(best_episode_policy):
    print("Step {}: {}".format(step + 1, action))
Editor is loading...
Leave a Comment