Untitled
unknown
python
2 years ago
813 B
6
Indexable
def first_visit_MC(start_state, n_states, n_episodes, gamma):
    """Estimate state values with first-visit Monte Carlo prediction.

    For each episode a trajectory is sampled with
    ``get_trajectory(start_state, n_states)``. Walking the episode
    backwards, the discounted return ``G`` is accumulated, and for the
    first visit of every state in that episode ``G`` is recorded. A
    state's value is the mean of all returns recorded for it so far.

    Args:
        start_state: Start state forwarded to ``get_trajectory``.
        n_states: Number of states; states are used as indices into
            arrays of this length.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied to the return at each step.

    Returns:
        NumPy array of length ``n_states`` holding the value estimates.
        States never visited keep their initial value (0 for the first
        and last state, 0.5 for all others).
    """
    V = 0.5 * np.ones(n_states)
    V[0] = 0
    V[n_states - 1] = 0
    returns = [[] for _ in range(n_states)]

    for _ in range(n_episodes):
        trajectory = get_trajectory(start_state, n_states)
        # NOTE(review): assumes trajectory[0] is the state sequence and
        # trajectory[1] the reward sequence, with rewards[i] being the
        # reward that follows states[i] -- confirm against get_trajectory.
        states, rewards = trajectory[0], trajectory[1]

        # Index of the first occurrence of each state in this episode, so
        # the first-visit test is a lookup rather than a slice scan.
        first_visit = {}
        for step, state in enumerate(states):
            first_visit.setdefault(state, step)

        # zip() pairs each state with its reward and stops at the shorter
        # sequence, so a trailing state without a reward is skipped.
        steps = list(enumerate(zip(states, rewards)))

        G = 0
        for step, (state, reward) in reversed(steps):
            G = gamma * G + reward
            # Record the return only at the state's first visit. The
            # earlier check compared against a slice that included the
            # current step, so it never recorded anything.
            if first_visit[state] == step:
                returns[state].append(G)
                V[state] = np.mean(returns[state])
    return V
Editor is loading...