Untitled

 avatar
unknown
python
2 years ago
813 B
3
Indexable
def first_visit_MC(start_state, n_states, n_episodes, gamma):
    """Estimate state values with first-visit Monte Carlo prediction.

    For every episode a trajectory is sampled with ``get_trajectory`` and
    walked backwards while accumulating the discounted return ``G``.  The
    return is recorded for a state only at the first time step the state
    occurs in that episode, and the state's value becomes the mean of all
    returns recorded for it so far.

    Args:
        start_state: State passed to ``get_trajectory`` as the start of
            every episode.
        n_states: Number of states; states are used as indices into
            arrays of this length.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        numpy array of length ``n_states`` holding the value estimates.
        States never visited keep their initial value (0.5, or 0 for the
        first and last state).
    """
    # Initial guess of 0.5 everywhere; the first and last state start at 0
    # (presumably the terminal states -- confirm against the environment).
    V = 0.5 * np.ones(n_states)
    V[0] = 0
    V[n_states - 1] = 0
    returns = [[] for _ in range(n_states)]

    for _ in range(n_episodes):
        # NOTE(review): get_trajectory is assumed to return a pair
        # (states, rewards) in which rewards[t] is the reward received
        # after leaving states[t] -- this is how the sequences are indexed
        # below; confirm against get_trajectory.
        trajectory = get_trajectory(start_state, n_states)
        states, rewards = trajectory[0], trajectory[1]

        # Number of time steps, NOT len(trajectory) (the number of
        # components in the pair).  min() tolerates a state list that
        # carries one extra trailing entry without a matching reward.
        n_steps = min(len(states), len(rewards))

        # Earliest time step at which each state occurs in this episode.
        first_visit = {}
        for t in range(n_steps):
            first_visit.setdefault(states[t], t)

        # Walk backwards so G can be built incrementally:
        # G_t = r_t + gamma * G_{t+1}.
        G = 0
        for t in range(n_steps - 1, -1, -1):
            state = states[t]
            G = gamma * G + rewards[t]
            # Record the return only for the first visit of this state.
            if first_visit[state] == t:
                returns[state].append(G)
                V[state] = np.mean(returns[state])

    return V