def first_visit_MC(start_state, n_states, n_episodes, gamma):
    """Estimate state values with first-visit Monte Carlo prediction.

    Samples `n_episodes` episodes with `get_trajectory` and averages the
    discounted return observed on the *first* visit to each state.

    Args:
        start_state: state each episode starts from (forwarded to
            get_trajectory).
        n_states: total number of states; states 0 and n_states-1 are
            treated as terminal (value fixed to 0 initially).
        n_episodes: number of episodes to sample.
        gamma: discount factor in [0, 1].

    Returns:
        np.ndarray of shape (n_states,): the estimated state values.
    """
    # Optimistic-ish initialization; terminal states have value 0.
    V = 0.5 * np.ones(n_states)
    V[0] = 0
    V[n_states - 1] = 0
    # returns[s] collects the first-visit returns observed for state s.
    returns = [[] for _ in range(n_states)]
    for _ in range(n_episodes):
        # get_trajectory is assumed to return a pair-like object where
        # trajectory[0] is the sequence of visited states and trajectory[1]
        # the rewards on the corresponding steps — TODO confirm against its
        # definition elsewhere in the file.
        trajectory = get_trajectory(start_state, n_states)
        states, rewards = trajectory[0], trajectory[1]
        G = 0
        # Walk the episode backwards, accumulating the discounted return.
        # BUGFIX: the original iterated range(len(trajectory), 0, -1);
        # len(trajectory) is 2 (the [states, rewards] pair), so only the
        # first two time steps were ever processed. Iterate the episode
        # length instead.
        for t in range(len(states) - 1, -1, -1):
            G = gamma * G + rewards[t]
            # First-visit check: record G only if this is the EARLIEST
            # occurrence of states[t] in the episode.
            # BUGFIX: the original tested `state not in trajectory[0][:t]`
            # with state == trajectory[0][t-1]; the slice always contained
            # the current state, so the condition was always False and V
            # was never updated. Exclude the current step from the slice.
            if states[t] not in states[:t]:
                returns[states[t]].append(G)
                # State value is the sample mean of recorded returns.
                V[states[t]] = np.mean(returns[states[t]])
    return V