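# --- Environment setup ---
# The code below uses grid_size, rewards, blocked, actions, discount_factor,
# transition_probabilities and reward_function without defining them, so it does not
# run on its own. This block is a minimal stand-in sketch, assuming a 5x5 grid, a
# single terminal reward at (4, 4), one blocked cell, a small step cost, and "slip"
# noise where the agent stays put 10% of the time. Every value here is an assumption;
# replace it with the original definitions if they are available.
import numpy as np

grid_size = (5, 5)                    # (rows, cols); assumed
discount_factor = 0.9                 # assumed
actions = ['N', 'S', 'E', 'W']        # order must match action_map in extract_policy
rewards = {(4, 4): 1.0}               # terminal state(s) and their rewards; assumed
blocked = {(2, 2)}                    # impassable cells; assumed
step_reward = -0.04                   # living cost for non-terminal states; assumed

def reward_function(state):
    # Reward for landing in `state`: the terminal reward if there is one, else the step cost.
    return rewards.get(state, step_reward)

def transition_probabilities(state, action, slip=0.1):
    # Returns {next_state: probability}. With probability 1 - slip the agent moves in the
    # intended direction; with probability slip it stays put. Moves off the grid or into a
    # blocked cell leave the state unchanged. These dynamics are assumed, not taken from
    # the original paste.
    moves = {'N': (-1, 0), 'S': (1, 0), 'E': (0, 1), 'W': (0, -1)}
    dx, dy = moves[action]
    next_state = (state[0] + dx, state[1] + dy)
    if (not (0 <= next_state[0] < grid_size[0] and 0 <= next_state[1] < grid_size[1])
            or next_state in blocked):
        next_state = state
    probs = {next_state: 1.0 - slip}
    probs[state] = probs.get(state, 0.0) + slip
    return probs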
# Initialize values
def initialize_values():
    V = np.zeros(grid_size)
    for s in rewards.keys():
        V[s] = rewards[s]
    return V
# Implement Value Iteration
def value_iteration():
    V = initialize_values()
    policy = np.zeros(grid_size, dtype=int)
    theta = 0.0001
    delta = float('inf')
    while delta > theta:
        delta = 0
        new_V = np.copy(V)
        for x in range(grid_size[0]):
            for y in range(grid_size[1]):
                state = (x, y)
                if state in blocked or state in rewards:
                    continue
                values = []
                for action in actions:
                    transitions = transition_probabilities(state, action)
                    value = sum(prob * (reward_function(next_state) + discount_factor * V[next_state])
                                for next_state, prob in transitions.items())
                    values.append(value)
                new_V[state] = max(values)
                policy[state] = np.argmax(values)
                delta = max(delta, abs(new_V[state] - V[state]))
        V = new_V
    return V, policy
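# Each sweep above applies the Bellman optimality backup
#     V(s) <- max_a  sum_{s'} P(s' | s, a) * [ R(s') + gamma * V(s') ]
# and stops once the largest per-state change (delta) falls below theta. Note that this
# formulation rewards the state being entered, R(s'), rather than the (state, action) pair.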
# Extract the policy from the policy indices
def extract_policy(policy):
    action_map = {0: 'N', 1: 'S', 2: 'E', 3: 'W'}
    return np.vectorize(lambda x: action_map[x])(policy)
# Compute the (empirical) expected utility of a trajectory as its average per-state reward
def compute_expected_utility(trajectory):
    total_reward = sum(reward_function(state) for state in trajectory)
    return total_reward / len(trajectory)
# Generate a sample trajectory by following the computed policy from (0, 0),
# always taking the most likely transition at each step
def generate_trajectory(policy):
    state = (0, 0)
    trajectory = [state]
    while state not in rewards:
        action = actions[policy[state]]
        transitions = transition_probabilities(state, action)
        next_state = max(transitions, key=transitions.get)
        trajectory.append(next_state)
        state = next_state
    return trajectory
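# Optional variant (not in the original paste): generate_trajectory always follows the
# single most likely transition, so the rollout is deterministic. The sketch below
# samples the next state from the full transition distribution instead; it assumes the
# probabilities returned by transition_probabilities sum to 1.
def generate_trajectory_stochastic(policy, start=(0, 0), max_steps=100):
    state = start
    trajectory = [state]
    for _ in range(max_steps):            # cap the length in case slips keep the agent wandering
        if state in rewards:              # stop once a terminal (reward) state is reached
            break
        action = actions[policy[state]]
        transitions = transition_probabilities(state, action)
        next_states = list(transitions.keys())
        probs = [transitions[s] for s in next_states]
        state = next_states[np.random.choice(len(next_states), p=probs)]
        trajectory.append(state)
    return trajectory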
V, policy_indices = value_iteration()
policy = extract_policy(policy_indices)
trajectory = generate_trajectory(policy_indices)
print("Trajectory from (0, 0):", trajectory)
expected_utility = compute_expected_utility(trajectory)
print("Expected Utility of (0, 0):", expected_utility)
# Output results
print("Optimal Value Function:\n", V)
print("Optimal Policy:\n", policy)