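# ---------------------------------------------------------------------------
# NOTE: the script below uses grid_size, rewards, blocked, actions,
# discount_factor, transition_probabilities() and reward_function() without
# defining them, so they are presumably set up earlier in the full program.
# This block is a minimal sketch of what that setup could look like; the
# grid layout, reward values, living cost and slip noise are illustrative
# assumptions, not the original author's configuration.
# ---------------------------------------------------------------------------
grid_size = (4, 4)                      # assumed 4x4 grid of (x, y) states
rewards = {(3, 3): 1.0, (3, 2): -1.0}   # assumed terminal states and payoffs
blocked = {(1, 1)}                      # assumed wall cell
actions = ['N', 'S', 'E', 'W']          # order must match action_map below
discount_factor = 0.9
step_reward = -0.04                     # assumed living cost for non-terminal states
noise = 0.2                             # assumed chance of slipping sideways

# Intended move for each action, and the two perpendicular "slip" actions
_moves = {'N': (0, 1), 'S': (0, -1), 'E': (1, 0), 'W': (-1, 0)}
_perp = {'N': ('E', 'W'), 'S': ('E', 'W'), 'E': ('N', 'S'), 'W': ('N', 'S')}

def _step(state, action):
    # Apply a move; stay put if it would leave the grid or hit a wall
    dx, dy = _moves[action]
    nxt = (state[0] + dx, state[1] + dy)
    if (0 <= nxt[0] < grid_size[0] and 0 <= nxt[1] < grid_size[1]
            and nxt not in blocked):
        return nxt
    return state

def transition_probabilities(state, action):
    # Return {next_state: probability} for a noisy move from state
    probs = {}
    outcomes = [(action, 1.0 - noise)] + [(a, noise / 2) for a in _perp[action]]
    for a, p in outcomes:
        nxt = _step(state, a)
        probs[nxt] = probs.get(nxt, 0.0) + p
    return probs

def reward_function(state):
    # Terminal payoff if the state has one, otherwise the living cost
    return rewards.get(state, step_reward)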
import numpy as np  # required by the value and policy arrays below

# Initialize values: start from zero and pin the reward states to their payoffs
def initialize_values():
    V = np.zeros(grid_size)
    for s in rewards.keys():
        V[s] = rewards[s]
    return V

# Implement Value Iteration: sweep all states until the largest update falls below theta
def value_iteration():
    V = initialize_values()
    policy = np.zeros(grid_size, dtype=int)
    theta = 0.0001
    delta = float('inf')
    while delta > theta:
        delta = 0
        new_V = np.copy(V)
        for x in range(grid_size[0]):
            for y in range(grid_size[1]):
                state = (x, y)
                if state in blocked or state in rewards:
                    continue
                values = []
                for action in actions:
                    transitions = transition_probabilities(state, action)
                    value = sum(prob * (reward_function(next_state) + discount_factor * V[next_state])
                                for next_state, prob in transitions.items())
                    values.append(value)
                new_V[state] = max(values)
                policy[state] = np.argmax(values)
                delta = max(delta, abs(new_V[state] - V[state]))
        V = new_V
    return V, policy

# Extract the policy: map action indices to their compass labels
def extract_policy(policy):
    action_map = {0: 'N', 1: 'S', 2: 'E', 3: 'W'}
    return np.vectorize(lambda x: action_map[x])(policy)

# Compute expected utility: average reward over the states visited in a trajectory
def compute_expected_utility(trajectory):
    total_reward = sum(reward_function(state) for state in trajectory)
    return total_reward / len(trajectory)

# Generate a sample trajectory by always following the most likely transition under the policy
def generate_trajectory(policy):
    state = (0, 0)
    trajectory = [state]
    while state not in rewards:
        action = actions[policy[state]]
        transitions = transition_probabilities(state, action)
        next_state = max(transitions, key=transitions.get)
        trajectory.append(next_state)
        state = next_state
    return trajectory

V, policy_indices = value_iteration()
policy = extract_policy(policy_indices)

trajectory = generate_trajectory(policy_indices)
print("Trajectory from (0, 0):", trajectory)

expected_utility = compute_expected_utility(trajectory)
print("Expected Utility of (0, 0):", expected_utility)

# Output results
print("Optimal Value Function:\n", V)
print("Optimal Policy:\n", policy)