Untitled
unknown
plain_text
21 days ago
3.7 kB
3
Indexable
Never
# Policy evaluation on a 5x5 grid world.
#
# The script builds the state-transition matrix P_pi induced by a fixed
# policy, then solves the Bellman equation (I - gamma * P_pi) V = R_pi for
# the state-value function V_pi.
import numpy as np

# Grid size and the cells occupied by mountains (impassable obstacles).
grid_size = 5
mountains = [(1, 3), (2, 1), (3, 1)]

# Absorbing cells: the lightning cell (2, 3) and the treasure chest (4, 4).
terminal_states = [(2, 3), (4, 4)]

# Slip model: the action chosen by the policy succeeds with probability
# 0.85; each of the three other actions is taken with probability 0.05.
intended_prob = 0.85
slip_prob = 0.05

# Per-state reward.
reward_grid = np.zeros((grid_size, grid_size))
reward_grid[4, 4] = 1   # treasure chest: +1
reward_grid[2, 3] = -1  # lightning: -1


def is_valid_state(x: int, y: int) -> bool:
    """Return True when (x, y) is inside the grid and is not a mountain."""
    return 0 <= x < grid_size and 0 <= y < grid_size and (x, y) not in mountains


# Action index -> (dx, dy) displacement. The labels follow the original
# script's convention, in which "up" increases the first coordinate.
actions = {
    0: (1, 0),   # up
    1: (-1, 0),  # down
    2: (0, -1),  # left
    3: (0, 1),   # right
}

# policy[x, y] is the action index taken in cell (x, y).
policy = np.array([
    [3, 3, 3, 3, 0],
    [2, 0, 2, 2, 0],
    [0, 0, 3, 3, 3],
    [0, 1, 1, 1, 0],
    [0, 3, 3, 0, 0],
])


def to_state(x: int, y: int) -> int:
    """Convert grid coordinates (x, y) into a flat, row-major state index."""
    return x * grid_size + y


def _build_transition_matrix() -> np.ndarray:
    """Return the row-stochastic transition matrix induced by ``policy``.

    Terminal cells are absorbing (self-loop with probability 1). For every
    other cell, each action contributes its probability mass either to the
    neighbouring cell it leads to or, when that move is blocked by the grid
    boundary or by a mountain, to the current cell. Mass is always
    accumulated with ``+=`` so that every row sums to 1.
    """
    n_states = grid_size * grid_size
    matrix = np.zeros((n_states, n_states))

    for x in range(grid_size):
        for y in range(grid_size):
            current_state = to_state(x, y)

            # Lightning and treasure chest: the agent never leaves.
            if (x, y) in terminal_states:
                matrix[current_state, current_state] = 1.0
                continue

            chosen_action = policy[x, y]
            for action, (dx, dy) in actions.items():
                prob = intended_prob if action == chosen_action else slip_prob
                nx, ny = x + dx, y + dy
                if is_valid_state(nx, ny):
                    next_state = to_state(nx, ny)
                else:
                    # Blocked by a wall or a mountain: stay in place.
                    next_state = current_state
                matrix[current_state, next_state] += prob

    return matrix


# Transition matrix of shape (grid_size**2, grid_size**2).
transition_matrix = _build_transition_matrix()

P_pi = transition_matrix
R_pi = reward_grid.reshape((grid_size * grid_size, 1))

# Identity matrix and discount factor for the Bellman equation.
I = np.identity(grid_size * grid_size)
gamma = 0.95

# Solve (I - gamma * P_pi) V = R_pi directly instead of forming the inverse.
V_pi = np.linalg.solve(I - gamma * P_pi, R_pi)

# Reshape the flat value vector back onto the grid for readability.
V_pi_2d = V_pi.reshape((grid_size, grid_size))

if __name__ == "__main__":
    # The explicit inverse is computed only for this diagnostic print.
    print("V_pi——invert", np.linalg.inv(I - gamma * P_pi))
    # Value of every state, laid out like the grid.
    print(V_pi_2d)
Leave a Comment