Untitled
unknown
plain_text
a year ago
3.7 kB
12
Indexable
import numpy as np
# Grid dimensions and the cells occupied by mountains (obstacles).
grid_size = 5
mountains = [(1, 3), (2, 1), (3, 1)]

# Reward table indexed by cell: zero everywhere except the two special cells.
reward_grid = np.zeros(shape=(grid_size, grid_size))
reward_grid[(2, 3)] = -1  # lightning cell: reward -1
reward_grid[(4, 4)] = 1   # treasure-chest cell: reward +1
def is_valid_state(x, y):
    """Return whether cell (x, y) can be occupied.

    A cell is valid when both coordinates lie in [0, grid_size) and the
    cell is not listed in ``mountains``.
    """
    inside_grid = 0 <= x < grid_size and 0 <= y < grid_size
    return inside_grid and (x, y) not in mountains
# Action id -> (dx, dy) displacement applied to a cell's (x, y) coordinates.
# The ids 0..3 were labelled up, down, left, right by the original author.
# NOTE(review): "up" increases x here, so x presumably counts rows from the
# bottom of the grid -- confirm against the problem's figure.
actions = dict(enumerate([
    (1, 0),   # 0: up
    (-1, 0),  # 1: down
    (0, -1),  # 2: left
    (0, 1),   # 3: right
]))
# Deterministic policy: entry [x, y] is the action id (a key of ``actions``)
# chosen in cell (x, y).
policy = np.asarray([
    (3, 3, 3, 3, 0),  # x = 0
    (2, 0, 2, 2, 0),  # x = 1
    (0, 0, 3, 3, 3),  # x = 2
    (0, 1, 1, 1, 0),  # x = 3
    (0, 3, 3, 0, 0),  # x = 4
])
# Transition matrix under the policy: one row and one column per grid cell
# (grid_size * grid_size states), with every probability starting at zero.
num_states = grid_size * grid_size
transition_matrix = np.zeros((num_states, num_states))
def to_state(x, y):
    """Convert grid coordinates (x, y) into a flat state index.

    States are numbered row by row: index = x * grid_size + y.
    """
    row_offset = x * grid_size
    return row_offset + y
# Fill the transition matrix by following the policy in every cell.
#
# Movement model: the action chosen by the policy is executed with
# probability 0.85; each of the three other actions is executed with
# probability 0.05. A move that would leave the grid or enter a mountain is
# blocked, and its probability mass stays on the current cell.
#
# Fix relative to the previous version: blocked moves used to *assign* 1 to
# the self-transition. In the slip branch that overwrote the accumulated
# value on top of probabilities already added, so rows could sum to more than
# 1 (e.g. 1.95 for cell (1, 2)); in the intended-move branch it turned the
# cell into an absorbing state and dropped the slip probabilities. Blocked
# moves are now accumulated exactly like border hits, so every row sums to 1.
terminal_states = {(2, 3), (4, 4)}  # lightning and treasure chest
intended_prob = 0.85
slip_prob = 0.05

for x in range(grid_size):
    for y in range(grid_size):
        current_state = to_state(x, y)

        # The lightning and treasure cells are absorbing: once entered, the
        # agent stays there with probability 1.
        if (x, y) in terminal_states:
            transition_matrix[current_state, current_state] = 1
            continue

        # Action prescribed by the policy for this cell.
        action = policy[x, y]

        for candidate, (dx, dy) in actions.items():
            prob = intended_prob if candidate == action else slip_prob
            nx, ny = x + dx, y + dy
            if is_valid_state(nx, ny):
                next_state = to_state(nx, ny)
            else:
                # Blocked by the border or a mountain: stay in place.
                next_state = current_state
            transition_matrix[current_state, next_state] += prob
# Policy evaluation in closed form: V = (I - gamma * P)^-1 R.
P_pi = transition_matrix
num_states = grid_size * grid_size

# Per-state reward as a column vector, in the same order as the state indices.
R_pi = reward_grid.reshape((num_states, 1))

# Identity matrix with one row/column per state.
I = np.identity(num_states)
gamma = 0.95

bellman_matrix = I - gamma * P_pi
print("V_pi——invert", np.linalg.inv(bellman_matrix))

# Solve (I - gamma * P) V = R directly instead of multiplying by the explicit
# inverse; the result is the same value vector with better numerical behavior.
V_pi = np.linalg.solve(bellman_matrix, R_pi)

# Lay the values out on the grid for readability. The target shape must be
# (grid_size, grid_size): reshaping the 25 values to (grid_size,) raises
# ValueError.
V_pi_2d = V_pi.reshape((grid_size, grid_size))

# Print the value of every state.
print(V_pi_2d)
Editor is loading...
Leave a Comment