# Untitled

import numpy as np

# Grid dimensions and the cells occupied by obstacles (mountains)
grid_size = 5
mountains = [(1, 3), (2, 1), (3, 1)]


# Reward function R(s, a): zero everywhere except the two special cells
reward_grid = np.zeros(shape=(grid_size, grid_size))
reward_grid[2, 3] = -1  # lightning cell gives reward -1
reward_grid[4, 4] = 1  # treasure-chest cell gives reward +1


def is_valid_state(x, y):
    """Return whether (x, y) is a cell the agent can occupy.

    A cell is valid when both coordinates lie in [0, grid_size) and the
    cell is not listed in ``mountains``.
    """
    in_bounds = 0 <= x < grid_size and 0 <= y < grid_size
    return in_bounds and (x, y) not in mountains


# Actions {up, down, left, right}, keyed by action id; each value is the
# (dx, dy) offset the action applies to the agent's (x, y) position.
actions = dict(enumerate([
    (1, 0),   # 0: up
    (-1, 0),  # 1: down
    (0, -1),  # 2: left
    (0, 1),   # 3: right
]))

# Policy matrix: policy[x, y] is the id of the action taken in cell (x, y)
policy = np.array((
    (3, 3, 3, 3, 0),
    (2, 0, 2, 2, 0),
    (0, 0, 3, 3, 3),
    (0, 1, 1, 1, 0),
    (0, 3, 3, 0, 0),
))

# Zero-initialised transition matrix with one row and one column per grid
# cell, i.e. shape (grid_size * grid_size, grid_size * grid_size).
transition_matrix = np.zeros((grid_size ** 2, grid_size ** 2))


def to_state(x, y):
    """Convert grid coordinates (x, y) to the flat state index x * grid_size + y."""
    row_offset = x * grid_size
    return row_offset + y


# Fill in one row of the transition matrix per grid cell, following the
# action the policy prescribes for that cell.
#
# Probability model: the intended action succeeds with probability 0.85 and
# each of the three other actions is taken with probability 0.05. A move
# that would leave the grid or enter a mountain leaves the agent in place.
for x in range(grid_size):
    for y in range(grid_size):
        current_state = to_state(x, y)

        # Look up the action chosen by the policy and its offset
        action = policy[x, y]
        dx, dy = actions[action]

        # Cell the intended action points at
        nx, ny = x + dx, y + dy

        # The lightning and treasure-chest cells are absorbing: once there,
        # the agent stays there with probability 1.
        if (x, y) in [(2, 3), (4, 4)]:
            transition_matrix[current_state, current_state] = 1
            continue

        if is_valid_state(nx, ny):
            # Intended move succeeds with probability 0.85
            target_state = to_state(nx, ny)
            transition_matrix[current_state, target_state] += 0.85
        elif (nx, ny) in mountains:
            # Intended move points straight into a mountain: the cell is
            # treated as a self-loop with probability 1 and the random
            # slips below are skipped.
            # NOTE(review): the off-grid case just below keeps the slips;
            # confirm this asymmetry is intended.
            transition_matrix[current_state, current_state] = 1
            continue
        else:
            # Intended move would leave the grid: stay put with that mass
            transition_matrix[current_state, current_state] += 0.85

        # Spread the remaining 0.15 over the three other actions (0.05 each)
        for other_action, (odx, ody) in actions.items():
            if other_action == action:
                continue
            onx, ony = x + odx, y + ody
            if is_valid_state(onx, ony):
                other_state = to_state(onx, ony)
                transition_matrix[current_state, other_state] += 0.05
            else:
                # Blocked by a mountain or the grid edge: this slip keeps the
                # agent in place. The mass is accumulated rather than
                # assigned so the row still sums to exactly 1 regardless of
                # the order in which the actions are visited.
                transition_matrix[current_state, current_state] += 0.05

# Output the transition matrix
# print(transition_matrix)


# Policy evaluation in closed form: V_pi = (I - gamma * P_pi)^-1 R_pi

# Transition matrix under the policy
P_pi = transition_matrix

# Reward as a column vector with one entry per state, in the same
# x * grid_size + y order used for the rows of the transition matrix
R_pi = reward_grid.reshape((grid_size * grid_size, 1))

# Identity matrix I, one row/column per state
I = np.identity(grid_size * grid_size)

# Discount factor
gamma = 0.95

# Invert once and reuse the result for both the debug print and V_pi
inverse = np.linalg.inv(I - gamma * P_pi)
print("V_pi——invert", inverse)

# Value function V_pi as a column vector
V_pi = inverse.dot(R_pi)

# Reshape back to the 2-D grid layout so values line up with cells.
# The target shape must be (grid_size, grid_size); a bare grid_size would
# ask for a length-5 vector and raise ValueError for 25 values.
V_pi_2d = V_pi.reshape((grid_size, grid_size))

# Print the value of every state
print(V_pi_2d)
# Editor is loading...