# Define the states and actions
states = ['standing', 'moving', 'fallen']
actions = ['move fast', 'move slow']

# Transition probabilities P(s' | s, a) for each action and state
actions_prob = {
    'move fast': {
        'standing': {'standing': 0, 'moving': 0.6, 'fallen': 0.4},
        'moving': {'standing': 0, 'moving': 0.8, 'fallen': 0.2},
        'fallen': {'standing': 0, 'moving': 0, 'fallen': 0}
    },
    'move slow': {
        'standing': {'standing': 0, 'moving': 1, 'fallen': 0},
        'moving': {'standing': 0, 'moving': 1, 'fallen': 0},
        'fallen': {'standing': 0.4, 'moving': 0, 'fallen': 0.6}
    }
}
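
# Sanity check (added; not in the original paste): each row of transition
# probabilities should sum to 1, or to 0 when the action is unavailable in
# that state (e.g. 'move fast' from 'fallen').
for a in actions:
    for s in states:
        total = sum(actions_prob[a][s].values())
        assert total == 0 or abs(total - 1.0) < 1e-9, (
            f"Probabilities for action {a!r} in state {s!r} sum to {total}")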

# Rewards R(s, a, s') for each transition under each action
actions_reward = {
    'move fast': {
        'standing': {'standing': 0, 'moving': 2, 'fallen': -1},
        'moving': {'standing': 0, 'moving': 2, 'fallen': -1},
        'fallen': {'standing': 0, 'moving': 0, 'fallen': 0}
    },
    'move slow': {
        'standing': {'standing': 0, 'moving': 1, 'fallen': 0},
        'moving': {'standing': 0, 'moving': 1, 'fallen': 0},
        'fallen': {'standing': 1, 'moving': 0, 'fallen': -1}
    }
}

# Initialize the state-value function to zero
V = {
    'standing': 0,
    'moving': 0,
    'fallen': 0
}

# Iterate the Bellman optimality equation (value iteration)
gamma = 1  # undiscounted; with V initialized to 0, k sweeps yield the optimal k-step return
num_iterations = 5
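
# Worked check (added, not in the original): with V = 0 everywhere, the first
# backup for action 'move fast' in state 'standing' is
#   0.6 * (2 + gamma * 0) + 0.4 * (-1 + gamma * 0) = 1.2 - 0.4 = 0.8
# while 'move slow' gives 1.0 * (1 + gamma * 0) = 1.0, so 'move slow' wins at k = 1.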


for k in range(num_iterations):
    new_V = {}
    for state in states:
        values = []
        for action in actions:
            action_prob = actions_prob[action][state]
            action_reward = actions_reward[action][state]
            # Expected return of taking `action` in `state`:
            # sum over s' of P(s' | s, a) * (R(s, a, s') + gamma * V(s'))
            value = 0
            for next_state in states:
                value += action_prob[next_state] * (action_reward[next_state]
                                                    + gamma * V[next_state])
            values.append(value)
        # Report the greedy action under the current value estimates
        print(f"{state}: best action = {actions[values.index(max(values))]}")
        new_V[state] = max(values)
    V = new_V
    print(f'k={k+1}---------')

# Print the final value estimates
for state in states:
    print(f"State: {state}, Value: {V[state]}")