# Define the states and actions
states = ['standing', 'moving', 'fallen']
actions = ['move fast', 'move slow']
# Define the transition probabilities and rewards for each (action, state) pair
actions_prob = {
    'move fast': {
        'standing': {'standing': 0, 'moving': 0.6, 'fallen': 0.4},
        'moving': {'standing': 0, 'moving': 0.8, 'fallen': 0.2},
        'fallen': {'standing': 0, 'moving': 0, 'fallen': 0}
    },
    'move slow': {
        'standing': {'standing': 0, 'moving': 1, 'fallen': 0},
        'moving': {'standing': 0, 'moving': 1, 'fallen': 0},
        'fallen': {'standing': 0.4, 'moving': 0, 'fallen': 0.6}
    }
}
actions_reward = {
    'move fast': {
        'standing': {'standing': 0, 'moving': 2, 'fallen': -1},
        'moving': {'standing': 0, 'moving': 2, 'fallen': -1},
        'fallen': {'standing': 0, 'moving': 0, 'fallen': 0}
    },
    'move slow': {
        'standing': {'standing': 0, 'moving': 1, 'fallen': 0},
        'moving': {'standing': 0, 'moving': 1, 'fallen': 0},
        'fallen': {'standing': 1, 'moving': 0, 'fallen': -1}
    }
}
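# (Added sanity check, not in the original: verify each transition row is a
# probability distribution. A row summing to 0, like 'move fast' from
# 'fallen', is read here as the action being unavailable in that state.)
for a in actions:
    for s in states:
        total = sum(actions_prob[a][s].values())
        assert total == 0 or abs(total - 1) < 1e-9, \
            f"transition probabilities for ({a}, {s}) sum to {total}"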
# Initialize the state-value function
V = {
    'standing': 0,
    'moving': 0,
    'fallen': 0
}
# Value iteration with the Bellman optimality backup:
#   V(s) <- max_a sum_s' p(s'|s,a) * (r(s,a,s') + gamma * V(s'))
gamma = 1
num_iterations = 5
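# (Added worked example, not in the original: one backup for 'standing'
# under 'move fast', with V initialized to all zeros and gamma = 1:
#   Q(standing, move fast) = 0.6*(2 + 0) + 0.4*(-1 + 0) = 0.8.)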
for k in range(num_iterations):
    new_V = {}
    for state in states:
        values = []
        for action in actions:
            action_prob = actions_prob[action][state]
            action_reward = actions_reward[action][state]
            # Expected return of taking `action` in `state`
            value = 0
            for next_state in states:
                value += action_prob[next_state] * (
                    action_reward[next_state] + gamma * V[next_state])
            values.append(value)
        # Greedy action for this state under the current value estimates
        print(f"{state}: {actions[values.index(max(values))]}")
        new_V[state] = max(values)
    V = new_V
    print(f'k={k+1}---------')
# Print the final value estimates
for state in states:
    print(f"State: {state}, Value: {V[state]}")
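# (Added sketch, not in the original: extract the greedy policy implied by
# the final value estimates. `greedy_policy` is a hypothetical helper.)
def greedy_policy(V):
    policy = {}
    for state in states:
        # Q-value of each action in this state, then keep the argmax
        q = {action: sum(actions_prob[action][state][s2]
                         * (actions_reward[action][state][s2] + gamma * V[s2])
                         for s2 in states)
             for action in actions}
        policy[state] = max(q, key=q.get)
    return policy

print(greedy_policy(V))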