# Q-learning example: tabular Q-learning on a 4x4 grid world.
import numpy as np
import matplotlib.pyplot as plt

# Environment definition: a 4x4 grid world, states numbered 0..15 row-major,
# with the agent starting at state 0 and the goal in the bottom-right corner.
n_states = 16  # 4x4 Grid World (one state per cell)
n_actions = 4  # UP, DOWN, LEFT, RIGHT (encoded 0..3 in that order)
goal_state = 15  # Terminal state (bottom-right cell); episodes end on arrival
Q_table = np.zeros((n_states, n_actions))  # Initialize Q-table: Q[s, a] action-value estimates

# Parameters
learning_rate = 0.8      # alpha: step size of each Q-value update
discount_factor = 0.95   # gamma: weight of future rewards in the TD target
epsilon = 1.0  # Exploration probability (starts fully random)
epsilon_decay = 0.99     # multiplicative decay applied once per episode
epsilon_min = 0.01       # floor so the agent never stops exploring entirely
epochs = 1000            # number of training episodes

def get_next_state(state, action):
    """Apply *action* to *state* on the 4x4 grid and return the new state index.

    States are numbered row-major (state = row * 4 + col). Moves that would
    leave the grid are clamped to the border, so walking into a wall leaves
    the agent in place. An unrecognized action is a no-op.
    """
    row, col = divmod(state, 4)
    if action == 0:       # UP
        row = max(0, row - 1)
    elif action == 1:     # DOWN
        row = min(3, row + 1)
    elif action == 2:     # LEFT
        col = max(0, col - 1)
    elif action == 3:     # RIGHT
        col = min(3, col + 1)
    return row * 4 + col

# Q-Learning training loop: run `epochs` episodes, each starting at state 0
# and ending when the goal is reached, updating Q_table with the TD rule.
rewards_per_episode = []
for episode in range(epochs):
    state = 0  # every episode begins in the top-left corner
    total_reward = 0

    while state != goal_state:
        # Epsilon-greedy policy: explore with probability epsilon,
        # otherwise take the current best-known action.
        exploring = np.random.rand() < epsilon
        action = np.random.randint(0, n_actions) if exploring else np.argmax(Q_table[state])

        next_state = get_next_state(state, action)
        # +1 for reaching the goal, small step penalty otherwise.
        reward = 1.0 if next_state == goal_state else -0.1
        total_reward += reward

        # Temporal-difference update toward reward + discounted best future value.
        td_target = reward + discount_factor * np.max(Q_table[next_state])
        Q_table[state, action] += learning_rate * (td_target - Q_table[state, action])

        state = next_state

    # Anneal exploration once per episode, never below the floor.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    rewards_per_episode.append(total_reward)

# Learning curve: total reward collected in each training episode.
plt.plot(rewards_per_episode)
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.title("Q-Learning Performance Over Episodes")
plt.grid()
plt.show()
