Source code for marlax.agents.qagent

"""
QAgent implementation using Q-learning for MARLAX.

This agent maintains a Q-table mapping global states to action values
and selects actions via an epsilon-greedy strategy.
"""

import random

from marlax.abstracts import Agent

class QAgent(Agent):
    """
    Q-learning agent that chooses actions based on a Q-table.

    Attributes:
        position (tuple): Agent's (x, y) position on the grid.
        actions (list): List of possible actions.
        q_table (dict): Maps state_key to a dict of action -> value.
    """
    def __init__(self, init_position=None, actions=None):
        """
        Initialize an agent with a starting position and possible actions.

        Args:
            init_position (tuple, optional): The (x, y) starting coordinates.
                Defaults to None.
            actions (list of str, optional): Available actions. Defaults to
                ['stay', 'up', 'down', 'left', 'right'].
        """
        # Use None as the default to avoid sharing one mutable list across
        # all instances.
        if actions is None:
            actions = ['stay', 'up', 'down', 'left', 'right']
        self.position = init_position  # Agent's (x, y) position on the grid.
        self.actions = actions  # List of possible actions.
        # Q-table: maps a global state key (all agents' positions plus the
        # active reward target) to a dict of action -> value. Entries are
        # initialized to 0.0 on first access (see get_max_state and update).
        self.q_table = {}
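
    # Example Q-table entry after one state has been seen (the key layout is
    # hypothetical; the global key encodes all agents' positions plus the
    # active reward target):
    #   q_table[((0, 0), (3, 1), (2, 2))] == {'stay': 0.0, 'up': 0.0,
    #                                         'down': 0.0, 'left': 0.0,
    #                                         'right': 0.0}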
    def choose(self, possible_states, epsilon=0.1, agent_id=0):
        """
        Select an action using an epsilon-greedy policy.

        Args:
            possible_states (list): Non-empty list of global state keys to
                evaluate.
            epsilon (float): Exploration probability. Defaults to 0.1.
            agent_id (int): Identifier for this agent among multiple agents.
                Defaults to 0.

        Returns:
            str: The chosen action.
        """
        if random.random() < epsilon:
            # Explore: pick a uniformly random action.
            return random.choice(self.actions)
        # Exploit: find the reachable state with the highest Q-value and
        # return its greedy action. get_max_state() initializes any missing
        # Q-table entries, so the lookups below cannot raise KeyError.
        best_state = self.get_max_state(possible_states)
        best_possible_q_value = float('-inf')
        best_possible_action = None
        for action in self.actions:
            if self.q_table[best_state][action] > best_possible_q_value:
                best_possible_action = action
                best_possible_q_value = self.q_table[best_state][action]
        return best_possible_action
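
    # Illustrative epsilon-greedy behavior (hypothetical values): with
    # epsilon=0.1, about one call in ten explores with a random action; the
    # rest exploit, e.g. for q_table[s] == {'stay': 0.0, 'up': 0.7,
    # 'down': 0.1, 'left': 0.0, 'right': 0.2} the greedy branch returns 'up'.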
    def get_max_state(self, possible_states):
        """
        Identify the state with the highest max Q-value.

        This helper initializes missing Q-table entries to zero.

        Args:
            possible_states (list): List of global state keys.

        Returns:
            any: The state_key with the highest action value.
        """
        best_possible_q_value = float('-inf')
        best_state = None
        for state_key in possible_states:
            # Lazily initialize unseen states with zero-valued actions.
            if state_key not in self.q_table:
                self.q_table[state_key] = {a: 0.0 for a in self.actions}
            # Best action value achievable from this state.
            best_q_value = max(self.q_table[state_key].values())
            if best_q_value > best_possible_q_value:
                best_possible_q_value = best_q_value
                best_state = state_key
        return best_state
    def update(self, state_key, action, reward, next_state_key,
               alpha=0.1, gamma=0.99):
        r"""
        Update the Q-table entry for a given state and action.

        Applies the Q-learning update rule:

        $$Q(s, a) \leftarrow Q(s, a) + \alpha \bigl(r + \gamma \max_{a'} Q(s', a') - Q(s, a)\bigr)$$

        where:

        - $s$ is the current state,
        - $a$ is the action taken,
        - $r$ is the received reward,
        - $s'$ is the next state,
        - $\max_{a'} Q(s', a')$ is the maximum Q-value over actions in the
          next state.

        Args:
            state_key (any): Current global state key.
            action (str): Action taken by the agent.
            reward (float): Reward received after the action.
            next_state_key (any): Next global state key.
            alpha (float): Learning rate. Defaults to 0.1.
            gamma (float): Discount factor. Defaults to 0.99.
        """
        # Lazily initialize unseen states with zero-valued actions.
        if state_key not in self.q_table:
            self.q_table[state_key] = {a: 0.0 for a in self.actions}
        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = {a: 0.0 for a in self.actions}
        # Temporal-difference update toward r + gamma * max_a' Q(s', a').
        best_next_value = max(self.q_table[next_state_key].values())
        td_target = reward + gamma * best_next_value
        td_error = td_target - self.q_table[state_key][action]
        self.q_table[state_key][action] += alpha * td_error
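
# Minimal usage sketch. The state-key layout and reward values below are
# illustrative assumptions; in MARLAX the environment supplies the global
# state keys and rewards.
if __name__ == '__main__':
    agent = QAgent(init_position=(0, 0))

    # Hypothetical global state keys: (agent position, active reward target).
    state = ((0, 0), (2, 2))
    next_states = [((0, 1), (2, 2)), ((1, 0), (2, 2))]

    # Epsilon-greedy selection among the candidate next states.
    action = agent.choose(next_states, epsilon=0.1)

    # One Q-learning step from a zero-initialized table with reward 1.0:
    # Q(s, a) = 0.0 + 0.1 * (1.0 + 0.99 * 0.0 - 0.0) = 0.1
    agent.update(state, action, reward=1.0, next_state_key=next_states[0])
    print(agent.q_table[state][action])  # 0.1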