Source code for marlax.agents.qagent

"""
QAgent implementation using Q-learning for MARLAX.

This agent maintains a Q-table mapping global states to action values
and selects actions via an epsilon-greedy strategy.
"""

import random

from marlax.abstracts import Agent

class QAgent(Agent):
    """
    Q-learning agent that chooses actions based on a Q-table.

    Attributes:
        position (tuple): Agent's (x, y) position on the grid.
        actions (list): List of possible actions.
        q_table (dict): Maps state_key to a dict of action -> value.
    """
    def __init__(self, init_position=None, actions=None):
        """
        Initialize an agent with a starting position and possible actions.

        Args:
            init_position (tuple, optional): The (x, y) starting coordinates.
                Defaults to None.
            actions (list of str, optional): Available actions. Defaults to
                ['stay', 'up', 'down', 'left', 'right'].
        """
        # Use None as the default to avoid sharing one mutable list across
        # all instances.
        if actions is None:
            actions = ['stay', 'up', 'down', 'left', 'right']
        self.position = init_position  # Agent's (x, y) position on the grid.
        self.actions = actions  # List of possible actions.
        # Q-table: maps a global state key (all agents' positions plus the
        # active reward target) to a dict of action -> value. Entries are
        # initialized to 0.0 on first access (see get_max_state and update).
        self.q_table = {}
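
    # Example Q-table entry after one state has been seen (the key layout is
    # hypothetical; the global key encodes all agents' positions plus the
    # active reward target):
    #   q_table[((0, 0), (3, 1), (2, 2))] == {'stay': 0.0, 'up': 0.0,
    #                                         'down': 0.0, 'left': 0.0,
    #                                         'right': 0.0}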
    def choose(self, possible_states, epsilon=0.1, agent_id=0):
        """
        Select an action using an epsilon-greedy policy.

        Args:
            possible_states (list): Non-empty list of global state keys to
                evaluate.
            epsilon (float): Exploration probability. Defaults to 0.1.
            agent_id (int): Identifier for this agent among multiple agents.
                Defaults to 0.

        Returns:
            str: The chosen action.
        """
        if random.random() < epsilon:
            # Explore: pick a uniformly random action.
            return random.choice(self.actions)
        # Exploit: find the reachable state with the highest Q-value and
        # return its greedy action. get_max_state() initializes any missing
        # Q-table entries, so the lookups below cannot raise KeyError.
        best_state = self.get_max_state(possible_states)
        best_possible_q_value = float('-inf')
        best_possible_action = None
        for action in self.actions:
            if self.q_table[best_state][action] > best_possible_q_value:
                best_possible_action = action
                best_possible_q_value = self.q_table[best_state][action]
        return best_possible_action
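
    # Illustrative epsilon-greedy behavior (hypothetical values): with
    # epsilon=0.1, about one call in ten explores with a random action; the
    # rest exploit, e.g. for q_table[s] == {'stay': 0.0, 'up': 0.7,
    # 'down': 0.1, 'left': 0.0, 'right': 0.2} the greedy branch returns 'up'.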
    def get_max_state(self, possible_states):
        """
        Identify the state with the highest max Q-value.

        This helper initializes missing Q-table entries to zero.

        Args:
            possible_states (list): List of global state keys.

        Returns:
            any: The state_key with the highest action value.
        """
        best_possible_q_value = float('-inf')
        best_state = None
        for state_key in possible_states:
            # Lazily initialize unseen states with zero-valued actions.
            if state_key not in self.q_table:
                self.q_table[state_key] = {a: 0.0 for a in self.actions}
            # Best action value achievable from this state.
            best_q_value = max(self.q_table[state_key].values())
            if best_q_value > best_possible_q_value:
                best_possible_q_value = best_q_value
                best_state = state_key
        return best_state
    def update(self, state_key, action, reward, next_state_key,
               alpha=0.1, gamma=0.99):
        r"""
        Update the Q-table entry for a given state and action.

        Applies the Q-learning update rule:

        $$Q(s, a) \leftarrow Q(s, a) + \alpha \bigl(r + \gamma \max_{a'} Q(s', a') - Q(s, a)\bigr)$$

        where:

        - $s$ is the current state,
        - $a$ is the action taken,
        - $r$ is the received reward,
        - $s'$ is the next state,
        - $\max_{a'} Q(s', a')$ is the maximum Q-value over actions in the
          next state.

        Args:
            state_key (any): Current global state key.
            action (str): Action taken by the agent.
            reward (float): Reward received after the action.
            next_state_key (any): Next global state key.
            alpha (float): Learning rate. Defaults to 0.1.
            gamma (float): Discount factor. Defaults to 0.99.
        """
        # Lazily initialize unseen states with zero-valued actions.
        if state_key not in self.q_table:
            self.q_table[state_key] = {a: 0.0 for a in self.actions}
        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = {a: 0.0 for a in self.actions}
        # Temporal-difference update toward r + gamma * max_a' Q(s', a').
        best_next_value = max(self.q_table[next_state_key].values())
        td_target = reward + gamma * best_next_value
        td_error = td_target - self.q_table[state_key][action]
        self.q_table[state_key][action] += alpha * td_error
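
# Minimal usage sketch. The state-key layout and reward values below are
# illustrative assumptions; in MARLAX the environment supplies the global
# state keys and rewards.
if __name__ == '__main__':
    agent = QAgent(init_position=(0, 0))

    # Hypothetical global state keys: (agent position, active reward target).
    state = ((0, 0), (2, 2))
    next_states = [((0, 1), (2, 2)), ((1, 0), (2, 2))]

    # Epsilon-greedy selection among the candidate next states.
    action = agent.choose(next_states, epsilon=0.1)

    # One Q-learning step from a zero-initialized table with reward 1.0:
    # Q(s, a) = 0.0 + 0.1 * (1.0 + 0.99 * 0.0 - 0.0) = 0.1
    agent.update(state, action, reward=1.0, next_state_key=next_states[0])
    print(agent.q_table[state][action])  # 0.1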