Skip to content

Commit 21a3199

Browse files
[Feature] allow different number of discrete actions for each action dimension (#119)
* support custom nvec for discrete actions Changes: - add action_nvec property to the Dynamics ABC (defaults to 3s as before when not overridden) - add Agent.action_nvec - In Environment: update get_agent_action_space, get_random_action, _set_action to support Agent.action_nvec * add composite dynamics Changes: - add simulator.dynamics.composite with Composite class - add Rotation to simulator.dynamics.holonomic_with_rot * revert changes to dynamics, improve logic for bc-compatibility * Apply suggestions from code review Co-authored-by: Matteo Bettini <55539777+matteobettini@users.noreply.github.com> * fix discrete to multi-discrete mapping, add tests * Apply suggestions from code review Co-authored-by: Matteo Bettini <55539777+matteobettini@users.noreply.github.com> * improve tests --------- Co-authored-by: Matteo Bettini <55539777+matteobettini@users.noreply.github.com>
1 parent fe9c3b9 commit 21a3199

3 files changed

Lines changed: 230 additions & 33 deletions

File tree

tests/test_vmas.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
# Copyright (c) 2022-2024.
22
# ProrokLab (https://www.proroklab.org/)
33
# All rights reserved.
4+
import math
45
import os
6+
import random
57
import sys
68
from pathlib import Path
79

@@ -26,6 +28,14 @@ def scenario_names():
2628
return scenarios
2729

2830

31+
def random_nvecs(count, l_min=2, l_max=6, n_min=2, n_max=6, seed=0):
32+
random.seed(seed)
33+
return [
34+
[random.randint(n_min, n_max) for _ in range(random.randint(l_min, l_max))]
35+
for _ in range(count)
36+
]
37+
38+
2939
def test_all_scenarios_included():
3040
from vmas import debug_scenarios, mpe_scenarios, scenarios
3141

@@ -70,6 +80,163 @@ def test_multi_discrete_actions(scenario, num_envs=10, n_steps=10):
7080
env.step(env.get_random_actions())
7181

7282

83+
@pytest.mark.parametrize("scenario", scenario_names())
84+
@pytest.mark.parametrize("multidiscrete_actions", [True, False])
85+
def test_discrete_action_nvec(scenario, multidiscrete_actions, num_envs=10, n_steps=5):
86+
env = make_env(
87+
scenario=scenario,
88+
num_envs=num_envs,
89+
seed=0,
90+
multidiscrete_actions=multidiscrete_actions,
91+
continuous_actions=False,
92+
)
93+
if (
94+
type(env.scenario).process_action
95+
is not vmas.simulator.scenario.BaseScenario.process_action
96+
):
97+
pytest.skip("Scenario uses a custom process_action method.")
98+
99+
random.seed(0)
100+
for agent in env.world.agents:
101+
agent.discrete_action_nvec = [
102+
random.randint(2, 6) for _ in range(agent.action_size)
103+
]
104+
env.action_space = env.get_action_space()
105+
106+
def to_multidiscrete(action, nvec):
107+
action_multi = []
108+
for i in range(len(nvec)):
109+
n = math.prod(nvec[i + 1 :])
110+
action_multi.append(action // n)
111+
action = action % n
112+
return torch.stack(action_multi, dim=-1)
113+
114+
def full_nvec(agent, world):
115+
return list(agent.discrete_action_nvec) + (
116+
[world.dim_c] if not agent.silent and world.dim_c != 0 else []
117+
)
118+
119+
for _ in range(n_steps):
120+
actions = env.get_random_actions()
121+
122+
# Check that generated actions are in the action space
123+
for a_batch, s in zip(actions, env.action_space.spaces):
124+
for a in a_batch:
125+
assert a.numpy() in s
126+
127+
env.step(actions)
128+
129+
if not multidiscrete_actions:
130+
actions = [
131+
to_multidiscrete(a.squeeze(-1), full_nvec(agent, env.world))
132+
for a, agent in zip(actions, env.world.policy_agents)
133+
]
134+
135+
# Check that discrete action to continuous control mapping is correct.
136+
for i_a, agent in enumerate(env.world.policy_agents):
137+
for i, n in enumerate(agent.discrete_action_nvec):
138+
a = actions[i_a][:, i]
139+
u = agent.action.u[:, i]
140+
U = agent.action.u_range_tensor[i]
141+
k = agent.action.u_multiplier_tensor[i]
142+
for aj, uj in zip(a, u):
143+
assert aj in range(
144+
n
145+
), f"discrete action {aj} not in [0,{n-1}] (n={n}, U={U}, k={k})"
146+
if n % 2 != 0:
147+
assert (
148+
aj != 0 or uj == 0
149+
), f"discrete action {aj} maps to control {uj} (n={n}), U={U}, k={k})"
150+
assert (aj < 1 or aj > n // 2) or torch.isclose(
151+
uj / k, (2 * U * (aj - 1)) / (n - 1) - U
152+
), f"discrete action {aj} maps to control {uj} (n={n}, U={U}, k={k})"
153+
assert (aj <= n // 2) or torch.isclose(
154+
uj / k, 2 * U * (aj / (n - 1)) - U
155+
), f"discrete action {aj} maps to control {uj} (n={n}), U={U}, k={k})"
156+
else:
157+
assert torch.isclose(
158+
uj / k, 2 * U * (aj / (n - 1)) - U
159+
), f"discrete action {aj} maps to control {uj} (n={n}), U={U}, k={k})"
160+
161+
162+
@pytest.mark.parametrize(
163+
"nvecs", list(zip(random_nvecs(10, seed=0), random_nvecs(10, seed=42)))
164+
)
165+
def test_discrete_action_nvec_discrete_to_multi(
166+
nvecs, scenario="transport", num_envs=10, n_steps=5
167+
):
168+
kwargs = {
169+
"scenario": scenario,
170+
"num_envs": num_envs,
171+
"seed": 0,
172+
"continuous_actions": False,
173+
}
174+
env = make_env(**kwargs, multidiscrete_actions=False)
175+
env_multi = make_env(**kwargs, multidiscrete_actions=True)
176+
if (
177+
type(env.scenario).process_action
178+
is not vmas.simulator.scenario.BaseScenario.process_action
179+
):
180+
pytest.skip("Scenario uses a custom process_action method.")
181+
182+
def set_nvec(agent, nvec):
183+
agent.action_size = len(nvec)
184+
agent.discrete_action_nvec = nvec
185+
agent.action.action_size = agent.action_size
186+
187+
random.seed(0)
188+
for agent, agent_multi, nvec in zip(
189+
env.world.policy_agents, env_multi.world.policy_agents, nvecs
190+
):
191+
set_nvec(agent, nvec)
192+
set_nvec(agent_multi, nvec)
193+
env.action_space = env.get_action_space()
194+
env_multi.action_space = env.get_action_space()
195+
196+
def full_nvec(agent, world):
197+
return list(agent.discrete_action_nvec) + (
198+
[world.dim_c] if not agent.silent and world.dim_c != 0 else []
199+
)
200+
201+
def full_action_size(agent, world):
202+
return len(full_nvec(agent, world))
203+
204+
for _ in range(n_steps):
205+
actions_multi = env_multi.get_random_actions()
206+
prodss = [
207+
[
208+
math.prod(full_nvec(agent, env.world)[i + 1 :])
209+
for i in range(full_action_size(agent, env.world))
210+
]
211+
for agent in env.world.policy_agents
212+
]
213+
# Compute the expected mapping from multi-discrete to discrete
214+
actions = [
215+
(a_multi * torch.tensor(prods)).sum(dim=1)
216+
for a_multi, prods in zip(actions_multi, prodss)
217+
]
218+
219+
env_multi.step(actions_multi)
220+
env.step(actions)
221+
222+
# Check that both discrete and multi-discrete actions result in the
223+
# same control value
224+
for agent, agent_multi, action, action_multi in zip(
225+
env.world.policy_agents,
226+
env_multi.world.policy_agents,
227+
actions,
228+
actions_multi,
229+
):
230+
U = agent.action.u_range_tensor
231+
k = agent.action.u_multiplier_tensor
232+
for u, u_multi, a, a_multi in zip(
233+
agent.action.u, agent_multi.action.u, action, action_multi
234+
):
235+
assert torch.allclose(
236+
u, u_multi
237+
), f"{u} != {u_multi} (nvec={agent.discrete_action_nvec}, a={a}, a_multi={a_multi}, U={U}, k={k})"
238+
239+
73240
@pytest.mark.parametrize("scenario", scenario_names())
74241
def test_non_dict_spaces_actions(scenario, num_envs=10, n_steps=10):
75242
env = make_env(

vmas/simulator/core.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,9 @@ def __init__(
862862
render_action: bool = False,
863863
dynamics: Dynamics = None, # Defaults to holonomic
864864
action_size: int = None, # Defaults to what required by the dynamics
865+
discrete_action_nvec: List[
866+
int
867+
] = None, # Defaults to 3-way discretization if discrete actions are chosen (stay, decrement, increment)
865868
):
866869
super().__init__(
867870
name,
@@ -884,6 +887,17 @@ def __init__(
884887
if obs_range == 0.0:
885888
assert sensors is None, f"Blind agent cannot have sensors, got {sensors}"
886889

890+
if action_size is not None and discrete_action_nvec is not None:
891+
if action_size != len(discrete_action_nvec):
892+
raise ValueError(
893+
f"action_size {action_size} is inconsistent with discrete_action_nvec {discrete_action_nvec}"
894+
)
895+
if discrete_action_nvec is not None:
896+
if not all(n > 1 for n in discrete_action_nvec):
897+
raise ValueError(
898+
f"All values in discrete_action_nvec must be greater than 1, got {discrete_action_nvec}"
899+
)
900+
887901
# cannot observe the world
888902
self._obs_range = obs_range
889903
# observation noise
@@ -914,9 +928,16 @@ def __init__(
914928
# Dynamics
915929
self.dynamics = dynamics if dynamics is not None else Holonomic()
916930
# Action
917-
self.action_size = (
918-
action_size if action_size is not None else self.dynamics.needed_action_size
919-
)
931+
if action_size is not None:
932+
self.action_size = action_size
933+
elif discrete_action_nvec is not None:
934+
self.action_size = len(discrete_action_nvec)
935+
else:
936+
self.action_size = self.dynamics.needed_action_size
937+
if discrete_action_nvec is None:
938+
self.discrete_action_nvec = [3] * self.action_size
939+
else:
940+
self.discrete_action_nvec = discrete_action_nvec
920941
self.dynamics.agent = self
921942
self._action = Action(
922943
u_range=u_range,

vmas/simulator/environment/environment.py

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) 2022-2024.
22
# ProrokLab (https://www.proroklab.org/)
33
# All rights reserved.
4+
import math
45
import random
56
from ctypes import byref
67
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
@@ -334,13 +335,13 @@ def get_agent_action_space(self, agent: Agent):
334335
dtype=np.float32,
335336
)
336337
elif self.multidiscrete_actions:
337-
actions = [3] * agent.action_size + (
338+
actions = agent.discrete_action_nvec + (
338339
[self.world.dim_c] if not agent.silent and self.world.dim_c != 0 else []
339340
)
340341
return spaces.MultiDiscrete(actions)
341342
else:
342343
return spaces.Discrete(
343-
3**agent.action_size
344+
math.prod(agent.discrete_action_nvec)
344345
* (
345346
self.world.dim_c
346347
if not agent.silent and self.world.dim_c != 0
@@ -503,41 +504,49 @@ def _set_action(self, action, agent):
503504
if not self.multidiscrete_actions:
504505
# This bit of code translates the discrete action (taken from a space that
505506
# is the cartesian product of all action spaces) into a multi discrete action.
506-
# For example, if agent.action_size=4, it will mean that the agent will have
507-
# 4 actions each with 3 possibilities (stay, decrement, increment).
508-
# The env will have a space Discrete(3**4).
509-
# This code will translate the action (with shape [n_envs,1] and range [0,3**4)) to an
510-
# action with shape [n_envs,4] and range [0,3).
511-
n_actions = self.get_agent_action_space(agent).n
512-
action_range = torch.arange(n_actions, device=self.device).expand(
513-
self.world.batch_dim, n_actions
514-
)
515-
physical_action = action
516-
action_range = torch.where(action_range == physical_action, 1.0, 0.0)
517-
action_range = action_range.view(
518-
(self.world.batch_dim,)
519-
+ (3,) * agent.action_size
520-
+ (self.world.dim_c,)
521-
* (1 if not agent.silent and self.world.dim_c != 0 else 0)
507+
# This is done by iteratively taking the modulo of the action and dividing by the
508+
# number of actions in the current action space, which treats the action as if
509+
# it was the "flat index" of the multi-discrete actions. E.g. if we have
510+
# nvec = [3,2], action 0 corresponds to the actions [0,0],
511+
# action 1 corresponds to the action [0,1], action 2 corresponds
512+
# to the action [1,0], action 3 corresponds to the action [1,1], etc.
513+
flat_action = action.squeeze(-1)
514+
actions = []
515+
nvec = list(agent.discrete_action_nvec) + (
516+
[self.world.dim_c]
517+
if not agent.silent and self.world.dim_c != 0
518+
else []
522519
)
523-
action = action_range.nonzero()[:, 1:]
520+
for i in range(len(nvec)):
521+
n = math.prod(nvec[i + 1 :])
522+
actions.append(flat_action // n)
523+
flat_action = flat_action % n
524+
action = torch.stack(actions, dim=-1)
524525

525526
# Now we have an action with shape [n_envs, action_size+comms_actions]
526-
for _ in range(agent.action_size):
527-
physical_action = action[:, action_index].unsqueeze(-1)
527+
for n in agent.discrete_action_nvec:
528+
physical_action = action[:, action_index]
528529
self._check_discrete_action(
529-
physical_action,
530+
physical_action.unsqueeze(-1),
530531
low=0,
531-
high=3,
532+
high=n,
532533
type="physical",
533534
)
534-
535-
arr1 = physical_action == 1
536-
arr2 = physical_action == 2
537-
538-
disc_action_value = agent.action.u_range_tensor[action_index]
539-
agent.action.u[:, action_index] -= disc_action_value * arr1.squeeze(-1)
540-
agent.action.u[:, action_index] += disc_action_value * arr2.squeeze(-1)
535+
u_max = agent.action.u_range_tensor[action_index]
536+
# For odd n we want the first action to always map to u=0, so
537+
# we swap 0 values with the middle value, and shift the first
538+
# half of the remaining values by -1.
539+
if n % 2 != 0:
540+
stay = physical_action == 0
541+
decrement = (physical_action > 0) & (physical_action <= n // 2)
542+
physical_action[stay] = n // 2
543+
physical_action[decrement] -= 1
544+
# We know u must be in [-u_max, u_max], and we know action is
545+
# in [0, n-1]. Conversion steps: [0, n-1] -> [0, 1] -> [0, 2*u_max] -> [-u_max, u_max]
546+
# E.g. action 0 -> -u_max, action n-1 -> u_max, action 1 -> -u_max + 2*u_max/(n-1)
547+
agent.action.u[:, action_index] = (physical_action / (n - 1)) * (
548+
2 * u_max
549+
) - u_max
541550

542551
action_index += 1
543552

0 commit comments

Comments (0)