4343class SPDBatchNormLie (nn .Module ):
4444 r"""Lie Group Batch Normalization for SPD matrices.
4545
46- This class implements the SPD instance of the LieBN framework, using
47- the three Lie group structures on the SPD manifold, corresponding to the AIM, LEM, and LCM.
46+ Implements the LieBN framework :cite:p:`chen2024liebn` for SPD manifolds.
47+ Unlike :class:`SPDBatchNormMeanVar`, which normalizes under a single
48+ Riemannian metric (AIRM), this layer exploits the **Lie group structure**
49+ of three classical SPD geometries to define centering, scaling, and biasing
50+ as group-theoretic operations with formal statistical guarantees.
51+
52+ **Algorithm.**
53+ Given a batch :math:`\{P_i\}_{i=1}^N \subset \mathcal{S}_{++}^n`, the
54+ forward pass applies three steps in the Lie algebra selected by ``metric``:
55+
56+ 1. **Centering** -- translate the batch mean :math:`M` to the group
57+ identity :math:`E` via the inverse left translation:
58+
59+ .. math::
60+
61+ \bar{P}_i = L_{M_\odot^{-1}}(P_i)
62+
63+ 2. **Scaling** -- normalize the Fréchet variance :math:`v^2` with a
64+ learnable shift :math:`s \in \mathbb{R}_{>0}`:
65+
66+ .. math::
67+
68+ \hat{P}_i = \operatorname{Exp}_E
69+ \!\left[\frac{s}{\sqrt{v^2 + \epsilon}}\,
70+ \operatorname{Log}_E(\bar{P}_i)\right]
71+
72+ 3. **Biasing** -- translate to the learnable SPD parameter :math:`B`:
73+
74+ .. math::
75+
76+ \tilde{P}_i = L_B(\hat{P}_i)
77+
78+ **Theoretical guarantees** (Proposition 4.2 of the paper):
79+
80+ * *Mean control*: after centering and biasing with :math:`B = E`,
81+ the Fréchet mean of the output batch equals :math:`E`.
82+ * *Variance control*: after scaling, the output dispersion satisfies
83+ :math:`\sum_i w_i\,d^2(\hat{P}_i, E) = s^2`.
84+
85+ **Supported metrics.**
86+ The ``metric`` parameter selects one of three Lie group structures, each
87+ inducing a family of parameterized metrics via the power deformation
88+ :math:`\mathrm{P}_\theta`. The table below summarizes how each step is
89+ realized (see Table 2 in :cite:p:`chen2024liebn`):
90+
91+ .. list-table::
92+ :header-rows: 1
93+ :widths: 25 25 25 25
94+
95+ * - Operation
96+ - :math:`(\theta,\alpha,\beta)`-AIM
97+ - :math:`(\alpha,\beta)`-LEM
98+ - :math:`\theta`-LCM
99+ * - Pullback map
100+ - :math:`\mathrm{P}_\theta`
101+ - :math:`\operatorname{mlog}`
102+ - :math:`\psi_{\mathrm{LC}} \circ \mathrm{P}_\theta`
103+ * - Left translation :math:`L_Q(P)`
104+ - :math:`Q^{1/2} P\, Q^{1/2}`
105+ - :math:`P + Q`
106+ - :math:`P + Q`
107+ * - Scaling
108+ - :math:`\operatorname{Exp}_I[s\,\operatorname{Log}_I(P)]`
109+ - :math:`s \cdot P`
110+ - :math:`s \cdot P`
111+ * - Fréchet mean
112+ - Karcher flow
113+ - Arithmetic mean
114+ - Arithmetic mean
115+ * - Running mean update
116+ - AIRM geodesic
117+ - Linear interpolation
118+ - Linear interpolation
119+
120+ **Bi-invariant distance.**
121+ The Fréchet variance uses the :math:`(\alpha, \beta)` bi-invariant metric
122+ (Definition 3 and Eq. 3 of the paper):
123+
124+ .. math::
125+
126+ d^2(P, Q) = \alpha \lVert V \rVert_F^2
127+ + \beta \, g(V)^2
128+
129+ where :math:`V` is the tangent representation (log-map) and
130+ :math:`g(V) = \log\det(P)` for AIM or :math:`\operatorname{tr}(V)`
131+ for LEM/LCM. The variance is normalized by :math:`\theta^2` for AIM
132+ and LCM.
48133
49134 Parameters
50135 ----------
51- n : int
52- Size of the SPD matrices (n x n).
53- metric : str, default="AIM"
54- Lie group invariant metric. Supported values are ``"AIM"``, ``"LEM"``,
55- and ``"LCM"``.
136+ num_features : int
137+ Size of the SPD matrices (:math:`n \times n`).
138+ metric : {"AIM", "LEM", "LCM"}, default="AIM"
139+ Lie group invariant metric.
56140 theta : float, default=1.0
57- Power deformation parameter.
141+ Power deformation parameter :math:`\theta`. When
142+ :math:`\theta = 1`, no deformation is applied.
58143 alpha : float, default=1.0
59- Frobenius norm weight in variance computation .
144+ Frobenius norm weight :math:`\alpha` in the bi-invariant distance .
60145 beta : float, default=0.0
61- Trace/logdet weight in variance computation.
146+ Trace / log-determinant weight :math:`\beta` in the bi-invariant
147+ distance. Must satisfy :math:`\min(\alpha, \alpha + n\beta) > 0`.
62148 momentum : float, default=0.1
63- Running statistics momentum.
149+ Momentum :math:`\gamma` for exponential moving average of running
150+ statistics.
64151 eps : float, default=1e-5
65- Numerical stability constant for variance normalization.
152+ Numerical stability constant :math:`\epsilon` added to the variance
153+ before taking the square root.
66154 n_iter : int, default=1
67- Number of Karcher flow iterations used by the AIM mean.
155+ Number of Karcher flow iterations for the AIM Fréchet mean.
156+ Ignored by LEM and LCM (which use arithmetic means).
68157 congruence : {"cholesky", "eig"}, default="cholesky"
69158 Implementation of the AIM congruence action (centering/biasing).
70159 ``"cholesky"`` uses the Cholesky factor :math:`L` of :math:`P` to
71- compute :math:`L X L^T ` (as in the original LieBN paper).
160+ compute :math:`L X L^\top ` (as in the original LieBN paper).
72161 ``"eig"`` uses eigendecomposition-based :math:`M^{-1/2} X M^{-1/2}`
73162 (matching :func:`~spd_learn.functional.spd_centering`).
74163 Both are mathematically equivalent; Cholesky is typically faster,
@@ -79,11 +168,56 @@ class SPDBatchNormLie(nn.Module):
79168 Device on which to create parameters and buffers.
80169 dtype : torch.dtype, optional
81170 Data type of parameters and buffers.
171+
172+ Attributes
173+ ----------
174+ bias : nn.Parameter
175+ Learnable SPD bias matrix :math:`B \in \mathcal{S}_{++}^n`,
176+ parametrized via :class:`~spd_learn.modules.SymmetricPositiveDefinite`.
177+ Initialized to the identity.
178+ shift : nn.Parameter
179+ Learnable positive scalar :math:`s > 0`,
180+ parametrized via :class:`~spd_learn.modules.PositiveDefiniteScalar`.
181+ Initialized to 1.
182+ running_mean : torch.Tensor
183+ Exponential moving average of the batch Fréchet mean.
184+ running_var : torch.Tensor
185+ Exponential moving average of the batch variance.
186+
187+ See Also
188+ --------
189+ :class:`SPDBatchNormMean` :
190+ Mean-only Riemannian batch normalization (AIRM centering without
191+ variance normalization) :cite:p:`brooks2019riemannian`.
192+ :class:`SPDBatchNormMeanVar` :
193+ Full Riemannian batch normalization under the AIRM
194+ :cite:p:`kobler2022spd`.
195+ :func:`~spd_learn.functional.frechet_mean` :
196+ Fréchet mean via Karcher flow (used internally for AIM).
197+ :func:`~spd_learn.functional.lie_group_variance` :
198+ Bi-invariant Fréchet variance computation.
199+
200+ References
201+ ----------
202+ .. bibliography::
203+ :filter: key == "chen2024liebn"
204+
205+ Examples
206+ --------
207+ >>> import torch
208+ >>> from spd_learn.modules import SPDBatchNormLie
209+ >>> bn = SPDBatchNormLie(num_features=4, metric="AIM")
210+ >>> X = torch.randn(8, 4, 4, dtype=torch.float64)
211+ >>> X = X @ X.mT + 0.1 * torch.eye(4, dtype=torch.float64)
212+ >>> bn = bn.to(dtype=torch.float64)
213+ >>> Y = bn(X)
214+ >>> Y.shape
215+ torch.Size([8, 4, 4])
82216 """
83217
84218 def __init__ (
85219 self ,
86- n ,
220+ num_features ,
87221 metric = "AIM" ,
88222 theta = 1.0 ,
89223 alpha = 1.0 ,
@@ -105,7 +239,7 @@ def __init__(
105239 raise ValueError (
106240 f"congruence must be 'cholesky' or 'eig', got '{ congruence } '"
107241 )
108- self .n = n
242+ self .num_features = num_features
109243 self .metric = metric
110244 self .theta = theta
111245 self .alpha = alpha
@@ -115,18 +249,20 @@ def __init__(
115249 self .n_iter = n_iter
116250 self .congruence = congruence
117251
118- self .bias = nn .Parameter (torch .empty (1 , n , n , device = device , dtype = dtype ))
252+ self .bias = nn .Parameter (
253+ torch .empty (1 , num_features , num_features , device = device , dtype = dtype )
254+ )
119255 self .shift = nn .Parameter (torch .empty ((), device = device , dtype = dtype ))
120256
121257 if metric == "AIM" :
122258 self .register_buffer (
123259 "running_mean" ,
124- torch .eye (n , device = device , dtype = dtype ).unsqueeze (0 ),
260+ torch .eye (num_features , device = device , dtype = dtype ).unsqueeze (0 ),
125261 )
126262 else :
127263 self .register_buffer (
128264 "running_mean" ,
129- torch .zeros (1 , n , n , device = device , dtype = dtype ),
265+ torch .zeros (1 , num_features , num_features , device = device , dtype = dtype ),
130266 )
131267 self .register_buffer ("running_var" , torch .ones ((), device = device , dtype = dtype ))
132268
@@ -238,7 +374,7 @@ def forward(self, X):
238374
def extra_repr(self):
    """Return the hyperparameter summary shown inside ``repr(module)``.

    Follows the ``torch.nn.Module.extra_repr`` convention: a single
    comma-separated string listing the layer's configuration. Reads the
    attributes set in ``__init__`` (``num_features``, ``metric``,
    ``theta``, ``alpha``, ``beta``, ``momentum``, ``congruence``).
    """
    # Reconstructed from the post-change ("+") side of the diff; the
    # extracted text was syntactically invalid (fused diff line numbers,
    # token-splitting whitespace).
    return (
        f"num_features={self.num_features}, metric={self.metric}, "
        f"theta={self.theta}, alpha={self.alpha}, beta={self.beta}, "
        f"momentum={self.momentum}, congruence={self.congruence}"
    )