From 4b8cc2a020aefd991d3c49d695ba2b73cd4e3937 Mon Sep 17 00:00:00 2001
From: Karson Chrispens <33336327+k-chrispens@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:16:38 -0700
Subject: [PATCH] feat(eval): canonicalize mixed-altloc modified residues

---
 src/sampleworks/eval/structure_utils.py | 79 ++++++++++++++++++++++++-
 tests/eval/test_structure_utils.py      | 39 ++++++++++++
 2 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/src/sampleworks/eval/structure_utils.py b/src/sampleworks/eval/structure_utils.py
index ca8cf836..ac2f3c90 100644
--- a/src/sampleworks/eval/structure_utils.py
+++ b/src/sampleworks/eval/structure_utils.py
@@ -1,5 +1,6 @@
 import re
 import traceback
+from collections import defaultdict
 from collections.abc import Sequence
 from dataclasses import dataclass, replace
 from pathlib import Path
@@ -8,6 +9,8 @@
 import numpy as np
 import torch
 from atomworks.io.transforms.atom_array import ensure_atom_array_stack
+from atomworks.io.utils.ccd import ChainType, UNKNOWN_AA
+from atomworks.io.utils.sequence import get_1_from_3_letter_code, get_3_from_1_letter_code
 from biotite.structure import AtomArray, AtomArrayStack, from_template
 from loguru import logger
 from sampleworks.core.rewards.protocol import RewardInputs
@@ -427,6 +430,75 @@ def get_asym_unit_from_structure(
     return atom_array
 
 
+def _closest_canonical_amino_acid(res_name: str) -> str | None:
+    """Map a (possibly modified) residue name to its canonical parent amino acid.
+
+    Uses atomworks' mapping, e.g. ``CSO -> CYS``, ``MSE -> MET``. Standard amino
+    acids map to themselves and "residues" with no amino-acid (ligands, waters) return ``None``.
+
+    Parameters
+    ----------
+    res_name : str
+        Three letter amino acid name.
+
+    Returns
+    -------
+    str | None
+        Canonical three letter amino acid name, or ``None`` if HETATM.
+    """
+    one_letter = get_1_from_3_letter_code(
+        res_name, ChainType.POLYPEPTIDE_L, use_closest_canonical=True
+    )
+    parent = get_3_from_1_letter_code(one_letter, ChainType.POLYPEPTIDE_L)
+    return None if parent == UNKNOWN_AA else parent
+
+
+def canonicalize_mixed_altloc_residues(
+    atom_array: AtomArray | AtomArrayStack,
+) -> AtomArray | AtomArrayStack:
+    """Rename modified amino acids that share a residue position with a canonical residue.
+
+    At residue positions whose altlocs disagree on ``res_name``, such as a
+    canonical amino acid with compositional heterogeneity as a modified form (e.g. CYS with an
+    alternate CSO), we rename the modified records to their canonical
+    amino acid (via :func:`_closest_canonical_amino_acid`) and clear the ``hetero`` flag in the
+    biotite AtomArray annotation. The conformers then share ``res_name``/``hetero``, so
+    :func:`map_altlocs_to_stack` can stack them. Running ``filter_to_common_atoms`` afterwards
+    helps drop modified atoms such as the CSO hydroxyl oxygen.
+
+    Positions with a single, consistent ``res_name`` are left untouched, so cases like a MSE with
+    no compositional heterogeneity are preserved.
+
+    Parameters
+    ----------
+    atom_array : AtomArray | AtomArrayStack
+        Reference structure loaded with altlocs
+
+    Returns
+    -------
+    AtomArray | AtomArrayStack
+        Copy of the input with mixed-altloc modified residues renamed to their closest canonical.
+    """
+    out = atom_array.copy()
+    res_names = out.res_name.tolist()
+    ins_codes = getattr(out, "ins_code", np.full(out.array_length(), "")).tolist()
+    positions = list(zip(out.chain_id.tolist(), out.res_id.tolist(), ins_codes))
+
+    res_names_by_position: dict[tuple, set[str]] = defaultdict(set)
+    for position, res_name in zip(positions, res_names):
+        res_names_by_position[position].add(res_name)
+
+    parent_cache: dict[str, str | None] = {}
+    for i, (position, res_name) in enumerate(zip(positions, res_names)):
+        if len(res_names_by_position[position]) == 1:
+            continue  # single consistent res_name means it doesn't have comp het
+        parent = parent_cache.setdefault(res_name, _closest_canonical_amino_acid(res_name))
+        if parent is not None and parent != res_name:
+            out.res_name[i] = parent
+            out.hetero[i] = False
+    return out
+
+
 def get_reference_atomarraystack(
     protein_config: ProteinConfig, altloc_occupancies: dict[str, float]
 ) -> tuple[Path | str | None, AtomArrayStack | None]:
@@ -443,7 +515,12 @@ def get_reference_atomarraystack(
     ref_path = protein_config.get_reference_structure_path(altloc_occupancies)
     if ref_path is None:
         return None, None
-    ref_struct = load_structure_with_altlocs(ref_path)
+
+    # A modified residue (e.g. CSO) recorded as an altloc at the same position as its canonical
+    # form (e.g. CYS) makes map_altlocs_to_stack fail: the conformers disagree on res_name/hetero
+    # so biotite.stack() rejects them. Canonicalize these residues to get map_altlocs_to_stack to
+    # work
+    ref_struct = canonicalize_mixed_altloc_residues(load_structure_with_altlocs(ref_path))
     if ref_struct.coord is None:
         raise ValueError(f"Unable to load coordinates from {ref_path} Please check file")
     if isinstance(ref_struct, AtomArray):
diff --git a/tests/eval/test_structure_utils.py b/tests/eval/test_structure_utils.py
index 4f0de785..0cae0ce4 100644
--- a/tests/eval/test_structure_utils.py
+++ b/tests/eval/test_structure_utils.py
@@ -8,7 +8,9 @@
 from biotite.structure import AtomArray, AtomArrayStack
 from sampleworks.eval.eval_dataclasses import ProteinConfig
 from sampleworks.eval.structure_utils import (
+    _closest_canonical_amino_acid,
     apply_selection,
+    canonicalize_mixed_altloc_residues,
     extract_selection_coordinates,
     get_asym_unit_from_structure,
     get_reference_atomarraystack,
@@ -343,3 +345,40 @@ def test_with_real_structure(self, resources_dir):
         assert coords.ndim == 2
         assert coords.shape[1] == 3
         assert np.isfinite(coords).all()
+
+
+class TestCanonicalizeMixedAltlocResidues:
+    """Tests for canonicalize_mixed_altloc_residues / _closest_canonical_amino_acid."""
+
+    @staticmethod
+    def _mixed_array() -> AtomArray:
+        # (A,10): canonical CYS + modified CSO compositional heterogeneity
+        # (A,11): MSE only, single res_name, must be left untouched
+        aa = AtomArray(3)
+        aa.coord = np.zeros((3, 3), dtype=np.float32)
+        aa.set_annotation("chain_id", np.array(["A", "A", "A"]))
+        aa.set_annotation("res_id", np.array([10, 10, 11]))
+        aa.set_annotation("res_name", np.array(["CYS", "CSO", "MSE"]))
+        aa.set_annotation("hetero", np.array([False, True, True]))
+        aa.set_annotation("atom_name", np.array(["CA", "CA", "CA"]))
+        return aa
+
+    @pytest.mark.parametrize(
+        "res_name,expected",
+        [("CSO", "CYS"), ("MSE", "MET"), ("ALA", "ALA"), ("HOH", None)],
+    )
+    def test_closest_canonical_amino_acid(self, res_name, expected):
+        assert _closest_canonical_amino_acid(res_name) == expected
+
+    def test_renames_modified_at_mixed_position(self):
+        result = canonicalize_mixed_altloc_residues(self._mixed_array())
+        res_name = cast(np.ndarray, result.res_name)
+        hetero = cast(np.ndarray, result.hetero)
+        assert list(res_name[:2]) == ["CYS", "CYS"]  # CSO -> CYS
+        assert not hetero[0] and not hetero[1]  # hetero cleared
+        assert res_name[2] == "MSE" and hetero[2]  # MSE untouched
+
+    def test_does_not_mutate_input(self):
+        arr = self._mixed_array()
+        canonicalize_mixed_altloc_residues(arr)
+        assert cast(np.ndarray, arr.res_name)[1] == "CSO"