From 4b8cc2a020aefd991d3c49d695ba2b73cd4e3937 Mon Sep 17 00:00:00 2001 From: Karson Chrispens <33336327+k-chrispens@users.noreply.github.com> Date: Wed, 3 Jun 2026 13:16:38 -0700 Subject: [PATCH] feat(eval): canonicalize mixed-altloc modified residues --- src/sampleworks/eval/structure_utils.py | 79 ++++++++++++++++++++++++- tests/eval/test_structure_utils.py | 39 ++++++++++++ 2 files changed, 117 insertions(+), 1 deletion(-) diff --git a/src/sampleworks/eval/structure_utils.py b/src/sampleworks/eval/structure_utils.py index ca8cf836..ac2f3c90 100644 --- a/src/sampleworks/eval/structure_utils.py +++ b/src/sampleworks/eval/structure_utils.py @@ -1,5 +1,6 @@ import re import traceback +from collections import defaultdict from collections.abc import Sequence from dataclasses import dataclass, replace from pathlib import Path @@ -8,6 +9,8 @@ import numpy as np import torch from atomworks.io.transforms.atom_array import ensure_atom_array_stack +from atomworks.io.utils.ccd import ChainType, UNKNOWN_AA +from atomworks.io.utils.sequence import get_1_from_3_letter_code, get_3_from_1_letter_code from biotite.structure import AtomArray, AtomArrayStack, from_template from loguru import logger from sampleworks.core.rewards.protocol import RewardInputs @@ -427,6 +430,75 @@ def get_asym_unit_from_structure( return atom_array +def _closest_canonical_amino_acid(res_name: str) -> str | None: + """Map a (possibly modified) residue name to its canonical parent amino acid. + + Uses atomworks' mapping, e.g. ``CSO -> CYS``, ``MSE -> MET``. Standard amino + acids map to themselves and "residues" with no amino-acid (ligands, waters) return ``None``. + + Parameters + ---------- + res_name : str + Three letter amino acid name. + + Returns + ------- + str | None + Canonical three letter amino acid name, or ``None`` if HETATM. + """ + one_letter = get_1_from_3_letter_code( + res_name, ChainType.POLYPEPTIDE_L, use_closest_canonical=True + ) + parent = get_3_from_1_letter_code(one_letter, ChainType.POLYPEPTIDE_L) + return None if parent == UNKNOWN_AA else parent + + +def canonicalize_mixed_altloc_residues( + atom_array: AtomArray | AtomArrayStack, +) -> AtomArray | AtomArrayStack: + """Rename modified amino acids that share a residue position with a canonical residue. + + At residue positions whose altlocs disagree on ``res_name``, such as a + canonical amino acid with compositional heterogeneity as a modified form (e.g. CYS with an + alternate CSO), we rename the modified records to their canonical + amino acid (via :func:`_closest_canonical_amino_acid`) and clear the ``hetero`` flag in the + biotite AtomArray annotation. The conformers then share ``res_name``/``hetero``, so + :func:`map_altlocs_to_stack` can stack them. Running ``filter_to_common_atoms`` afterwards + helps drop modified atoms such as the CSO hydroxyl oxygen. + + Positions with a single, consistent ``res_name`` are left untouched, so cases like a MSE with + no compositional heterogeneity are preserved. + + Parameters + ---------- + atom_array : AtomArray | AtomArrayStack + Reference structure loaded with altlocs + + Returns + ------- + AtomArray | AtomArrayStack + Copy of the input with mixed-altloc modified residues renamed to their closest canonical. + """ + out = atom_array.copy() + res_names = out.res_name.tolist() + ins_codes = getattr(out, "ins_code", np.full(out.array_length(), "")).tolist() + positions = list(zip(out.chain_id.tolist(), out.res_id.tolist(), ins_codes)) + + res_names_by_position: dict[tuple, set[str]] = defaultdict(set) + for position, res_name in zip(positions, res_names): + res_names_by_position[position].add(res_name) + + parent_cache: dict[str, str | None] = {} + for i, (position, res_name) in enumerate(zip(positions, res_names)): + if len(res_names_by_position[position]) == 1: + continue # single consistent res_name means it doesn't have comp het + parent = parent_cache.setdefault(res_name, _closest_canonical_amino_acid(res_name)) + if parent is not None and parent != res_name: + out.res_name[i] = parent + out.hetero[i] = False + return out + + def get_reference_atomarraystack( protein_config: ProteinConfig, altloc_occupancies: dict[str, float] ) -> tuple[Path | str | None, AtomArrayStack | None]: @@ -443,7 +515,12 @@ def get_reference_atomarraystack( ref_path = protein_config.get_reference_structure_path(altloc_occupancies) if ref_path is None: return None, None - ref_struct = load_structure_with_altlocs(ref_path) + + # A modified residue (e.g. CSO) recorded as an altloc at the same position as its canonical + # form (e.g. CYS) makes map_altlocs_to_stack fail: the conformers disagree on res_name/hetero + # so biotite.stack() rejects them. Canonicalize these residues to get map_altlocs_to_stack to + # work + ref_struct = canonicalize_mixed_altloc_residues(load_structure_with_altlocs(ref_path)) if ref_struct.coord is None: raise ValueError(f"Unable to load coordinates from {ref_path} Please check file") if isinstance(ref_struct, AtomArray): diff --git a/tests/eval/test_structure_utils.py b/tests/eval/test_structure_utils.py index 4f0de785..0cae0ce4 100644 --- a/tests/eval/test_structure_utils.py +++ b/tests/eval/test_structure_utils.py @@ -8,7 +8,9 @@ from biotite.structure import AtomArray, AtomArrayStack from sampleworks.eval.eval_dataclasses import ProteinConfig from sampleworks.eval.structure_utils import ( + _closest_canonical_amino_acid, apply_selection, + canonicalize_mixed_altloc_residues, extract_selection_coordinates, get_asym_unit_from_structure, get_reference_atomarraystack, @@ -343,3 +345,40 @@ def test_with_real_structure(self, resources_dir): assert coords.ndim == 2 assert coords.shape[1] == 3 assert np.isfinite(coords).all() + + +class TestCanonicalizeMixedAltlocResidues: + """Tests for canonicalize_mixed_altloc_residues / _closest_canonical_amino_acid.""" + + @staticmethod + def _mixed_array() -> AtomArray: + # (A,10): canonical CYS + modified CSO compositional heterogeneity + # (A,11): MSE only, single res_name, must be left untouched + aa = AtomArray(3) + aa.coord = np.zeros((3, 3), dtype=np.float32) + aa.set_annotation("chain_id", np.array(["A", "A", "A"])) + aa.set_annotation("res_id", np.array([10, 10, 11])) + aa.set_annotation("res_name", np.array(["CYS", "CSO", "MSE"])) + aa.set_annotation("hetero", np.array([False, True, True])) + aa.set_annotation("atom_name", np.array(["CA", "CA", "CA"])) + return aa + + @pytest.mark.parametrize( + "res_name,expected", + [("CSO", "CYS"), ("MSE", "MET"), ("ALA", "ALA"), ("HOH", None)], + ) + def test_closest_canonical_amino_acid(self, res_name, expected): + assert _closest_canonical_amino_acid(res_name) == expected + + def test_renames_modified_at_mixed_position(self): + result = canonicalize_mixed_altloc_residues(self._mixed_array()) + res_name = cast(np.ndarray, result.res_name) + hetero = cast(np.ndarray, result.hetero) + assert list(res_name[:2]) == ["CYS", "CYS"] # CSO -> CYS + assert not hetero[0] and not hetero[1] # hetero cleared + assert res_name[2] == "MSE" and hetero[2] # MSE untouched + + def test_does_not_mutate_input(self): + arr = self._mixed_array() + canonicalize_mixed_altloc_residues(arr) + assert cast(np.ndarray, arr.res_name)[1] == "CSO"