Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion src/sampleworks/eval/structure_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import traceback
from collections import defaultdict
from collections.abc import Sequence
from dataclasses import dataclass, replace
from pathlib import Path
Expand All @@ -8,6 +9,8 @@
import numpy as np
import torch
from atomworks.io.transforms.atom_array import ensure_atom_array_stack
from atomworks.io.utils.ccd import ChainType, UNKNOWN_AA
from atomworks.io.utils.sequence import get_1_from_3_letter_code, get_3_from_1_letter_code
from biotite.structure import AtomArray, AtomArrayStack, from_template
from loguru import logger
from sampleworks.core.rewards.protocol import RewardInputs
Expand Down Expand Up @@ -427,6 +430,75 @@ def get_asym_unit_from_structure(
return atom_array


def _closest_canonical_amino_acid(res_name: str) -> str | None:
"""Map a (possibly modified) residue name to its canonical parent amino acid.

Uses atomworks' mapping, e.g. ``CSO -> CYS``, ``MSE -> MET``. Standard amino
acids map to themselves and "residues" with no amino-acid (ligands, waters) return ``None``.

Parameters
----------
res_name : str
Three letter amino acid name.

Returns
-------
str | None
Canonical three letter amino acid name, or ``None`` if HETATM.
"""
one_letter = get_1_from_3_letter_code(
res_name, ChainType.POLYPEPTIDE_L, use_closest_canonical=True
)
parent = get_3_from_1_letter_code(one_letter, ChainType.POLYPEPTIDE_L)
return None if parent == UNKNOWN_AA else parent


def canonicalize_mixed_altloc_residues(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this solve the same problem as resolve_mixed_hetatm_atom_altlocs? If so, IIRC AtomWorks inserts an extra residue, and it looks to me like this might not catch that.

Also, if this does solve that same problem, I'd rather we have one solution, not two which could diverge. I'm not saying my solution was better (it is a bit of a hack, tbh, to have to write a temporary CIF file). But before we put this in, we should unify the two (again, if they are the same)

atom_array: AtomArray | AtomArrayStack,
) -> AtomArray | AtomArrayStack:
"""Rename modified amino acids that share a residue position with a canonical residue.

At residue positions whose altlocs disagree on ``res_name``, such as a
canonical amino acid with compositional heterogeneity as a modified form (e.g. CYS with an
alternate CSO), we rename the modified records to their canonical
amino acid (via :func:`_closest_canonical_amino_acid`) and clear the ``hetero`` flag in the
biotite AtomArray annotation. The conformers then share ``res_name``/``hetero``, so
:func:`map_altlocs_to_stack` can stack them. Running ``filter_to_common_atoms`` afterwards
helps drop modified atoms such as the CSO hydroxyl oxygen.

Positions with a single, consistent ``res_name`` are left untouched, so cases like a MSE with
no compositional heterogeneity are preserved.

Parameters
----------
atom_array : AtomArray | AtomArrayStack
Reference structure loaded with altlocs

Returns
-------
AtomArray | AtomArrayStack
Copy of the input with mixed-altloc modified residues renamed to their closest canonical.
"""
out = atom_array.copy()
res_names = out.res_name.tolist()
ins_codes = getattr(out, "ins_code", np.full(out.array_length(), "")).tolist()
positions = list(zip(out.chain_id.tolist(), out.res_id.tolist(), ins_codes))

res_names_by_position: dict[tuple, set[str]] = defaultdict(set)
for position, res_name in zip(positions, res_names):
res_names_by_position[position].add(res_name)

parent_cache: dict[str, str | None] = {}
for i, (position, res_name) in enumerate(zip(positions, res_names)):
if len(res_names_by_position[position]) == 1:
continue # single consistent res_name means it doesn't have comp het
parent = parent_cache.setdefault(res_name, _closest_canonical_amino_acid(res_name))
if parent is not None and parent != res_name:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you trying to canonicalize all residues, or just make the sequences the same? What if a structure has two different non-canonicals at the same position, and no canonicals?

out.res_name[i] = parent

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if the two residues don't have the same atoms? I see a couple issues here. One is that AtomWorks' parse will insert extra atoms when there's compositional heterogeneity. So, for example in 6NI6, you would end up with two CA, C, O, etc... as well as extra atoms that aren't compatible with the residue type anymore. I think probably we need to handle that and canonicalize the atoms, not just the residue names.

out.hetero[i] = False
Comment thread
k-chrispens marked this conversation as resolved.
return out


def get_reference_atomarraystack(
protein_config: ProteinConfig, altloc_occupancies: dict[str, float]
) -> tuple[Path | str | None, AtomArrayStack | None]:
Expand All @@ -443,7 +515,12 @@ def get_reference_atomarraystack(
ref_path = protein_config.get_reference_structure_path(altloc_occupancies)
if ref_path is None:
return None, None
ref_struct = load_structure_with_altlocs(ref_path)

# A modified residue (e.g. CSO) recorded as an altloc at the same position as its canonical
# form (e.g. CYS) makes map_altlocs_to_stack fail: the conformers disagree on res_name/hetero
# so biotite.stack() rejects them. Canonicalize these residues to get map_altlocs_to_stack to
# work
ref_struct = canonicalize_mixed_altloc_residues(load_structure_with_altlocs(ref_path))

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think looking at your code, you should try to replace my method with yours although I'm pretty sure you need to get rid of extra atoms still.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't have time, make an issue and tag with engineering.

if ref_struct.coord is None:
raise ValueError(f"Unable to load coordinates from {ref_path} Please check file")
if isinstance(ref_struct, AtomArray):
Expand Down
39 changes: 39 additions & 0 deletions tests/eval/test_structure_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from biotite.structure import AtomArray, AtomArrayStack
from sampleworks.eval.eval_dataclasses import ProteinConfig
from sampleworks.eval.structure_utils import (
_closest_canonical_amino_acid,
apply_selection,
canonicalize_mixed_altloc_residues,
extract_selection_coordinates,
get_asym_unit_from_structure,
get_reference_atomarraystack,
Expand Down Expand Up @@ -343,3 +345,40 @@ def test_with_real_structure(self, resources_dir):
assert coords.ndim == 2
assert coords.shape[1] == 3
assert np.isfinite(coords).all()


class TestCanonicalizeMixedAltlocResidues:
"""Tests for canonicalize_mixed_altloc_residues / _closest_canonical_amino_acid."""

@staticmethod
def _mixed_array() -> AtomArray:
# (A,10): canonical CYS + modified CSO compositional heterogeneity
Comment on lines +353 to +355

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Add NumPy-style docstrings to the new test methods.

The new methods in this block are missing docstrings; this repo requires NumPy-style docstrings for every function and class. As per coding guidelines, "Always include NumPy-style docstrings for every function and class."

Also applies to: 370-384

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/eval/test_structure_utils.py` around lines 353 - 355, Add NumPy-style
docstrings to the test methods in the specified ranges. For the _mixed_array()
method and all other new test methods in the affected ranges, add docstrings
that follow NumPy style, which should include a brief summary of what the method
does, a Returns section describing what is returned, and any other relevant
sections as needed. Each docstring should be placed immediately after the method
signature and before any implementation code or comments.

Source: Coding guidelines

# (A,11): MSE only, single res_name, must be left untouched
aa = AtomArray(3)
aa.coord = np.zeros((3, 3), dtype=np.float32)
aa.set_annotation("chain_id", np.array(["A", "A", "A"]))
aa.set_annotation("res_id", np.array([10, 10, 11]))
aa.set_annotation("res_name", np.array(["CYS", "CSO", "MSE"]))
aa.set_annotation("hetero", np.array([False, True, True]))
aa.set_annotation("atom_name", np.array(["CA", "CA", "CA"]))
return aa

@pytest.mark.parametrize(
"res_name,expected",
[("CSO", "CYS"), ("MSE", "MET"), ("ALA", "ALA"), ("HOH", None)],
)
def test_closest_canonical_amino_acid(self, res_name, expected):
assert _closest_canonical_amino_acid(res_name) == expected

def test_renames_modified_at_mixed_position(self):
result = canonicalize_mixed_altloc_residues(self._mixed_array())
res_name = cast(np.ndarray, result.res_name)
hetero = cast(np.ndarray, result.hetero)
assert list(res_name[:2]) == ["CYS", "CYS"] # CSO -> CYS
assert not hetero[0] and not hetero[1] # hetero cleared
assert res_name[2] == "MSE" and hetero[2] # MSE untouched

def test_does_not_mutate_input(self):
arr = self._mixed_array()
canonicalize_mixed_altloc_residues(arr)
assert cast(np.ndarray, arr.res_name)[1] == "CSO"
Loading