-
Notifications
You must be signed in to change notification settings - Fork 7
feat(eval): canonicalize mixed-altloc modified residues #263
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import re | ||
| import traceback | ||
| from collections import defaultdict | ||
| from collections.abc import Sequence | ||
| from dataclasses import dataclass, replace | ||
| from pathlib import Path | ||
|
|
@@ -8,6 +9,8 @@ | |
| import numpy as np | ||
| import torch | ||
| from atomworks.io.transforms.atom_array import ensure_atom_array_stack | ||
| from atomworks.io.utils.ccd import ChainType, UNKNOWN_AA | ||
| from atomworks.io.utils.sequence import get_1_from_3_letter_code, get_3_from_1_letter_code | ||
| from biotite.structure import AtomArray, AtomArrayStack, from_template | ||
| from loguru import logger | ||
| from sampleworks.core.rewards.protocol import RewardInputs | ||
|
|
@@ -427,6 +430,75 @@ def get_asym_unit_from_structure( | |
| return atom_array | ||
|
|
||
|
|
||
| def _closest_canonical_amino_acid(res_name: str) -> str | None: | ||
| """Map a (possibly modified) residue name to its canonical parent amino acid. | ||
|
|
||
| Uses atomworks' mapping, e.g. ``CSO -> CYS``, ``MSE -> MET``. Standard amino | ||
| acids map to themselves and "residues" with no amino-acid (ligands, waters) return ``None``. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| res_name : str | ||
| Three letter amino acid name. | ||
|
|
||
| Returns | ||
| ------- | ||
| str | None | ||
| Canonical three letter amino acid name, or ``None`` if HETATM. | ||
| """ | ||
| one_letter = get_1_from_3_letter_code( | ||
| res_name, ChainType.POLYPEPTIDE_L, use_closest_canonical=True | ||
| ) | ||
| parent = get_3_from_1_letter_code(one_letter, ChainType.POLYPEPTIDE_L) | ||
| return None if parent == UNKNOWN_AA else parent | ||
|
|
||
|
|
||
| def canonicalize_mixed_altloc_residues( | ||
| atom_array: AtomArray | AtomArrayStack, | ||
| ) -> AtomArray | AtomArrayStack: | ||
| """Rename modified amino acids that share a residue position with a canonical residue. | ||
|
|
||
| At residue positions whose altlocs disagree on ``res_name``, such as a | ||
| canonical amino acid with compositional heterogeneity as a modified form (e.g. CYS with an | ||
| alternate CSO), we rename the modified records to their canonical | ||
| amino acid (via :func:`_closest_canonical_amino_acid`) and clear the ``hetero`` flag in the | ||
| biotite AtomArray annotation. The conformers then share ``res_name``/``hetero``, so | ||
| :func:`map_altlocs_to_stack` can stack them. Running ``filter_to_common_atoms`` afterwards | ||
| helps drop modified atoms such as the CSO hydroxyl oxygen. | ||
|
|
||
| Positions with a single, consistent ``res_name`` are left untouched, so cases like a MSE with | ||
| no compositional heterogeneity are preserved. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| atom_array : AtomArray | AtomArrayStack | ||
| Reference structure loaded with altlocs | ||
|
|
||
| Returns | ||
| ------- | ||
| AtomArray | AtomArrayStack | ||
| Copy of the input with mixed-altloc modified residues renamed to their closest canonical. | ||
| """ | ||
| out = atom_array.copy() | ||
| res_names = out.res_name.tolist() | ||
| ins_codes = getattr(out, "ins_code", np.full(out.array_length(), "")).tolist() | ||
| positions = list(zip(out.chain_id.tolist(), out.res_id.tolist(), ins_codes)) | ||
|
|
||
| res_names_by_position: dict[tuple, set[str]] = defaultdict(set) | ||
| for position, res_name in zip(positions, res_names): | ||
| res_names_by_position[position].add(res_name) | ||
|
|
||
| parent_cache: dict[str, str | None] = {} | ||
| for i, (position, res_name) in enumerate(zip(positions, res_names)): | ||
| if len(res_names_by_position[position]) == 1: | ||
| continue # single consistent res_name means it doesn't have comp het | ||
| parent = parent_cache.setdefault(res_name, _closest_canonical_amino_acid(res_name)) | ||
| if parent is not None and parent != res_name: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you trying to canonicalize all residues, or just make the sequences the same? What if a structure has two different non-canonicals at the same position, and no canonicals? |
||
| out.res_name[i] = parent | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens if the two residues don't have the same atoms? I see a couple issues here. One is that AtomWorks' parse will insert extra atoms when there's compositional heterogeneity. So, for example in 6NI6, you would end up with two CA, C, O, etc... as well as extra atoms that aren't compatible with the residue type anymore. I think probably we need to handle that and canonicalize the atoms, not just the residue names. |
||
| out.hetero[i] = False | ||
|
k-chrispens marked this conversation as resolved.
|
||
| return out | ||
|
|
||
|
|
||
| def get_reference_atomarraystack( | ||
| protein_config: ProteinConfig, altloc_occupancies: dict[str, float] | ||
| ) -> tuple[Path | str | None, AtomArrayStack | None]: | ||
|
|
@@ -443,7 +515,12 @@ def get_reference_atomarraystack( | |
| ref_path = protein_config.get_reference_structure_path(altloc_occupancies) | ||
| if ref_path is None: | ||
| return None, None | ||
| ref_struct = load_structure_with_altlocs(ref_path) | ||
|
|
||
| # A modified residue (e.g. CSO) recorded as an altloc at the same position as its canonical | ||
| # form (e.g. CYS) makes map_altlocs_to_stack fail: the conformers disagree on res_name/hetero | ||
| # so biotite.stack() rejects them. Canonicalize these residues to get map_altlocs_to_stack to | ||
| # work | ||
| ref_struct = canonicalize_mixed_altloc_residues(load_structure_with_altlocs(ref_path)) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think looking at your code, you should try to replace my method with yours although I'm pretty sure you need to get rid of extra atoms still.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you don't have time, make an issue and tag with engineering. |
||
| if ref_struct.coord is None: | ||
| raise ValueError(f"Unable to load coordinates from {ref_path} Please check file") | ||
| if isinstance(ref_struct, AtomArray): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,9 @@ | |
| from biotite.structure import AtomArray, AtomArrayStack | ||
| from sampleworks.eval.eval_dataclasses import ProteinConfig | ||
| from sampleworks.eval.structure_utils import ( | ||
| _closest_canonical_amino_acid, | ||
| apply_selection, | ||
| canonicalize_mixed_altloc_residues, | ||
| extract_selection_coordinates, | ||
| get_asym_unit_from_structure, | ||
| get_reference_atomarraystack, | ||
|
|
@@ -343,3 +345,40 @@ def test_with_real_structure(self, resources_dir): | |
| assert coords.ndim == 2 | ||
| assert coords.shape[1] == 3 | ||
| assert np.isfinite(coords).all() | ||
|
|
||
|
|
||
| class TestCanonicalizeMixedAltlocResidues: | ||
| """Tests for canonicalize_mixed_altloc_residues / _closest_canonical_amino_acid.""" | ||
|
|
||
| @staticmethod | ||
| def _mixed_array() -> AtomArray: | ||
| # (A,10): canonical CYS + modified CSO compositional heterogeneity | ||
|
Comment on lines
+353
to
+355
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add NumPy-style docstrings to the new test methods. The new methods in this block are missing docstrings; this repo requires NumPy-style docstrings for every function and class. As per coding guidelines, "Always include NumPy-style docstrings for every function and class." Also applies to: 370-384 🤖 Prompt for AI AgentsSource: Coding guidelines |
||
| # (A,11): MSE only, single res_name, must be left untouched | ||
| aa = AtomArray(3) | ||
| aa.coord = np.zeros((3, 3), dtype=np.float32) | ||
| aa.set_annotation("chain_id", np.array(["A", "A", "A"])) | ||
| aa.set_annotation("res_id", np.array([10, 10, 11])) | ||
| aa.set_annotation("res_name", np.array(["CYS", "CSO", "MSE"])) | ||
| aa.set_annotation("hetero", np.array([False, True, True])) | ||
| aa.set_annotation("atom_name", np.array(["CA", "CA", "CA"])) | ||
| return aa | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "res_name,expected", | ||
| [("CSO", "CYS"), ("MSE", "MET"), ("ALA", "ALA"), ("HOH", None)], | ||
| ) | ||
| def test_closest_canonical_amino_acid(self, res_name, expected): | ||
| assert _closest_canonical_amino_acid(res_name) == expected | ||
|
|
||
| def test_renames_modified_at_mixed_position(self): | ||
| result = canonicalize_mixed_altloc_residues(self._mixed_array()) | ||
| res_name = cast(np.ndarray, result.res_name) | ||
| hetero = cast(np.ndarray, result.hetero) | ||
| assert list(res_name[:2]) == ["CYS", "CYS"] # CSO -> CYS | ||
| assert not hetero[0] and not hetero[1] # hetero cleared | ||
| assert res_name[2] == "MSE" and hetero[2] # MSE untouched | ||
|
|
||
| def test_does_not_mutate_input(self): | ||
| arr = self._mixed_array() | ||
| canonicalize_mixed_altloc_residues(arr) | ||
| assert cast(np.ndarray, arr.res_name)[1] == "CSO" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this solve the same problem as
resolve_mixed_hetatm_atom_altlocs? If so, IIRC AtomWorks inserts an extra residue, and it looks to me like this might not catch that.Also, if this does solve that same problem, I'd rather we have one solution, not two which could diverge. I'm not saying my solution was better (it is a bit of a hack, tbh, to have to write a temporary CIF file). But before we put this in, we should unify the two (again, if they are the same)