Skip to content

Commit bdc6897

Browse files
authored
Merge pull request #76 from lmiq/new_select
selection syntax to support parenthesis and "or" shortcut
2 parents b3d4fd0 + 8d20562 commit bdc6897

3 files changed

Lines changed: 261 additions & 35 deletions

File tree

docs/src/documentation.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,21 @@ julia> collectresidues(struc, sel"acidic")
281281
Residue 63:H with name GLU, 9 atoms
282282
```
283283

284+
Also, the selection syntax supports parentheses and repeating the possible property
285+
values as a shortcut for multiple `or` clauses:
286+
287+
```julia-repl
288+
julia> collectatoms(struc, sel"resname ARG GLU and (element C N)")
289+
1320-element Vector{AbstractAtom}:
290+
Atom N with serial 9, coordinates [19.33, 32.429, -28.593]
291+
Atom CA with serial 10, coordinates [20.769, 32.605, -28.801]
292+
Atom C with serial 11, coordinates [21.503, 32.581, -27.478]
293+
294+
Atom CZ with serial 11676, coordinates [5.525, 1.346, -44.697]
295+
Atom NH1 with serial 11677, coordinates [4.626, 1.502, -45.668]
296+
Atom NH2 with serial 11678, coordinates [5.206, 0.618, -43.643]
297+
```
298+
284299
The operators currently supported are:
285300

286301
| Operators | Acts on |

src/select.jl

Lines changed: 204 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -515,12 +515,9 @@ function parse_to_type(key::Keyword, val)
515515
elseif key.value_type == Char && length(val) == 1
516516
return val[1]
517517
end
518-
try
519-
val = parse(key.value_type, val)
520-
return val
521-
catch
522-
throw(ArgumentError("Could not parse $val for keyword $(key.name), expected $(key.value_type)"))
523-
end
518+
# Keep the raw token in `val` untouched: the error message below must report the
# text that failed to parse, not the `nothing` returned by `tryparse`.
parsed = tryparse(key.value_type, val)
isnothing(parsed) && throw(ArgumentError("Could not parse $val for keyword $(key.name), expected $(key.value_type)"))
return parsed
524521
end
525522

526523
const keywords = [
@@ -545,6 +542,9 @@ const keywords = [
545542
Keyword(Float64, "x" , x , operators),
546543
Keyword(Float64, "y" , y , operators),
547544
Keyword(Float64, "z" , z , operators),
545+
]
546+
547+
const macro_keywords = [
548548
MacroKeyword("standard" , standardselector),
549549
MacroKeyword("hetero" , heteroselector),
550550
MacroKeyword("backbone" , backboneselector),
@@ -569,37 +569,209 @@ const keywords = [
569569
MacroKeyword("all" , allselector),
570570
]
571571

572-
# See https://discourse.julialang.org/t/parsing-selection-syntax/43632/9
573-
parse_query(selection::AbstractString) = parse_query_vector(split(selection))
574-
575-
function parse_query_vector(s::AbstractVector{<:AbstractString})
576-
# or, and, not
577-
if (i = findfirst(==("or"), s)) !== nothing
578-
deleteat!(s, i)
579-
return (|, parse_query_vector.((s[1:i-1], s[i:end]))...)
580-
elseif (i = findfirst(==("and"), s)) !== nothing
581-
deleteat!(s, i)
582-
return (&, parse_query_vector.((s[1:i-1], s[i:end]))...)
583-
elseif (i = findfirst(==("not"), s)) !== nothing
584-
deleteat!(s, i)
585-
return (!, parse_query_vector(s[i:end]))
586-
# Keywords
587-
else
588-
for key in keywords
589-
if (i = findfirst(==(key.name), s)) !== nothing
590-
deleteat!(s, i)
591-
return key(s)
592-
end
593-
end
594-
throw(ArgumentError(("Unable to parse selection string: $s")))
595-
end
572+
#=
    parse_query(selection::AbstractString)

Tokenize the selection string and call `parse_query_vector` on the tokens.

Parentheses are padded with spaces before splitting on whitespace, so that
`"(name CA)"` yields the tokens `"("`, `"name"`, `"CA"`, `")"`.

Accepts any `AbstractString` (e.g. a `SubString`), as the previous
implementation of this function did.

=#
function parse_query(selection::AbstractString)
    padded = replace(selection, "(" => " ( ", ")" => " ) ")
    return parse_query_vector(split(padded))
end
597582

598583
# Evaluate a parsed query `q` on `a`.
#
# A query is either a callable leaf, applied directly to `a`, or a tuple whose
# first element is a function and whose remaining elements are sub-queries. The
# sub-queries are evaluated recursively on `a` and their results are passed as
# arguments to the leading function.
function apply_query(q, a)
    # Leaf: anything that is not a tuple is called directly.
    q isa Tuple || return q(a)
    combine, subqueries = Iterators.peel(q)
    return combine((apply_query(sub, a) for sub in subqueries)...)
end
591+
592+
# Throw an `ArgumentError` whose message is `str`.
function parse_error(str)
    throw(ArgumentError(str))
end
593+
594+
#
595+
# Obs: the following code was generated by Gemini 2.5-Pro, with modifications,
596+
# and then tested.
597+
#
598+
599+
# New helper functions
600+
# Return `true` when `token` is one of the logical operator words of the
# selection syntax: "and", "or" or "not" (case-sensitive).
function is_operator(token::AbstractString)
    return token in ("and", "or", "not")
end
603+
604+
function is_fully_enclosed(tokens::AbstractVector{<:AbstractString})
605+
level = 0
606+
# Check if the first '(' matches the last ')' without level becoming zero in between
607+
# for any token except the last one.
608+
for i in firstindex(tokens):(lastindex(tokens)-1)
609+
if tokens[i] == "("
610+
level += 1
611+
elseif tokens[i] == ")"
612+
level -= 1
613+
if level == 0 # Closed too early, means not fully enclosed by the outermost pair
614+
return false
615+
end
616+
end
617+
end
618+
# After iterating up to tokens[end-1], level should be 1 if tokens[begin] was '('
619+
# and it correctly matches tokens[end]. If level is not 1, it means mismatched parentheses within.
620+
return level == 1
621+
end
622+
623+
# Return the index of the first token equal to `op_str` that is not inside any
# parentheses, scanning `tokens` from left to right. Return `0` if there is no
# such token.
#
# Parentheses are tracked while scanning: a ")" without a matching "(" raises a
# parse error immediately, and reaching the end of `tokens` with parentheses
# still open raises a parse error as well. The scan stops at the first match,
# so tokens after the returned index are not inspected.
function find_operator_at_level_zero(op_str::String, tokens::AbstractVector{<:AbstractString})
    depth = 0
    for (idx, tok) in pairs(tokens)
        if tok == "("
            depth += 1
        elseif tok == ")"
            depth -= 1
            depth < 0 && parse_error("Mismatched parentheses: too many closing parentheses.")
        elseif depth == 0 && tok == op_str
            return idx
        end
    end
    depth == 0 || parse_error("Mismatched parentheses: not enough closing parentheses.")
    return 0 # Not found at level zero
end
643+
644+
# Modified parse_query_vector
645+
# Recursively convert a tokenized selection (e.g. the result of `split` on the
# selection string, with parentheses as separate tokens) into a query tree.
#
# The tree is made of nested tuples `(op, operands...)`, where `op` is `|`, `&`
# or `!`, and of leaves, which are whatever the matching `Keyword` or
# `MacroKeyword` object returns when called with its argument tokens.
#
# Splitting order: a top-level "or" is split first, then a top-level "and",
# then a leading "not". Therefore "not" binds tighter than "and", which binds
# tighter than "or". Parentheses override this order.
#
# Any syntax problem is reported through `parse_error` (an `ArgumentError`).
# The input vector is never mutated: only slices (copies) are taken from it.
function parse_query_vector(s_vec_const::AbstractVector{<:AbstractString})
    isempty(s_vec_const) && parse_error("Empty query segment.")

    # "(A and B)" is parsed by parsing "A and B".
    s_vec = _strip_enclosing_parentheses(s_vec_const)

    if (i = find_operator_at_level_zero("or", s_vec)) > 0
        return _parse_binary_operator(|, "or", s_vec, i)
    elseif (i = find_operator_at_level_zero("and", s_vec)) > 0
        return _parse_binary_operator(&, "and", s_vec, i)
    elseif s_vec[begin] == "not"
        return _parse_negation(s_vec)
    end
    # Base case: no top-level logical operators. Must be a keyword phrase.
    return _parse_keyword_phrase(s_vec)
end

# Remove every pair of parentheses that encloses the whole token list.
function _strip_enclosing_parentheses(tokens)
    while length(tokens) > 1 && tokens[begin] == "(" && tokens[end] == ")" &&
          is_fully_enclosed(tokens)
        tokens = tokens[begin+1:end-1]
        isempty(tokens) && parse_error("Empty parentheses in query: '()'")
    end
    return tokens
end

# Build `(op, left, right)` from the tokens on each side of the operator at index `i`.
function _parse_binary_operator(op, op_name, tokens, i)
    left_tokens = tokens[begin:i-1]
    right_tokens = tokens[i+1:end]
    if isempty(left_tokens) || isempty(right_tokens)
        parse_error("Syntax error near '$op_name'. Missing operand.")
    end
    return (op, parse_query_vector(left_tokens), parse_query_vector(right_tokens))
end

# Build `(!, operand)` from a token list whose first token is "not".
function _parse_negation(tokens)
    length(tokens) == 1 && parse_error("Syntax error near 'not'. Missing operand.")
    operand_tokens = tokens[begin+1:end]
    # Defensive check: "not and" / "not or" are rejected, "not not" is allowed.
    if is_operator(operand_tokens[begin]) && operand_tokens[begin] != "not"
        parse_error("Operator '$(operand_tokens[begin])' cannot directly follow 'not'.")
    end
    return (!, parse_query_vector(operand_tokens))
end

# Parse an operator-free phrase: a keyword followed by its arguments
# (e.g. "name CA", "resnum < 13") or a bare macro keyword (e.g. "protein").
function _parse_keyword_phrase(tokens)
    keyword_name = tokens[begin]

    # Standard keywords (e.g., "name", "resnum", "index")
    for key_obj in keywords
        keyword_name == key_obj.name || continue
        if length(tokens) == 1 # Keyword name token only, no arguments
            parse_error("Keyword '$(key_obj.name)' requires at least one argument.")
        end
        return _parse_keyword_arguments(key_obj, tokens)
    end

    # Macro keywords (e.g., "protein", "water")
    for key_obj in macro_keywords
        keyword_name == key_obj.name || continue
        if length(tokens) > 1
            parse_error("Macro keyword '$(key_obj.name)' does not take arguments. Unexpected tokens: $(join(tokens[begin+1:end], " "))")
        end
        # The macro keyword object is called with an empty argument list.
        return key_obj(String[])
    end

    parse_error("Unknown keyword or invalid syntax at: '$(join(tokens, " "))'")
end

# Parse the arguments of the standard keyword `key_obj`. `tokens` is the whole
# phrase (keyword name included) and has at least two elements.
#
# Accepted forms:
#   "keyword operator value"   -> key_obj(["operator", "value"])
#   "keyword value"            -> key_obj(["value"])
#   "keyword v1 v2 ... vn"     -> (|, key_obj([v1]), (|, key_obj([v2]), ... key_obj([vn])))
function _parse_keyword_arguments(key_obj, tokens)
    keyword_args = tokens[begin+1:end]
    first_arg = first(keyword_args)

    # Operator form, e.g. "resnum < 13": exactly one value must follow the operator.
    for op_tuple in key_obj.operators
        operator_string = op_tuple[1]
        first_arg == operator_string || continue
        if length(keyword_args) != 2
            parse_error(
                "Malformed operator expression for keyword '$(key_obj.name)'. "*
                "Expected 'keyword $operator_string value'. Got: $(join(tokens, " "))"
            )
        end
        return key_obj(keyword_args)
    end

    # Value-list form: no operator may appear among the values
    # (e.g. "resnum 10 < 20" is an error).
    for arg_val in keyword_args, op_tuple in key_obj.operators
        if arg_val == op_tuple[1]
            parse_error(
                "Syntax error for keyword '$(key_obj.name)'. Operator '$(op_tuple[1])' found in an unexpected position. "*
                "Arguments: $(join(keyword_args, " ")). Operator expressions must be 'keyword $(op_tuple[1]) value'."
            )
        end
    end

    # Single value: implicit equality, e.g. "name CA".
    length(keyword_args) == 1 && return key_obj(keyword_args)

    # Several values: implicit `or`, e.g. "resname ARG GLU ASP". The tree is
    # built from the last value backwards, so it nests to the right.
    current_expr_tree = key_obj([keyword_args[end]])
    for k_idx in (lastindex(keyword_args)-1):-1:firstindex(keyword_args)
        current_expr_tree = (|, key_obj([keyword_args[k_idx]]), current_expr_tree)
    end
    return current_expr_tree
end

test/runtests.jl

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,7 @@ end
928928
@test length(collectatoms(struc, sel"disordered")) == 68
929929
@test length(collectatoms(struc, sel"sscode E")) == 2448
930930
@test length(collectatoms(struc, sel"helix")) == 4047
931+
931932
# Check interpolation support
932933
ss_type = "helix"
933934
@test length(collectatoms(struc, sel"$ss_type")) == 4047
@@ -942,14 +943,52 @@ end
942943
@test length(collectmodels(struc, sel"model 1")) == 1
943944
@test length(collectmodels(struc, sel"model 2")) == 0
944945

945-
@test_throws ArgumentError collectatoms(struc, BioStructures.Select("abc")) # Invalid selection syntax
946-
@test_throws ArgumentError collectatoms(struc, BioStructures.Select("index = A")) # Invalid value type
947-
@test_throws ArgumentError collectatoms(struc, BioStructures.Select("resnum C"))
946+
# Check complicated selections
947+
@test length(collectatoms(struc, sel"name CA and (resnum < 15 or resnum > 16)")) == 1404
948+
@test length(collectatoms(struc, sel"protein and (chain A and resnum < 130)")) == 1050
949+
@test length(collectatoms(struc, sel"chain A and resnum 132")) == 9 # with alternate locations
950+
@test length(collectatoms(struc, sel"(not protein) and (resname HOH or (resname SOD and index < 600))")) == 639
951+
@test length(collectatoms(struc, sel"not protein and not water or (chain A and resnum < 10)")) == 79
952+
@test length(collectatoms(struc, sel"not protein and not water or (chain A and resnum <= 10)")) == 87
953+
@test length(collectatoms(struc, sel"name CA and resname ALA ARG GLU")) == 224
954+
@test length(collectatoms(struc, sel"resname ALA ARG GLU and name N")) == 224
955+
@test length(collectatoms(struc, sel"(resname ALA ARG GLU) and (name N or name CA)")) == 448
956+
@test length(collectatoms(struc, sel"index 2 3 4 5")) == 4
957+
@test length(collectatoms(struc, sel"element C")) == 7508
958+
@test length(collectatoms(struc, sel"element C N")) == 9468
959+
@test length(collectatoms(struc, sel"not protein and element C N")) == 0
960+
@test length(collectatoms(struc, sel"not protein and element O H ")) == 639
948961

949962
# Test show method for @sel_str
950963
buff = IOBuffer()
951964
show(buff, MIME"text/plain"(), sel"name CA and resnum 1")
952965
@test String(take!(buff)) == """Select("name CA and resnum 1")"""
966+
967+
# Syntax errors
968+
@test_throws ArgumentError collectatoms(struc, BioStructures.Select("abc")) # Invalid selection syntax
969+
@test_throws ArgumentError collectatoms(struc, BioStructures.Select("index = A")) # Invalid value type
970+
@test_throws ArgumentError collectatoms(struc, BioStructures.Select("resnum C"))
971+
@test_throws ArgumentError collectatoms(struc, sel"name CA and (residue 1")
972+
@test_throws ArgumentError collectatoms(struc, sel"name CA and (residue 1))")
973+
@test_throws ArgumentError collectatoms(struc, sel"index <")
974+
@test_throws ArgumentError collectatoms(struc, sel"index < 1.0")
975+
@test_throws ArgumentError collectatoms(struc, sel"indes 1")
976+
@test_throws ArgumentError collectatoms(struc, sel"element")
977+
@test_throws ArgumentError collectatoms(struc, sel"index 1 element")
978+
@test_throws ArgumentError collectatoms(struc, sel"protein 1")
979+
@test_throws ArgumentError collectatoms(struc, sel"protein = 1")
980+
@test_throws ArgumentError collectatoms(struc, sel"residue 1 < 5")
981+
@test_throws ArgumentError collectatoms(struc, sel"residue A")
982+
@test_throws ArgumentError collectatoms(struc, sel"residue 1 and ()")
983+
@test_throws ArgumentError collectatoms(struc, sel"not (protein) and ()")
984+
@test_throws ArgumentError collectatoms(struc, sel"residue 1 or")
985+
@test_throws ArgumentError collectatoms(struc, sel"residue 1 and")
986+
@test_throws ArgumentError collectatoms(struc, sel"residue 1 not")
987+
@test_throws ArgumentError collectatoms(struc, sel"residue")
988+
@test_throws ArgumentError collectatoms(struc, sel"element")
989+
@test_throws ArgumentError collectatoms(struc, sel"not")
990+
991+
953992
end
954993

955994
@testset "PDB reading" begin

0 commit comments

Comments
 (0)