Skip to content

Commit ca3aabd

Browse files
committed
implement better selection syntax
1 parent b3d4fd0 commit ca3aabd

1 file changed

Lines changed: 212 additions & 26 deletions

File tree

src/select.jl

Lines changed: 212 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,9 @@ const keywords = [
545545
Keyword(Float64, "x" , x , operators),
546546
Keyword(Float64, "y" , y , operators),
547547
Keyword(Float64, "z" , z , operators),
548+
]
549+
550+
const macro_keywords = [
548551
MacroKeyword("standard" , standardselector),
549552
MacroKeyword("hetero" , heteroselector),
550553
MacroKeyword("backbone" , backboneselector),
@@ -569,37 +572,220 @@ const keywords = [
569572
MacroKeyword("all" , allselector),
570573
]
571574

572-
# See https://discourse.julialang.org/t/parsing-selection-syntax/43632/9
573-
parse_query(selection::AbstractString) = parse_query_vector(split(selection))
574-
575-
function parse_query_vector(s::AbstractVector{<:AbstractString})
576-
# or, and, not
577-
if (i = findfirst(==("or"), s)) !== nothing
578-
deleteat!(s, i)
579-
return (|, parse_query_vector.((s[1:i-1], s[i:end]))...)
580-
elseif (i = findfirst(==("and"), s)) !== nothing
581-
deleteat!(s, i)
582-
return (&, parse_query_vector.((s[1:i-1], s[i:end]))...)
583-
elseif (i = findfirst(==("not"), s)) !== nothing
584-
deleteat!(s, i)
585-
return (!, parse_query_vector(s[i:end]))
586-
# Keywords
587-
else
588-
for key in keywords
589-
if (i = findfirst(==(key.name), s)) !== nothing
590-
deleteat!(s, i)
591-
return key(s)
592-
end
593-
end
594-
throw(ArgumentError(("Unable to parse selection string: $s")))
595-
end
575+
#=
    parse_query(selection::AbstractString)

Tokenize `selection` and pass the tokens to `parse_query_vector`, returning
whatever `parse_query_vector` returns.

Every parenthesis is padded with spaces before the string is split on
whitespace, so that "(name CA)" and "( name CA )" produce the same tokens and
each parenthesis becomes a token of its own.

Any `AbstractString` is accepted (e.g. a `SubString`), not only `String`.

=#
function parse_query(selection::AbstractString)
    s = replace(selection, "(" => " ( ", ")" => " ) ")
    return parse_query_vector(split(s))
end
597585

598586
# Evaluate the parsed query `q` on `a`.
#
# A query that is not a `Tuple` is called directly as `q(a)`. A `Tuple` query is read
# as `(f, args...)`: every element after the first is evaluated recursively on `a`,
# and `f` is then called with those results.
function apply_query(q, a)
    q isa Tuple || return q(a)
    f, args = Iterators.peel(q)
    # `Ref(a)` keeps `a` from being iterated by the broadcast.
    results = apply_query.(args, Ref(a))
    return f(results...)
end
594+
595+
# Abort parsing by throwing an `ArgumentError` whose message is `str`.
function parse_error(str)
    throw(ArgumentError(str))
end
596+
597+
#
598+
# Note: the following code was generated by Gemini 2.5-Pro, with modifications,
599+
# and then tested.
600+
#
601+
602+
# New helper functions
603+
# Return `true` when `token` is one of the logical operator words of the selection
# syntax ("and", "or" or "not"), and `false` otherwise.
function is_operator(token::AbstractString)
    return token in ("and", "or", "not")
end
606+
607+
# Return `true` when `tokens` starts with "(", ends with ")", and that first "("
# is closed by that last ")" -- i.e. the whole token list is wrapped by a single
# outermost pair of parentheses. Return `false` otherwise, including for lists
# with fewer than two tokens.
#
# The first/last tokens are checked here, so the result is correct even when the
# caller has not verified them beforehand.
function is_fully_enclosed(tokens::AbstractVector{<:AbstractString})
    length(tokens) >= 2 || return false
    (tokens[begin] == "(" && tokens[end] == ")") || return false
    level = 0
    # Scan every token except the last one: if the nesting level returns to zero
    # before the final ")", the opening parenthesis was closed too early, as in
    # "( a ) or ( b )".
    for i in firstindex(tokens):(lastindex(tokens) - 1)
        if tokens[i] == "("
            level += 1
        elseif tokens[i] == ")"
            level -= 1
            level == 0 && return false
        end
    end
    # Exactly one parenthesis must remain open for the final ")" to close it;
    # any other value means the inner parentheses are unbalanced.
    return level == 1
end
625+
626+
# Return the index of the first token equal to `op_str` that is not nested inside
# any parentheses, or `0` when there is none.
#
# Parentheses are tracked while scanning from left to right. A ")" without a
# matching "(" raises an error through `parse_error`; so does reaching the end of
# `tokens` with parentheses still open. Note that the scan stops as soon as the
# operator is found, so tokens after it are not inspected by this call.
function find_operator_at_level_zero(op_str::String, tokens::AbstractVector{<:AbstractString})
    depth = 0
    for (idx, tok) in pairs(tokens)
        if tok == "("
            depth += 1
        elseif tok == ")"
            depth -= 1
            depth < 0 && parse_error("Mismatched parentheses: too many closing parentheses.")
        elseif depth == 0 && tok == op_str
            return idx
        end
    end
    depth == 0 || parse_error("Mismatched parentheses: not enough closing parentheses.")
    return 0 # Not found outside parentheses
end
646+
647+
# Modified parse_query_vector
648+
# Parse a tokenized selection into a query tree.
#
# `s_vec_const` holds the tokens of the selection; it is never mutated (only slices,
# which are copies, are taken). The result is either the value returned by calling a
# keyword object, or a tuple `(op, operands...)` where `op` is `|`, `&` or `!` and
# each operand is itself a parsed query.
#
# Parsing order:
#   1. parentheses enclosing the whole expression are stripped;
#   2. the expression is split at the first "or" found outside parentheses;
#   3. otherwise at the first "and" found outside parentheses;
#   4. otherwise a leading "not" negates the remainder;
#   5. otherwise the tokens must form a keyword phrase ("name CA", "resnum < 13",
#      "resname ARG GLU ASP") or a single macro keyword ("protein").
#
# Every syntax error is reported through `parse_error`.
function parse_query_vector(s_vec_const::AbstractVector{<:AbstractString})
    if isempty(s_vec_const)
        parse_error("Empty query segment.")
    end
    s_vec = _strip_enclosing_parentheses(s_vec_const)

    if (i = find_operator_at_level_zero("or", s_vec)) > 0
        return _parse_binary_operator(|, "or", s_vec, i)
    elseif (i = find_operator_at_level_zero("and", s_vec)) > 0
        return _parse_binary_operator(&, "and", s_vec, i)
    elseif s_vec[begin] == "not"
        return _parse_negation(s_vec)
    else
        return _parse_keyword_phrase(s_vec)
    end
end

# Remove every pair of parentheses that encloses the whole token list, so that
# "( ( A and B ) )" is parsed as "A and B". Errors on empty parentheses.
function _strip_enclosing_parentheses(tokens)
    while length(tokens) > 1 &&
          tokens[begin] == "(" &&
          tokens[end] == ")" &&
          is_fully_enclosed(tokens)
        tokens = tokens[begin+1:end-1]
        if isempty(tokens)
            parse_error("Empty parentheses in query: '()'")
        end
    end
    return tokens
end

# Split `tokens` at index `i` (the position of the operator named `op_name`) and
# combine the two parsed sides with `op`. Errors if either side is empty.
function _parse_binary_operator(op, op_name, tokens, i)
    left_tokens = tokens[begin:i-1]
    right_tokens = tokens[i+1:end]
    if isempty(left_tokens) || isempty(right_tokens)
        parse_error("Syntax error near '$op_name'. Missing operand.")
    end
    return (op, parse_query_vector(left_tokens), parse_query_vector(right_tokens))
end

# Parse "not <expression>"; `tokens[begin]` is the "not" token.
function _parse_negation(tokens)
    if length(tokens) == 1
        parse_error("Syntax error near 'not'. Missing operand.")
    end
    # No check for "not and" / "not or" is needed: this is only reached when no
    # "and"/"or" exists outside parentheses, and the token right after a leading
    # "not" is always outside parentheses.
    return (!, parse_query_vector(tokens[begin+1:end]))
end

# Parse a phrase without top-level logical operators: a keyword followed by its
# arguments, or a macro keyword alone.
function _parse_keyword_phrase(tokens)
    token_keyword_name = tokens[begin]

    # Standard keywords (e.g. "name", "resnum", "index")
    for key_obj in keywords
        if token_keyword_name == key_obj.name
            return _parse_keyword_arguments(key_obj, tokens)
        end
    end

    # Macro keywords (e.g. "protein", "water")
    for key_obj in macro_keywords
        if token_keyword_name == key_obj.name
            if length(tokens) > 1
                parse_error(
                    "Macro keyword '$(key_obj.name)' does not take arguments. " *
                    "Unexpected tokens: $(join(tokens[begin+1:end], " "))"
                )
            end
            # The macro keyword object is called with an empty argument list.
            return key_obj(String[])
        end
    end

    parse_error("Unknown keyword or invalid syntax at: '$(join(tokens, " "))'")
end

# Parse the arguments of the standard keyword `key_obj`; `tokens[begin]` is the
# keyword name. Supported forms:
#   "keyword operator value"   -> key_obj(["operator", "value"])
#   "keyword value"            -> key_obj(["value"])
#   "keyword v1 v2 ... vn"     -> (|, key_obj(["v1"]), (|, key_obj(["v2"]), ...))
function _parse_keyword_arguments(key_obj, tokens)
    if length(tokens) == 1
        parse_error("Keyword '$(key_obj.name)' requires at least one argument.")
    end
    keyword_args = tokens[begin+1:end]

    # "and"/"or" outside parentheses never reach this point, but "not" and
    # parentheses can (e.g. "name CA not CB"). Without this check they would be
    # silently taken as values to compare against, selecting the wrong atoms.
    for arg_val in keyword_args
        if arg_val == "not" || arg_val == "(" || arg_val == ")"
            parse_error(
                "Syntax error for keyword '$(key_obj.name)'. " *
                "Unexpected token '$arg_val' in argument list: $(join(keyword_args, " "))."
            )
        end
    end

    # Explicit operator form: the first argument is one of the keyword's operator
    # strings (first element of each entry of `key_obj.operators`).
    first_arg = keyword_args[begin]
    if any(op_tuple -> first_arg == op_tuple[1], key_obj.operators)
        if length(keyword_args) != 2
            parse_error(
                "Malformed operator expression for keyword '$(key_obj.name)'. " *
                "Expected 'keyword $first_arg value'. Got: $(join(tokens, " "))"
            )
        end
        return key_obj(keyword_args)
    end

    # Implicit equality: an operator anywhere in the value list is an error,
    # e.g. "resnum 10 < 20".
    for arg_val in keyword_args, op_tuple in key_obj.operators
        if arg_val == op_tuple[1]
            parse_error(
                "Syntax error for keyword '$(key_obj.name)'. " *
                "Operator '$(op_tuple[1])' found in an unexpected position. " *
                "Arguments: $(join(keyword_args, " ")). " *
                "Operator expressions must be 'keyword $(op_tuple[1]) value'."
            )
        end
    end

    if length(keyword_args) == 1
        return key_obj(keyword_args)
    end

    # Several values: build a right-nested OR of one-value queries, starting
    # from the last value.
    current_expr_tree = key_obj([keyword_args[end]])
    for k_idx in (lastindex(keyword_args) - 1):-1:firstindex(keyword_args)
        current_expr_tree = (|, key_obj([keyword_args[k_idx]]), current_expr_tree)
    end
    return current_expr_tree
end

0 commit comments

Comments
 (0)