Skip to content

Commit 2cbee57

Browse files
committed
Add option to skip counting on the reverse-complement strand
1 parent 491b69c commit 2cbee57

2 files changed

Lines changed: 35 additions & 18 deletions

File tree

newmap/main.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,12 @@ def parse_subcommands():
141141
"Cannot be used with --include-sequences. "
142142
f"(default: all sequences in {FASTA_FILE_METAVAR})")
143143

144+
unique_length_output_parameter_group.add_argument(
145+
"--norc",
146+
action="store_true",
147+
help="If specified, newmap will not search on the reverse-complement "
148+
"reference strand.")
149+
144150
unique_length_output_parameter_group.add_argument(
145151
"--verbose", "-v",
146152
action="store_true",

newmap/search.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def write_unique_counts(fasta_filename: Path,
2525
initial_search_length: int,
2626
include_sequence_ids: list[bytes],
2727
exclude_sequence_ids: list[bytes],
28+
skip_reverse_complement: bool,
2829
num_threads: int,
2930
use_binary_search=False,
3031
output_directory: Path = Path("."),
@@ -136,6 +137,7 @@ def write_unique_counts(fasta_filename: Path,
136137
min_kmer_length,
137138
max_kmer_length,
138139
initial_search_length,
140+
skip_reverse_complement,
139141
num_threads,
140142
data_type, # type: ignore
141143
verbose)
@@ -145,6 +147,7 @@ def write_unique_counts(fasta_filename: Path,
145147
sequence_segment,
146148
kmer_lengths,
147149
num_kmers,
150+
skip_reverse_complement,
148151
num_threads,
149152
data_type, # type: ignore
150153
verbose)
@@ -207,6 +210,7 @@ def binary_search(index_filename: Path,
207210
min_kmer_length: int,
208211
max_kmer_length: int,
209212
initial_search_length: int,
213+
skip_reverse_complement: bool,
210214
num_threads: int,
211215
data_type: Union[np.uint8, np.uint16, np.uint32],
212216
verbose: bool) -> tuple[npt.NDArray[np.uint], int]:
@@ -292,6 +296,7 @@ def binary_search(index_filename: Path,
292296
kmer_indices.tolist(),
293297
current_length_query[
294298
kmer_indices].tolist(),
299+
skip_reverse_complement,
295300
num_threads)
296301

297302
# Assert that the number of indices to count and the number of counts
@@ -368,6 +373,7 @@ def linear_search(index_filename: Path,
368373
sequence_segment: SequenceSegment,
369374
kmer_lengths: list[int],
370375
num_kmers: int,
376+
skip_reverse_complement,
371377
num_threads: int,
372378
data_type: Union[np.uint8, np.uint16, np.uint32],
373379
verbose: bool) -> tuple[npt.NDArray[np.uint], int]:
@@ -429,6 +435,7 @@ def linear_search(index_filename: Path,
429435
sequence_segment.data,
430436
kmer_indices.tolist(),
431437
max_kmer_query_lengths,
438+
skip_reverse_complement,
432439
num_threads)
433440

434441
# Assert that the number of indices to count and the number of counts
@@ -460,6 +467,7 @@ def get_kmer_counts(index_filename: Path,
460467
sequence_data: bytes,
461468
index_list: list[int],
462469
kmer_lengths: list[int],
470+
skip_reverse_complement: bool,
463471
num_threads: int) -> npt.NDArray[np.uint32]:
464472

465473
# Count the occurences of kmers on the forward strand
@@ -471,24 +479,25 @@ def get_kmer_counts(index_filename: Path,
471479
num_threads),
472480
dtype=np.uint32)
473481

474-
# Count the occurrences of kmers on the reverse complement strand
475-
# TODO: Add no reverse complement option to skip this to
476-
# support bisulfite treated kmer counting
477-
# TODO: Add option for a complement table
478-
reverse_complement_sequence = \
479-
sequence_data.translate(COMPLEMENT_TRANSLATE_TABLE)[::-1]
480-
481-
sequence_data_length = len(sequence_data)
482-
reverse_index_list = [sequence_data_length - i - kmer_length
483-
for i, kmer_length in zip(index_list, kmer_lengths)]
484-
485-
count_list += np.array(count_kmers_from_sequence(
486-
str(index_filename),
487-
reverse_complement_sequence,
488-
reverse_index_list,
489-
kmer_lengths,
490-
num_threads),
491-
dtype=np.uint32)
482+
# If we are not skipping the reverse complement strand (default)
483+
if not skip_reverse_complement:
484+
# Count the occurrences of kmers on the reverse complement strand
485+
# TODO: Add option for a complement table
486+
reverse_complement_sequence = \
487+
sequence_data.translate(COMPLEMENT_TRANSLATE_TABLE)[::-1]
488+
489+
sequence_data_length = len(sequence_data)
490+
reverse_index_list = [sequence_data_length - i - kmer_length
491+
for i, kmer_length in zip(index_list,
492+
kmer_lengths)]
493+
494+
count_list += np.array(count_kmers_from_sequence(
495+
str(index_filename),
496+
reverse_complement_sequence,
497+
reverse_index_list,
498+
kmer_lengths,
499+
num_threads),
500+
dtype=np.uint32)
492501

493502
# If any element in the count list is 0
494503
if np.any(count_list == 0):
@@ -698,6 +707,7 @@ def main(args):
698707
initial_search_length = args.initial_search_length
699708
include_sequences_arg = args.include_sequences
700709
exclude_sequences_arg = args.exclude_sequences
710+
skip_reverse_complement = args.norc
701711
num_threads = args.num_threads
702712
verbose = args.verbose
703713

@@ -773,6 +783,7 @@ def main(args):
773783
initial_search_length,
774784
include_sequence_ids,
775785
exclude_sequence_ids,
786+
skip_reverse_complement,
776787
num_threads,
777788
use_binary_search,
778789
output_directory,

0 commit comments

Comments
 (0)