Skip to content

Commit a99a9b4

Browse files
committed
Change so search find a single occurence in all indices per sequence
1 parent 36c95fc commit a99a9b4

1 file changed

Lines changed: 30 additions & 25 deletions

File tree

newmap/search.py

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,9 @@ def binary_search(config: SearchConfig,
457457

458458
iteration_count = 1
459459

460+
# Number of sequences being searched on
461+
num_sequences = len(sequence_segments)
462+
460463
# While there are still kmers to search
461464
while not np.all(finished_search):
462465
config.log("Iteration {}".format(iteration_count))
@@ -482,9 +485,9 @@ def binary_search(config: SearchConfig,
482485
"Number of counted positions ({}) and number of counts ({}) " \
483486
"do not match".format(len(kmer_indices), len(count_list))
484487

485-
# Where we have counts of 1
488+
# Where we have counts of 1 per sequence searched over
486489
unique_lengths[kmer_indices] = np.where(
487-
(count_list == 1) &
490+
(count_list == num_sequences) &
488491
# And if there is no current unique length recorded
489492
((unique_lengths[kmer_indices] == 0) |
490493
# Or there is a smaller length found than the current min length
@@ -495,20 +498,20 @@ def binary_search(config: SearchConfig,
495498
# Otherwise keep the current unique length
496499
unique_lengths[kmer_indices])
497500

498-
# If we have a k-mer count of 1 and the current length queried is the
499-
# same as the lower bound (i.e. can't get smaller for a unique count)
500-
# This position has finished searching
501+
# If we have a k-mer count of 1 per sequence and the current length
502+
# queried is the same as the lower bound (i.e. can't get smaller for a
503+
# unique count) This position has finished searching
501504
finished_search[kmer_indices] = np.where(
502-
count_list == 1,
505+
count_list == num_sequences,
503506
current_length_query[kmer_indices] ==
504507
lower_length_bound[kmer_indices],
505508
finished_search[kmer_indices])
506509

507-
# If we have a k-mer count > 1 and the current length queried is the
508-
# same as the uppper bound (i.e. can't find a unique length or larger)
509-
# This position has finished searching
510+
# If we have a k-mer count > number of sequences and the current length
511+
# queried is the same as the uppper bound (i.e. can't find a unique
512+
# length or larger) This position has finished searching
510513
finished_search[kmer_indices] = np.where(
511-
count_list > 1,
514+
count_list > num_sequences,
512515
current_length_query[kmer_indices] ==
513516
upper_length_bound[kmer_indices],
514517
finished_search[kmer_indices])
@@ -519,15 +522,15 @@ def binary_search(config: SearchConfig,
519522
# we need to decrease our k-mer length (i.e. counts == 1)
520523
# Set the new upper (inclusive) bound to the current query length - 1
521524
upper_length_bound[kmer_indices] = np.where(
522-
count_list == 1,
525+
count_list == num_sequences,
523526
current_length_query[kmer_indices] - 1,
524527
upper_length_bound[kmer_indices])
525528

526529
# Raise the lower bounds of our search range on positions where
527530
# we need to increase our k-mer length (i.e. counts > 1)
528531
# Set the new lower (inclusive) bound to the current query length + 1
529532
lower_length_bound[kmer_indices] = np.where(
530-
count_list > 1,
533+
count_list > num_sequences,
531534
current_length_query[kmer_indices] + 1,
532535
lower_length_bound[kmer_indices])
533536

@@ -569,6 +572,8 @@ def linear_search(config: SearchConfig,
569572
# List of minimum lengths (where 0 is nothing was found)
570573
unique_lengths = np.zeros(num_kmers, dtype=np.uint32)
571574

575+
num_sequences = len(sequence_segments)
576+
572577
# For each kmer length
573578
for kmer_length in config.kmer_lengths:
574579
config.log("{} k-mers remaining".format((~finished_search).sum()))
@@ -620,8 +625,9 @@ def linear_search(config: SearchConfig,
620625
"do not match".format(len(kmer_indices), len(count_list))
621626

622627
unique_lengths[kmer_indices] = np.where(
623-
# Where the count is 1
624-
(count_list == 1) &
628+
# Where the count is equal to the number of sequences to search on
629+
# (i.e. a unique count per sequence specified)
630+
(count_list == num_sequences) &
625631
# And if there is no current unique length recorded
626632
(unique_lengths[kmer_indices] == 0),
627633
# Record the minimum kmer length found if it less than the current
@@ -644,20 +650,19 @@ def get_kmer_counts(config: SearchConfig,
644650
kmer_lengths: list[int]) -> npt.NDArray[np.uint32]:
645651

646652
count_list = np.zeros(len(sequence_indices), dtype=np.uint32)
647-
# For each index (usually just one)
648-
# NB: We assume a guarantee that we have > 0 indexes at this point
649-
for fmindex_filepath in config.fmindex_filepaths:
650653

651-
if len(config.fmindex_filepaths) > 1:
652-
config.log(f"Counting kmers from index: {fmindex_filepath}")
653-
654-
# For every sequence segment in the list of sequence segments for this
655-
# index (usually just one)
654+
# For every sequence segment in the list of sequence segments for this
655+
# index (usually just one)
656+
for fmindex_filepath in config.fmindex_filepaths:
657+
# For each index (usually just one)
658+
# NB: We assume a guarantee that we have > 0 indexes at this point
656659
for i, sequence in enumerate(sequences):
657660

658-
if len(sequences) > 1:
659-
# TODO: Print out sequence ID?
660-
config.log(f"Counting sequence #{i+1}")
661+
if (len(sequences) or
662+
len(config.fmindex_filepaths)) > 1:
663+
config.log(f"Counting k-mers from "
664+
f"{config.fmindex_filepaths[i]} in index "
665+
f"{fmindex_filepath}")
661666

662667
# Count the occurences of kmers on the forward strand
663668
count_list += np.array(count_kmers_from_sequence(

0 commit comments

Comments
 (0)