Update tests with new search configuration

EricR86 · EricR86 · commit 9ebf88200de8 · 2026-03-24T13:21:32.000-04:00
diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml
@@ -71,7 +71,7 @@ jobs:
           # Make sure to be running newmap from installed site-packages
           cd tests
           python -m unittest discover
-          ./run_all.sh
+          ./test_end_to_end.sh
       - uses: actions/upload-artifact@v4
         if: ${{ failure() }}  # Only on failure attempt to upload core dump
         with:
diff --git a/newmap/search.py b/newmap/search.py
@@ -1,6 +1,6 @@
 from contextlib import ExitStack
 from collections.abc import Callable
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import partial
 from math import ceil, log2
 from pathlib import Path
@@ -31,6 +31,9 @@ def nil_search_log(*_: str):
 
 @dataclass(frozen=True)
 class SearchConfig:
+    # NB: The defaults here are not guaranteed to match the command-line
+    # interface defaults; they exist for Python interface convenience only
+
     # Position args
     fasta_filepaths: list[Path]
     fmindex_filepaths: list[Path]
@@ -39,22 +42,25 @@ class SearchConfig:
     kmer_lengths: list[int]
     is_binary_search: bool
 
-    use_reverse_complement: bool
-    output_directory: Path
+    use_reverse_complement: bool = True
+    output_directory: Path = Path.cwd()
 
-    include_sequence_ids: list[bytes]
-    exclude_sequence_ids: list[bytes]
+    # NB: Mutable defaults cannot be assigned directly since they would be
+    # shared between instances, so default_factory is used to create a new
+    # list for each instance (this applies to any dataclass, frozen or not)
+    include_sequence_ids: list[bytes] = field(default_factory=list)
+    exclude_sequence_ids: list[bytes] = field(default_factory=list)
 
     # Performance arguments
-    num_threads: int
-    kmer_batch_size: int
-    initial_search_length: int
+    num_threads: int = 1
+    kmer_batch_size: int = 1000000
+    initial_search_length: int = 0  # NB: default of none
 
     # Verbosity
     # NB: Only used when additional calculations are needed for logging
-    verbose: bool
-    # Logging function
-    log: Callable[[str], None]
+    verbose: bool = False
+    # Logging function (initialized based on verbosity)
+    log: Callable[[str], None] = field(init=False)
 
     @classmethod
     def from_args(cls, args):
@@ -154,11 +160,6 @@ def from_args(cls, args):
                 [s.encode() for s in
                  exclude_sequences_arg.split(SEQUENCE_ID_SEPARATOR)]
 
-        if args.verbose:
-            logging_function = partial(verbose_print, True)
-        else:
-            logging_function = nil_search_log
-
         return cls(
                    # Position args
                    fasta_filepaths=fasta_filenames,
@@ -174,7 +175,6 @@ def from_args(cls, args):
                    include_sequence_ids=include_sequence_ids,
                    exclude_sequence_ids=exclude_sequence_ids,
 
-                   log=logging_function,
                    verbose=args.verbose,
 
                    # Performance arguments
@@ -183,6 +183,16 @@ def from_args(cls, args):
                    initial_search_length=initial_search_length
         )
 
+    def __post_init__(self):
+        # Setup logging based on verbosity
+        if self.verbose:
+            logging_function = partial(verbose_print, True)
+        else:
+            logging_function = nil_search_log
+
+        # Use object.__setattr__ since a frozen dataclass blocks assignment
+        object.__setattr__(self, "log", logging_function)
+
 
 def write_unique_counts(config: SearchConfig):
 
diff --git a/tests/test_end_to_end.sh b/tests/test_end_to_end.sh
diff --git a/tests/test_unique_counts.py b/tests/test_unique_counts.py
@@ -9,7 +9,7 @@
 from newmap.main import (DEFAULT_COMPRESSION_RATIO,
                          DEFAULT_SEED_LENGTH)
 from newmap.index import generate_fm_index
-from newmap.search import write_unique_counts
+from newmap.search import SearchConfig, write_unique_counts
 
 # Expected minimum unique lengths at each position
 # NB: In order to manually count correctly, it is important to remember to
@@ -46,16 +46,15 @@ def test_linear_search(self):
         self.search(use_binary_search=False)
 
     def search(self, use_binary_search):
-        write_unique_counts(Path(self.fasta_filename),
-                            Path(self.genome_index_filename),
-                            15,  # Batch size
-                            list(range(4, 11)),  # Kmer lengths 4 to 10
-                            0,  # Initial search length
-                            [],  # Include chr ids
-                            [],  # Exclude chr ids
-                            False,  # no reverse complement
-                            self.num_threads,
-                            use_binary_search)
+
+        write_unique_counts(SearchConfig(
+            fasta_filepaths=[Path(self.fasta_filename)],
+            fmindex_filepaths=[Path(self.genome_index_filename)],
+            kmer_lengths=list(range(4, 11)),
+            kmer_batch_size=15,
+            is_binary_search=use_binary_search,
+            num_threads=self.num_threads,
+        ))
 
         # Check the results in chr1.unique.uint8 and chr2.unique.uint8
         chr1_results = np.fromfile('chr1.unique.uint8', dtype=np.uint8)
diff --git a/tests/test_unique_to_mappability.py b/tests/test_unique_to_mappability.py
@@ -7,7 +7,7 @@
 from newmap.main import (DEFAULT_COMPRESSION_RATIO,
                          DEFAULT_SEED_LENGTH)
 from newmap.index import generate_fm_index
-from newmap.search import write_unique_counts
+from newmap.search import SearchConfig, write_unique_counts
 from newmap.track import write_mappability_files
 
 
@@ -24,16 +24,14 @@ def setUpClass(cls):
                           DEFAULT_COMPRESSION_RATIO,
                           DEFAULT_SEED_LENGTH)
 
-        write_unique_counts(Path(cls.fasta_filename),
-                            Path(cls.genome_index_filename),
-                            15,  # Batch size
-                            list(range(4, 11)),  # Kmer lengths 4 to 10
-                            0,  # Initial search length
-                            [],  # Include chr ids
-                            [],  # Exclude chr ids
-                            False,  # no reverse complement
-                            cls.num_threads,
-                            use_binary_search=True)
+        write_unique_counts(SearchConfig(
+            fasta_filepaths=[Path(cls.fasta_filename)],
+            fmindex_filepaths=[Path(cls.genome_index_filename)],
+            kmer_lengths=list(range(4, 11)),
+            kmer_batch_size=15,
+            is_binary_search=True,
+            num_threads=cls.num_threads,
+        ))
 
     @classmethod
     def tearDownClass(cls):