@@ -442,22 +442,37 @@ void cache_payloads(
442442
443443 const handlegraph::HandleGraph* graph_ptr = (const handlegraph::HandleGraph*) &gbz.graph ;
444444
445+ double total_zipcode_time = 0.0 , total_decoder_time = 0.0 ;
446+ std::atomic<uint64_t > node_count = 0 ;
445447 gbz.graph .for_each_handle ([&](const handle_t & handle) {
446448 nid_t node_id = gbz.graph .get_id (handle);
447- ZipCode zipcode;
448449 pos_t pos = make_pos_t (node_id, false , 0 );
449- zipcode.fill_in_zipcode_from_pos (distance_index, pos, true , graph_ptr);
450+ ZipCode zipcode;
451+ zipcode.fill_in_zipcode_from_pos (distance_index, pos, false , graph_ptr);
452+ zipcode.fill_in_full_decoder ();
453+ if (++node_count % 10000 == 0 && progress) {
454+ double telapsed = gbwt::readTimer () - start;
455+ #pragma omp critical (cerr)
456+ std::cerr << " Cached " << node_count << " nodes in " << telapsed << " s" << std::endl;
457+ }
458+
450459 payload_t payload = zipcode.get_payload_from_zip ();
451460 if (payload == MIPayload::NO_CODE && oversized_zipcodes != nullptr ) {
452461 // The zipcode is too large for the payload field.
453462 // Add it to the oversized zipcode list.
454- zipcode.fill_in_full_decoder ();
455- size_t offset = oversized_zipcodes->size ();
456- oversized_zipcodes->emplace_back (zipcode);
463+ size_t offset;
464+ #pragma omp critical (cache_payloads_zipcodes)
465+ {
466+ offset = oversized_zipcodes->size ();
467+ oversized_zipcodes->emplace_back (zipcode);
468+ }
457469 payload = { 0 , offset };
458470 }
459- node_id_to_payload.emplace (node_id, payload);
460- });
471+ #pragma omp critical (cache_payloads_map)
472+ {
473+ node_id_to_payload.emplace (node_id, payload);
474+ }
475+ }, true );
461476
462477 if (progress) {
463478 double seconds = gbwt::readTimer () - start;
@@ -521,6 +536,15 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index(
521536 // A zipcode only depends on the node id.
522537 vg::hash_map<nid_t , payload_t > node_id_to_payload;
523538 node_id_to_payload.reserve (gbz.graph .max_node_id () - gbz.graph .min_node_id ());
539+ // Preload the distance index right before we use it.
540+ // find_frequent_kmers uses a lot of memory/IO scanning the whole graph
541+ // and might evict the mmap'd index pages from the OS page cache,
542+ // causing cache_payloads to page-fault on every node. So we preload
543+ // after kmer counting to ensure the index is warm.
544+ if (params.progress ) {
545+ std::cerr << " Preloading distance index" ;
546+ }
547+ distance_index->preload (true );
524548 cache_payloads (gbz, *distance_index, node_id_to_payload, oversized_zipcodes, params.progress );
525549
526550 auto get_payload = [&](const pos_t & pos) -> const code_type* {
0 commit comments