Skip to content

Commit 9b48cfe

Browse files
authored
Merge pull request #4860 from vgteam/parallel-payload-caching
Parallel payload caching
2 parents 85b2f44 + 3c27df0 commit 9b48cfe

6 files changed

Lines changed: 52 additions & 26 deletions

File tree

src/gbwtgraph_helper.cpp

Lines changed: 31 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -442,22 +442,37 @@ void cache_payloads(
442442

443443
const handlegraph::HandleGraph* graph_ptr = (const handlegraph::HandleGraph*) &gbz.graph;
444444

445+
double total_zipcode_time = 0.0, total_decoder_time = 0.0;
446+
std::atomic<uint64_t> node_count = 0;
445447
gbz.graph.for_each_handle([&](const handle_t& handle) {
446448
nid_t node_id = gbz.graph.get_id(handle);
447-
ZipCode zipcode;
448449
pos_t pos = make_pos_t(node_id, false, 0);
449-
zipcode.fill_in_zipcode_from_pos(distance_index, pos, true, graph_ptr);
450+
ZipCode zipcode;
451+
zipcode.fill_in_zipcode_from_pos(distance_index, pos, false, graph_ptr);
452+
zipcode.fill_in_full_decoder();
453+
if (++node_count % 10000 == 0 && progress) {
454+
double telapsed = gbwt::readTimer() - start;
455+
#pragma omp critical (cerr)
456+
std::cerr << " Cached " << node_count << " nodes in " << telapsed << "s" << std::endl;
457+
}
458+
450459
payload_t payload = zipcode.get_payload_from_zip();
451460
if (payload == MIPayload::NO_CODE && oversized_zipcodes != nullptr) {
452461
// The zipcode is too large for the payload field.
453462
// Add it to the oversized zipcode list.
454-
zipcode.fill_in_full_decoder();
455-
size_t offset = oversized_zipcodes->size();
456-
oversized_zipcodes->emplace_back(zipcode);
463+
size_t offset;
464+
#pragma omp critical (cache_payloads_zipcodes)
465+
{
466+
offset = oversized_zipcodes->size();
467+
oversized_zipcodes->emplace_back(zipcode);
468+
}
457469
payload = { 0, offset };
458470
}
459-
node_id_to_payload.emplace(node_id, payload);
460-
});
471+
#pragma omp critical (cache_payloads_map)
472+
{
473+
node_id_to_payload.emplace(node_id, payload);
474+
}
475+
}, true);
461476

462477
if (progress) {
463478
double seconds = gbwt::readTimer() - start;
@@ -521,6 +536,15 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index(
521536
// A zipcode only depends on the node id.
522537
vg::hash_map<nid_t, payload_t> node_id_to_payload;
523538
node_id_to_payload.reserve(gbz.graph.max_node_id() - gbz.graph.min_node_id());
539+
// Preload the distance index right before we use it.
540+
// find_frequent_kmers uses a lot of memory/IO scanning the whole graph
541+
// and might evict the mmap'd index pages from the OS page cache,
542+
// causing cache_payloads to page-fault on every node. So we preload
543+
// after kmer counting to ensure the index is warm.
544+
if (params.progress) {
545+
std::cerr << "Preloading distance index";
546+
}
547+
distance_index->preload(true);
524548
cache_payloads(gbz, *distance_index, node_id_to_payload, oversized_zipcodes, params.progress);
525549

526550
auto get_payload = [&](const pos_t& pos) -> const code_type* {

src/snarl_distance_index.cpp

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -814,10 +814,12 @@ static void populate_hub_labeling(SnarlDistanceIndex::TemporaryDistanceIndex& te
814814
/**
815815
* Determine if a snarl is regular or not.
816816
*
817-
* A regular snarl is a snarl that, while not simple, consists of only nodes or
817+
* A regular snarl is a snarl that consists of only nodes or
818818
* chains connected to the start and end, without any connections between
819819
* multiple children, or any way to turn around. There may be an edge directly
820820
* across.
821+
*
822+
* A simple snarl is always regular.
821823
*/
822824
static bool check_regularity(const SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector<SnarlDistanceIndex::temp_record_ref_t>& all_children, const HandleGraph* graph);
823825

@@ -1661,11 +1663,11 @@ bool check_regularity(const SnarlDistanceIndex::TemporaryDistanceIndex& temp_ind
16611663
return false;
16621664
}
16631665
if (temp_snarl_record.is_simple) {
1664-
// Simple snarls can't be regular because simple is more specific and useful.
1666+
// Simple snarls are always also regular.
16651667
#ifdef debug_distance_indexing
1666-
std::cerr << "Snarl is not regular because it is simple." << std::endl;
1668+
std::cerr << "Snarl is regular because it is simple." << std::endl;
16671669
#endif
1668-
return false;
1670+
return true;
16691671
}
16701672

16711673
// Get the snarl boundary nodes, facing out

src/subcommand/minimizer_main.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -88,10 +88,11 @@ int main_minimizer(int argc, char** argv) {
8888
if (!config.distance_name.empty()) {
8989
// new distance index
9090
if (config.progress) {
91-
logger.info() << "Loading SnarlDistanceIndex from " << config.distance_name << std::endl;
91+
logger.info() << "Opening SnarlDistanceIndex at " << config.distance_name << std::endl;
9292
}
9393
distance_index = vg::io::VPKG::load_one<SnarlDistanceIndex>(config.distance_name);
94-
distance_index->preload(true);
94+
// Note that we don't fault in the index until we're actually about to
95+
// use it, or it might get paged out again.
9596
}
9697

9798
ZipCodeCollection oversized_zipcodes;

src/unittest/zip_code.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,10 @@ using namespace std;
117117
bool chain_is_reversed = distance_index.is_reversed_in_parent(
118118
distance_index.get_node_net_handle(n1->id()));
119119

120+
// Node 4 is in snarl 3 to 6 which should be regular.
121+
// The zip codes are going to encode this so it had better be true.
122+
REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n4->id())))));
123+
120124
SECTION ("zip code for node on top-level chain") {
121125
ZipCode zipcode;
122126
zipcode.fill_in_zipcode_from_pos(distance_index, make_pos_t(n1->id(), 0, false));

src/zip_code.cpp

Lines changed: 7 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
#include "crash.hpp"
2+
13
#include "zip_code.hpp"
24

35
//#define DEBUG_ZIPCODE
@@ -16,10 +18,11 @@ void ZipCode::fill_in_zipcode_from_pos(const SnarlDistanceIndex& distance_index,
1618
//Put all ancestors of the node in a vector, starting from the node, and not including the root
1719
while (!distance_index.is_root(current_handle)) {
1820
ancestors.emplace_back(distance_index.start_end_traversal_of(current_handle));
19-
current_handle = distance_index.get_parent(current_handle);
21+
net_handle_t parent_handle = distance_index.get_parent(current_handle);
22+
crash_unless(parent_handle != current_handle);
23+
current_handle = parent_handle;
2024
}
2125

22-
2326
//Now add the root-level snarl or chain
2427
if (distance_index.is_root_snarl(current_handle)) {
2528
//First thing is a snarl, so add the snarl's connected component number
@@ -1065,11 +1068,7 @@ ZipCode::snarl_code_t ZipCode::get_regular_snarl_code(const net_handle_t& snarl,
10651068
snarl_code.set_code_type(1);
10661069

10671070
//The number of children
1068-
size_t child_count = 0;
1069-
distance_index.for_each_child(snarl, [&] (const net_handle_t& child) {
1070-
child_count++;
1071-
});
1072-
snarl_code.set_child_count(child_count);
1071+
snarl_code.set_child_count(distance_index.get_snarl_child_count(snarl));
10731072

10741073
//Chain prefix sum value for the start of the snarl, which is the prefix sum of the start node + length of the start node
10751074
net_handle_t start_node = distance_index.get_node_from_sentinel(distance_index.get_bound(snarl, false, false));
@@ -1100,11 +1099,7 @@ ZipCode::snarl_code_t ZipCode::get_irregular_snarl_code(const net_handle_t& snar
11001099
snarl_code.set_code_type(distance_index.is_dag(snarl) ? 0 : 2);
11011100

11021101
//The number of children
1103-
size_t child_count = 0;
1104-
distance_index.for_each_child(snarl, [&] (const net_handle_t& child) {
1105-
child_count++;
1106-
});
1107-
snarl_code.set_child_count(child_count);
1102+
snarl_code.set_child_count(distance_index.get_snarl_child_count(snarl));
11081103

11091104
//Chain prefix sum value for the start of the snarl, which is the prefix sum of the start node + length of the start node
11101105
net_handle_t start_node = distance_index.get_node_from_sentinel(distance_index.get_bound(snarl, false, false));

0 commit comments

Comments
 (0)