Skip to content

Commit 83e1d6c

Browse files
committed
optimize GRI depth detection
1 parent 662f5e0 commit 83e1d6c

2 files changed

Lines changed: 68 additions & 46 deletions

File tree

loaders/vcf_into_sqlite.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
6767
out << ",Description=" << desc->second;
6868
}
6969
}
70-
out << "\n, FOREIGN KEY (rid) REFERENCES __gri_refseq(rid))";
70+
out << "\n, FOREIGN KEY (rid) REFERENCES __gri_refseq(gri_rid))";
7171

7272
if (!format_hrecs.empty()) {
7373
// TODO: include metadata from SAMPLE header lines

src/genomicsqlite.cc

Lines changed: 67 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ const sqlite3_int64 GRI_BIN_OFFSETS[] = {
393393
1 + 16 + 256 + 4096 + 65536 + 1048576 + 16777216,
394394
1 + 16 + 256 + 4096 + 65336 + 1048576 + 16777216 + 268435456,
395395
};
396-
const sqlite_int64 GRI_POS_OFFSETS[] = {
396+
// Table with one entry per GRI level (index 0 through 8). The first and last
// entries are zero; the entries in between are descending powers of two whose
// exponents step down by four per level.
// NOTE(review): presumably a per-level position offset applied when computing
// a feature's bin -- confirm against the bin-assignment SQL.
const sqlite3_int64 GRI_POS_OFFSETS[] = {
    0,
    (sqlite3_int64)1 << 27, // 134217728
    (sqlite3_int64)1 << 23, // 8388608
    (sqlite3_int64)1 << 19, // 524288
    (sqlite3_int64)1 << 15, // 32768
    (sqlite3_int64)1 << 11, // 2048
    (sqlite3_int64)1 << 7,  // 128
    (sqlite3_int64)1 << 3,  // 8
    0,
};

// Exclusive upper bound on bin numbers: the first bin of the deepest level
// plus 2^32 (4294967296).
const sqlite3_int64 GRI_BIN_COUNT =
    GRI_BIN_OFFSETS[GRI_MAX_LEVEL] + ((sqlite3_int64)1 << 32);
@@ -495,66 +495,88 @@ static void sqlfn_create_genomic_range_index_sql(sqlite3_context *ctx, int argc,
495495
SQL_WRAPPER(CreateGenomicRangeIndexSQL(schema_table, rid, beg, end, max_depth))
496496
}
497497

498-
struct gri_properties {
498+
// Map a GRI bin number to the depth (level) whose bin-number range contains it.
//
// GRI_BIN_OFFSETS[d] is treated as the first bin number of depth d, so the
// answer is the largest d in [0, GRI_MAX_LEVEL] with GRI_BIN_OFFSETS[d] <= bin.
// The caller must pass 0 <= bin < GRI_BIN_COUNT (checked by assert only).
//
// NOTE(review): the last GRI_BIN_OFFSETS entry is written with the term 65336
// where the preceding entries use 65536. If that is a typo, the boundary
// between the two deepest levels tested here is 200 bins too low -- confirm.
static int gri_bin_depth(sqlite3_int64 bin) {
    assert(bin >= 0 && bin < GRI_BIN_COUNT);
    int depth = 0;
    // Advance while the bin lies at or beyond the start of the next level.
    while (depth < GRI_MAX_LEVEL && bin >= GRI_BIN_OFFSETS[depth + 1]) {
        ++depth;
    }
    return depth;
}
507+
508+
struct gri_depth_range_t {
499509
int min_depth = 0, max_depth = GRI_MAX_LEVEL;
500510
};
501511

502-
static gri_properties InspectGRI(sqlite3 *dbconn, const string &schema_table) {
503-
// find the range of nonempty bin levels [min_depth..max_depth]
504-
gri_properties ans;
512+
// Find the range of depths (levels) occupied by rows in a table's genomic
// range index.
//
// dbconn:       open SQLite connection used to prepare and run the query.
// schema_table: table name, optionally schema-qualified. The index is expected
//               to be named "<table>__gri", where <table> is the second part
//               returned by split_schema_table().
//
// Returns the depths of the smallest and largest _gri_bin found. A bound for
// which the query yields no integer value keeps the gri_depth_range_t default,
// so a table with no indexed rows reports the full range [0, GRI_MAX_LEVEL].
//
// Throws runtime_error if the query cannot be prepared (e.g. the table or its
// index does not exist) or if stepping it ends in anything but SQLITE_DONE.
static gri_depth_range_t DetectDepthRange(sqlite3 *dbconn, const string &schema_table) {
    string table = split_schema_table(schema_table).second;

    // Detect min & max bin depth (level) occupied in the table's GRI. Since bin numbers increase
    // with depth, we can find the min and max bin numbers & then figure their respective depths.
    //
    // We'd like to write simply SELECT MIN(_gri_bin), MAX(_gri_bin) ... and trust SQLite to plan
    // an efficient skip-scan of the GRI on (_gri_rid, _gri_bin, ...). Unfortunately it doesn't do
    // that, so instead we write some convoluted SQL that forces the efficient plan.
    //
    // This consists of --
    //   (i)   recursive CTE to find the set of relevant _gri_rid (because even
    //         SELECT DISTINCT _gri_rid ... triggers a full index scan)
    //   (ii)  for each _gri_rid: pick out the min/max bins with ORDER BY _gri_bin [DESC] LIMIT 1
    //         (the _gri_bin >= 0 predicate drops rows whose bin is NULL or negative)
    //   (iii) min() and max() over the per-rid answers
    // We do the (iii) aggregation externally to ensure SQLite only does one pass through the index.

    string tbl_gri = schema_table + " INDEXED BY " + table + "__gri";
    string query =
        "WITH RECURSIVE __distinct(__rid) AS\n"
        " (SELECT (SELECT _gri_rid FROM " +
        tbl_gri +
        " ORDER BY _gri_rid NULLS LAST LIMIT 1) AS __rid_0 WHERE __rid_0 IS NOT NULL\n"
        " UNION ALL\n"
        " SELECT (SELECT _gri_rid FROM " +
        tbl_gri +
        " WHERE _gri_rid > __rid ORDER BY _gri_rid LIMIT 1) AS __rid_i FROM __distinct WHERE __rid_i IS NOT NULL)\n"
        "SELECT\n"
        " (SELECT _gri_bin FROM " +
        tbl_gri +
        " WHERE _gri_rid = __rid AND _gri_bin >= 0 ORDER BY _gri_rid, _gri_bin LIMIT 1),\n"
        " (SELECT _gri_bin FROM " +
        tbl_gri +
        " WHERE _gri_rid = __rid AND _gri_bin >= 0 ORDER BY _gri_rid DESC, _gri_bin DESC LIMIT 1)\n"
        "FROM __distinct";
    _DBG << endl << query << endl;

    // The shared_ptr deleter finalizes the statement on every exit path,
    // including the throws below.
    shared_ptr<sqlite3_stmt> stmt;
    {
        sqlite3_stmt *pStmt = nullptr;
        if (sqlite3_prepare_v3(dbconn, query.c_str(), -1, 0, &pStmt, nullptr) != SQLITE_OK) {
            // Report both the likely cause and SQLite's own diagnostic. Previously two
            // consecutive throw statements stood here, leaving the second unreachable.
            throw runtime_error(string("GenomicSQLite: table has no genomic range index: ") +
                                sqlite3_errmsg(dbconn));
        }
        stmt = shared_ptr<sqlite3_stmt>(pStmt, sqlite3_finalize);
    }

    // One result row per distinct _gri_rid: column 0 is that rid's smallest bin, column 1 its
    // largest. The sentinels lie outside the valid bin range so "nothing found" is detectable.
    sqlite3_int64 min_bin = GRI_BIN_COUNT, max_bin = -1;
    int rc;
    while ((rc = sqlite3_step(stmt.get())) == SQLITE_ROW) {
        if (sqlite3_column_type(stmt.get(), 0) == SQLITE_INTEGER) {
            min_bin = min(min_bin, sqlite3_column_int64(stmt.get(), 0));
        }
        if (sqlite3_column_type(stmt.get(), 1) == SQLITE_INTEGER) {
            max_bin = max(max_bin, sqlite3_column_int64(stmt.get(), 1));
        }
    }
    if (rc != SQLITE_DONE) {
        throw runtime_error("GenomicSQLite: error inspecting genomic range index");
    }

    // set min/max depth based on min/max bin; untouched sentinels keep the defaults
    gri_depth_range_t ans;
    if (min_bin < GRI_BIN_COUNT) {
        ans.min_depth = gri_bin_depth(min_bin);
    }
    if (max_bin >= 0) {
        ans.max_depth = gri_bin_depth(max_bin);
    }
    assert(ans.min_depth >= 0 && ans.min_depth <= ans.max_depth && ans.max_depth < GRI_LEVELS);
    return ans;
}
@@ -585,9 +607,9 @@ static string FilterTerm(const string &indexed_table, const string &qbegs, const
585607

586608
string GenomicRangeRowidsSQL(const string &indexed_table, sqlite3 *dbconn, const string &qrid,
587609
const string &qbeg, const string &qend) {
588-
gri_properties table_gri;
610+
gri_depth_range_t table_gri;
589611
if (dbconn) {
590-
table_gri = InspectGRI(dbconn, indexed_table);
612+
table_gri = DetectDepthRange(dbconn, indexed_table);
591613
}
592614
string table = split_schema_table(indexed_table).second;
593615

0 commit comments

Comments
 (0)