Skip to content

Commit d29dd6e

Browse files
committed
"safe mode" for range queries
1 parent 0018e52 commit d29dd6e

5 files changed

Lines changed: 48 additions & 4 deletions

File tree

bindings/python/genomicsqlite.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,12 @@ def genomic_range_rowids_sql(
8585
qrid: Optional[str] = None,
8686
qbeg: Optional[str] = None,
8787
qend: Optional[str] = None,
88+
safe: bool = False,
8889
) -> str:
8990
return _execute1(
90-
conn, "SELECT genomic_range_rowids_sql(?,?,?,?)", (indexed_table, qrid, qbeg, qend)
91+
conn,
92+
f"SELECT genomic_range_rowids_{'safe_' if safe else ''}sql(?,?,?,?)",
93+
(indexed_table, qrid, qbeg, qend),
9194
)
9295

9396

include/genomicsqlite.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,32 @@ int genomicsqlite_open(const char *dbfile, sqlite3 **ppDb, int flags, const char
4040
char *genomicsqlite_vacuum_into_sql(const char *destfile, const char *config_json);
4141

4242
/*
43-
* Genomic range indexing
43+
* Generate SQL script to create a genomic range index on the specified table.
44+
* rid: column name for the reference sequence (chromosome) ID of each row
45+
* beg: column name or simple SQL expression for the interval begin position
46+
* end: column name or simple SQL expression for the interval end position
4447
*/
4548
char *create_genomic_range_index_sql(const char *table, const char *rid, const char *beg,
4649
const char *end, int max_depth);
50+
51+
/*
52+
* Generate parenthesized SELECT statement to query the existing genomic range index of the
53+
* specified table. The query finds table rows which overlap the query range, producing one column
54+
* of _rowid_ values.
55+
* dbconn: If provided, the procedure first inspects the index to collect information that may
56+
* allow it to optimize the generated query SQL. Recommended if the generated SQL will be
57+
* used repeatedly, which is the typical case.
58+
* A query optimized in this way must not be used after rows are added or updated, as it's
59+
* then liable to produce incomplete results. The SQL should be regenerated after such
60+
* changes.
61+
* Alternatively, if dbconn is NULL then a less-efficient version of the query is generated
62+
* which is safe to use across updates (also saving the small upfront cost of inspection).
63+
* qrid: query reference sequence (chromosome) ID; defaults to "?1" i.e. the first parameter of the
64+
* compiled statement. One could substitute a different parameter, a constant value, or a SQL
65+
* expression referring to columns of a joined table.
66+
* qbeg: query range begin position; defaults to "?2"
67+
* qend: query range end position; defaults to "?3"
68+
*/
4769
char *genomic_range_rowids_sql(const char *indexed_table, sqlite3 *dbconn, const char *qrid,
4870
const char *qbeg, const char *qend);
4971

@@ -78,6 +100,7 @@ int GenomicSQLiteOpen(const std::string &dbfile, sqlite3 **ppDb, int flags,
78100
std::unique_ptr<SQLite::Database> GenomicSQLiteOpen(const std::string &dbfile, int flags,
79101
const std::string &config_json = "{}");
80102
#endif
103+
81104
std::string GenomicSQLiteVacuumIntoSQL(const std::string &dbfile,
82105
const std::string &config_json = "{}");
83106

@@ -96,6 +119,7 @@ std::string PutGenomicReferenceSequenceSQL(const std::string &name, sqlite3_int6
96119
sqlite3_int64 rid = -1,
97120
const std::string &attached_schema = "");
98121

122+
/* Lookup helpers for stored reference sequence metadata (assumes it's finalized) */
99123
struct gri_refseq_t {
100124
unsigned long long rid, length;
101125
std::string name, assembly, refget_id;

src/genomicsqlite.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,17 @@ static void sqlfn_genomic_range_rowids_sql(sqlite3_context *ctx, int argc, sqlit
646646
GenomicRangeRowidsSQL(indexed_table, sqlite3_context_db_handle(ctx), qrid, qbeg, qend))
647647
}
648648

649+
/*
 * SQL function genomic_range_rowids_safe_sql(indexed_table[, qrid[, qbeg[, qend]]])
 *
 * "Safe mode" counterpart of sqlfn_genomic_range_rowids_sql: it forwards the same arguments to
 * GenomicRangeRowidsSQL, but passes nullptr where the sibling passes the calling connection's
 * handle (sqlite3_context_db_handle(ctx)).
 * NOTE(review): per the header comment on genomic_range_rowids_sql, a NULL dbconn yields the
 * less-efficient query text that stays valid across later row insertions/updates -- confirm that
 * the C++ GenomicRangeRowidsSQL overload follows the same contract.
 *
 * Arguments (by position in argv):
 *   0: indexed_table (required)
 *   1: qrid, falls back to "?1" when not supplied
 *   2: qbeg, falls back to "?2" when not supplied
 *   3: qend, falls back to "?3" when not supplied
 */
static void sqlfn_genomic_range_rowids_safe_sql(sqlite3_context *ctx, int argc,
650+
sqlite3_value **argv) {
651+
// Defaults refer to the first three bound parameters of the eventually-compiled statement.
string indexed_table, qrid = "?1", qbeg = "?2", qend = "?3";
652+
// The function is registered for 1 through 4 arguments; anything else is a programming error.
assert(argc >= 1 && argc <= 4);
653+
// presumably these macros read argv[i] as text into the named local -- verify macro definitions
ARG_TEXT(indexed_table, 0)
654+
ARG_TEXT_OPTIONAL(qrid, 1)
655+
ARG_TEXT_OPTIONAL(qbeg, 2)
656+
ARG_TEXT_OPTIONAL(qend, 3)
657+
// nullptr in the connection slot: no db handle is handed to the SQL generator here.
// presumably SQL_WRAPPER delivers the generated string (or an error) via ctx -- verify macro
SQL_WRAPPER(GenomicRangeRowidsSQL(indexed_table, nullptr, qrid, qbeg, qend))
658+
}
659+
649660
/**************************************************************************************************
650661
* reference sequence metadata (__gri_refseq) helpers
651662
**************************************************************************************************/
@@ -1034,6 +1045,10 @@ static int register_genomicsqlite_functions(sqlite3 *db, const char **pzErrMsg,
10341045
{FPNM(genomic_range_rowids_sql), 2, 0},
10351046
{FPNM(genomic_range_rowids_sql), 3, 0},
10361047
{FPNM(genomic_range_rowids_sql), 4, 0},
1048+
{FPNM(genomic_range_rowids_safe_sql), 1, 0},
1049+
{FPNM(genomic_range_rowids_safe_sql), 2, 0},
1050+
{FPNM(genomic_range_rowids_safe_sql), 3, 0},
1051+
{FPNM(genomic_range_rowids_safe_sql), 4, 0},
10371052
{FPNM(put_genomic_reference_sequence_sql), 2, 0},
10381053
{FPNM(put_genomic_reference_sequence_sql), 3, 0},
10391054
{FPNM(put_genomic_reference_sequence_sql), 4, 0},

test/test_gri.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ def fanout(query):
160160
assert " / 4096)" in query # level 6
161161
assert fanout(query) == 2
162162

163+
assert fanout(genomicsqlite.genomic_range_rowids_sql(con, "features", safe=True)[1:-1]) == 9
164+
163165
con.executescript(
164166
"""
165167
INSERT INTO features VALUES(43, NULL, 10000000000);

test/test_txdb.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,15 +83,15 @@ def test_txdbquery(genomicsqlite_txdb):
8383
control_query = f"SELECT _rowid_ FROM {tbl} NOT INDEXED WHERE {pfx}_chrom = ? AND NOT ({pfx}_end < ? OR {pfx}_start > ?) ORDER BY _rowid_"
8484

8585
total_results = 0
86-
for _ in range(1000):
86+
for _ in range(2000):
8787
chrom = random.choice(chroms)
8888
beg = random.randint(0, chrom[1] - 65536)
8989
end = beg + random.randint(1, random.choice([16, 256, 4096, 65536]))
9090
ids = list(row[0] for row in conn.execute(query, (chrom[0], beg, end)))
9191
control_ids = list(row[0] for row in conn.execute(control_query, (chrom[0], beg, end)))
9292
assert ids == control_ids
9393
total_results += len(control_ids)
94-
assert total_results in (3802, 1341)
94+
assert total_results in (7341, 2660)
9595

9696
# join cds to exon
9797
cds_exon_counts = (

0 commit comments

Comments
 (0)