Skip to content

Commit f3f8b50

Browse files
committed
_gri_rid and JSON schema tweaks
1 parent a091dbe commit f3f8b50

7 files changed

Lines changed: 50 additions & 38 deletions

File tree

bindings/python/genomicsqlite.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
import sys
77
import json
88
import sqlite3
9-
from typing import Optional, NamedTuple, Dict
9+
from typing import Optional, NamedTuple, Dict, Any
1010
from ctypes.util import find_library
1111

1212
# One-time global initialization -- load the extension shared-library
@@ -106,13 +106,15 @@ def put_reference_sequence_sql(
106106
length: int,
107107
assembly: Optional[str] = None,
108108
refget_id: Optional[str] = None,
109+
meta: Optional[Dict[str, Any]] = None,
109110
rid: Optional[int] = None,
110111
schema: Optional[str] = None,
111112
):
113+
meta_json = json.dumps(meta) if meta else None
112114
return _execute1(
113115
conn,
114-
"SELECT put_genomic_reference_sequence_sql(?,?,?,?,?,?)",
115-
(name, length, assembly, refget_id, rid, schema),
116+
"SELECT put_genomic_reference_sequence_sql(?,?,?,?,?,?,?)",
117+
(name, length, assembly, refget_id, meta_json, rid, schema),
116118
)
117119

118120

@@ -131,7 +133,7 @@ def get_reference_sequences_by_rid(
131133
if schema:
132134
table = f"{schema}.{table}"
133135
sql = (
134-
"SELECT gri_rid, gri_refseq_name, gri_refseq_length, gri_assembly, gri_refget_id FROM "
136+
"SELECT _gri_rid, gri_refseq_name, gri_refseq_length, gri_assembly, gri_refget_id FROM "
135137
+ table
136138
)
137139
params = []

include/genomicsqlite.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,8 @@ char *genomic_range_rowids_sql(const char *indexed_table, sqlite3 *dbconn, const
7575
char *put_genomic_reference_assembly_sql(const char *assembly, const char *attached_schema);
7676
char *put_genomic_reference_sequence_sql(const char *name, sqlite3_int64 length,
7777
const char *assembly, const char *refget_id,
78-
sqlite3_int64 rid, const char *attached_schema);
78+
const char *meta_json, sqlite3_int64 rid,
79+
const char *attached_schema);
7980

8081
/*
8182
* C++ bindings: are liable to throw exceptions except where marked noexcept
@@ -116,6 +117,7 @@ std::string PutGenomicReferenceAssemblySQL(const std::string &assembly,
116117
std::string PutGenomicReferenceSequenceSQL(const std::string &name, sqlite3_int64 length,
117118
const std::string &assembly = "",
118119
const std::string &refget_id = "",
120+
const std::string &meta_json = "{}",
119121
sqlite3_int64 rid = -1,
120122
const std::string &attached_schema = "");
121123

loaders/sam_into_sqlite.cc

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ map<string, int> import_readgroups(const string &table_prefix, sam_hdr_t *hdr, S
3030
bool progress) {
3131
string ddl =
3232
"CREATE TABLE " + table_prefix +
33-
"readgroups(rg_id INTEGER PRIMARY KEY, rg_name TEXT NOT NULL UNIQUE, rg_tags_json TEXT NOT NULL)";
33+
"readgroups(rg_id INTEGER PRIMARY KEY, rg_name TEXT NOT NULL UNIQUE, rg_tags_json TEXT NOT NULL DEFAULT '{}')";
3434
progress &&cerr << ddl << endl;
3535
db.exec(ddl);
3636

@@ -228,7 +228,7 @@ int main(int argc, char *argv[]) {
228228
if (sam_hdr_find_tag_pos(hdr.get(), "SQ", rid, "M5", ks.get()) == 0) {
229229
m5 = ks->s;
230230
}
231-
string sql = PutGenomicReferenceSequenceSQL(name, length, "", m5, rid);
231+
string sql = PutGenomicReferenceSequenceSQL(name, length, "", m5, "{}", rid);
232232
if (rid == 0) {
233233
progress &&cerr << sql << endl;
234234
} else if (rid == 1) {
@@ -244,7 +244,7 @@ int main(int argc, char *argv[]) {
244244
// TODO: allow --append
245245
string ddl =
246246
"CREATE TABLE " + table_prefix +
247-
"reads(rowid INTEGER PRIMARY KEY, flag INTEGER NOT NULL, rid INTEGER REFERENCES __gri_refseq(gri_rid), pos INTEGER, endpos INTEGER, "
247+
"reads(rowid INTEGER PRIMARY KEY, flag INTEGER NOT NULL, rid INTEGER REFERENCES __gri_refseq(_gri_rid), pos INTEGER, endpos INTEGER, "
248248
"mapq INTEGER, cigar TEXT, rnext INTEGER, pnext INTEGER, tlen INTEGER, "
249249
"rg_id INTEGER REFERENCES " +
250250
table_prefix + "readgroups(rg_id))";
@@ -253,7 +253,7 @@ int main(int argc, char *argv[]) {
253253
"reads(rowid), qname TEXT, seq TEXT, qual TEXT)";
254254
ddl += ";\nCREATE TABLE " + table_prefix +
255255
"reads_tags(rowid INTEGER PRIMARY KEY REFERENCES " + table_prefix +
256-
"reads(rowid), tags_json TEXT)";
256+
"reads(rowid), tags_json TEXT NOT NULL DEFAULT '{}')";
257257

258258
progress &&cerr << ddl << endl;
259259
db->exec(ddl);

loaders/vcf_into_sqlite.cc

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -49,9 +49,9 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
4949

5050
OStringStream out;
5151
out << "CREATE TABLE " << table_prefix
52-
<< "variants (variant_rowid INTEGER NOT NULL PRIMARY KEY, rid INTEGER NOT NULL REFERENCES __gri_refseq(gri_rid), "
53-
"POS INTEGER NOT NULL, rlen INTEGER NOT NULL, ID_jsarray TEXT, REF TEXT NOT NULL, "
54-
"ALT_jsarray TEXT, QUAL REAL, FILTER_jsarray";
52+
<< "variants (variant_rowid INTEGER NOT NULL PRIMARY KEY, rid INTEGER NOT NULL REFERENCES __gri_refseq(_gri_rid), "
53+
"POS INTEGER NOT NULL, rlen INTEGER NOT NULL, ID_jsarray TEXT DEFAULT '[]', REF TEXT NOT NULL, "
54+
"ALT_jsarray TEXT NOT NULL DEFAULT '[]', QUAL REAL, FILTER_jsarray";
5555

5656
// INFO columns
5757
for (auto &hrec : info_hrecs) {
@@ -60,7 +60,7 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
6060
out << " INTEGER NOT NULL";
6161
} else if (hrec["Number"] != "1" &&
6262
(hrec["Type"] == "Integer" || hrec["Type"] == "Float")) {
63-
out << "_jsarray TEXT";
63+
out << "_jsarray TEXT DEFAULT '[]'";
6464
} else if (hrec["Type"] == "Integer") {
6565
out << " INTEGER";
6666
} else if (hrec["Type"] == "Float") {
@@ -102,7 +102,7 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
102102
out << "\n, " << hrec["ID"];
103103
if (hrec["Number"] != "1" &&
104104
(hrec["Type"] == "Integer" || hrec["Type"] == "Float")) {
105-
out << "_jsarray TEXT";
105+
out << "_jsarray TEXT DEFAULT '[]'";
106106
} else if (hrec["Type"] == "Integer") {
107107
out << " INTEGER";
108108
} else if (hrec["Type"] == "Float") {
@@ -736,7 +736,7 @@ int main(int argc, char *argv[]) {
736736
if (length <= 0 || errno) {
737737
throw runtime_error("invalid contig length in VCF header");
738738
}
739-
string sql = PutGenomicReferenceSequenceSQL(ctg["ID"], length, assembly, "", rid);
739+
string sql = PutGenomicReferenceSequenceSQL(ctg["ID"], length, assembly, "", "{}", rid);
740740
if (rid == 0) {
741741
progress &&cerr << sql << endl;
742742
} else if (rid == 1) {

src/genomicsqlite.cc

Lines changed: 26 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -434,8 +434,9 @@ static string gri_refseq_ddl(const string &schema) {
434434
}
435435
ostringstream out;
436436
out << "CREATE TABLE IF NOT EXISTS " << schema_prefix << "__gri_refseq"
437-
<< "(gri_rid INTEGER NOT NULL PRIMARY KEY, gri_refseq_name TEXT NOT NULL, gri_assembly TEXT,"
438-
<< " gri_refget_id TEXT UNIQUE, gri_refseq_length INTEGER NOT NULL, UNIQUE(gri_assembly,gri_refseq_name))"
437+
<< "(_gri_rid INTEGER NOT NULL PRIMARY KEY, gri_refseq_name TEXT NOT NULL, gri_assembly TEXT,"
438+
<< " gri_refget_id TEXT UNIQUE, gri_refseq_length INTEGER NOT NULL, gri_refseq_meta_json TEXT NOT NULL DEFAULT '{}', "
439+
<< "UNIQUE(gri_assembly,gri_refseq_name))"
439440
<< ";\nCREATE INDEX IF NOT EXISTS " << schema_prefix << "__gri_refseq_name ON "
440441
<< schema_prefix << "__gri_refseq(gri_refseq_name)";
441442
return out.str();
@@ -663,7 +664,8 @@ static void sqlfn_genomic_range_rowids_safe_sql(sqlite3_context *ctx, int argc,
663664

664665
string PutGenomicReferenceSequenceSQL(const string &name, sqlite3_int64 length,
665666
const string &assembly, const string &refget_id,
666-
sqlite3_int64 rid, const string &schema, bool with_ddl) {
667+
const string &meta_json, sqlite3_int64 rid,
668+
const string &schema, bool with_ddl) {
667669
string schema_prefix;
668670
if (!schema.empty()) {
669671
schema_prefix = schema + ".";
@@ -673,40 +675,45 @@ string PutGenomicReferenceSequenceSQL(const string &name, sqlite3_int64 length,
673675
out << gri_refseq_ddl(schema) << ";\n";
674676
}
675677
out << "INSERT INTO " << schema_prefix
676-
<< "__gri_refseq(gri_rid,gri_refseq_name,gri_assembly,gri_refget_id,gri_refseq_length) VALUES("
678+
<< "__gri_refseq(_gri_rid,gri_refseq_name,gri_assembly,gri_refget_id,gri_refseq_length,gri_refseq_meta_json) VALUES("
677679
<< (rid >= 0 ? std::to_string(rid) : "NULL") << "," << sqlquote(name) << ","
678680
<< (assembly.empty() ? "NULL" : sqlquote(assembly)) << ","
679681
<< (refget_id.empty() ? "NULL" : sqlquote(refget_id)) << "," << std::to_string(length)
680-
<< ")";
682+
<< "," << sqlquote(meta_json.empty() ? string("{}") : meta_json) << ")";
681683
return out.str();
682684
}
683685

684686
string PutGenomicReferenceSequenceSQL(const string &name, sqlite3_int64 length,
685687
const string &assembly, const string &refget_id,
686-
sqlite3_int64 rid, const string &schema) {
687-
return PutGenomicReferenceSequenceSQL(name, length, assembly, refget_id, rid, schema, true);
688+
const string &meta_json, sqlite3_int64 rid,
689+
const string &schema) {
690+
return PutGenomicReferenceSequenceSQL(name, length, assembly, refget_id, meta_json, rid, schema,
691+
true);
688692
}
689693

690694
extern "C" char *put_genomic_reference_sequence_sql(const char *name, sqlite3_int64 length,
691695
const char *assembly, const char *refget_id,
692-
sqlite3_int64 rid, const char *schema) {
693-
C_WRAPPER(PutGenomicReferenceSequenceSQL(name, length, assembly ? assembly : "",
694-
refget_id ? refget_id : "", rid, schema ? schema : "",
695-
true));
696+
const char *meta_json, sqlite3_int64 rid,
697+
const char *schema) {
698+
C_WRAPPER(PutGenomicReferenceSequenceSQL(
699+
name, length, assembly ? assembly : "", refget_id ? refget_id : "",
700+
meta_json ? meta_json : "{}", rid, schema ? schema : "", true));
696701
}
697702

698703
static void sqlfn_put_genomic_reference_sequence_sql(sqlite3_context *ctx, int argc,
699704
sqlite3_value **argv) {
700-
string name, assembly, refget_id, schema;
705+
string name, assembly, refget_id, meta_json = "{}", schema;
701706
sqlite3_int64 length, rid = -1;
702-
assert(argc >= 2 && argc <= 6);
707+
assert(argc >= 2 && argc <= 7);
703708
ARG_TEXT(name, 0)
704709
ARG(length, 1, SQLITE_INTEGER, int64)
705710
ARG_TEXT_OPTIONAL(assembly, 2)
706711
ARG_TEXT_OPTIONAL(refget_id, 3)
707-
ARG_OPTIONAL(rid, 4, SQLITE_INTEGER, int64)
708-
ARG_TEXT_OPTIONAL(schema, 5);
709-
SQL_WRAPPER(PutGenomicReferenceSequenceSQL(name, length, assembly, refget_id, rid, schema))
712+
ARG_TEXT_OPTIONAL(meta_json, 4)
713+
ARG_OPTIONAL(rid, 5, SQLITE_INTEGER, int64)
714+
ARG_TEXT_OPTIONAL(schema, 6);
715+
SQL_WRAPPER(
716+
PutGenomicReferenceSequenceSQL(name, length, assembly, refget_id, meta_json, rid, schema))
710717
}
711718

712719
struct hardcoded_refseq_t {
@@ -939,7 +946,7 @@ string PutGenomicReferenceAssemblySQL(const string &assembly, const string &sche
939946
const hardcoded_refseq_t &hcrs = hardcoded_refseqs[i];
940947
out << PutGenomicReferenceSequenceSQL(string(hcrs.name), hcrs.length, assembly,
941948
hcrs.refget_id ? string(hcrs.refget_id) : string(),
942-
-1, schema, i == 0);
949+
string("{}"), -1, schema, i == 0);
943950
}
944951
return out.str();
945952
}
@@ -963,7 +970,7 @@ GetGenomicReferenceSequencesByRid(sqlite3 *dbconn, const string &assembly, const
963970
string schema_prefix = schema.empty() ? "" : (schema + ".");
964971

965972
string query =
966-
"SELECT gri_rid, gri_refseq_name, gri_refseq_length, gri_assembly, gri_refget_id FROM " +
973+
"SELECT _gri_rid, gri_refseq_name, gri_refseq_length, gri_assembly, gri_refget_id FROM " +
967974
schema_prefix + "__gri_refseq";
968975
if (!assembly.empty()) {
969976
query += " WHERE gri_assembly = ?";
@@ -1054,6 +1061,7 @@ static int register_genomicsqlite_functions(sqlite3 *db, const char **pzErrMsg,
10541061
{FPNM(put_genomic_reference_sequence_sql), 4, 0},
10551062
{FPNM(put_genomic_reference_sequence_sql), 5, 0},
10561063
{FPNM(put_genomic_reference_sequence_sql), 6, 0},
1064+
{FPNM(put_genomic_reference_sequence_sql), 7, 0},
10571065
{FPNM(put_genomic_reference_assembly_sql), 1, 0},
10581066
{FPNM(put_genomic_reference_assembly_sql), 2, 0}};
10591067

test/test_gri.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,7 @@ def test_refseq():
218218
assert len(refseq_by_rid) == 195
219219

220220
query = (
221-
"SELECT gri_rid, beg, end, id FROM __gri_refseq, exons WHERE gri_refseq_name = rid AND exons._rowid_ IN "
221+
"SELECT __gri_refseq._gri_rid, rid, beg, end, id FROM exons, __gri_refseq WHERE exons.rid = gri_refseq_name AND exons._rowid_ IN "
222222
+ genomicsqlite.genomic_range_rowids_sql(con, "exons")
223223
)
224224
print("\n" + query)

test/test_vcf.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,8 @@ def test_gnomad_sites_small(tmp_path):
3030
con = genomicsqlite.connect(dbfile, read_only=True)
3131

3232
query = (
33-
"SELECT variant_rowid, id_jsarray FROM (SELECT gri_rid FROM __gri_refseq WHERE gri_refseq_name=?1) AS chrom, gnomad_variants WHERE variant_rowid IN"
34-
+ genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants", "chrom.gri_rid")
33+
"SELECT variant_rowid, id_jsarray FROM (SELECT _gri_rid AS rid FROM __gri_refseq WHERE gri_refseq_name=?1) AS query, gnomad_variants WHERE variant_rowid IN"
34+
+ genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants", "query.rid")
3535
)
3636
rs671 = ("chr12", 111803912, 111804012)
3737
print(query)
@@ -57,8 +57,8 @@ def test_gnomad_sites_small(tmp_path):
5757
results_rowids = set(vt[0] for vt in results)
5858
assert next(vt for vt in results if vt[1] and "rs671" in vt[1])
5959

60-
control = "SELECT variant_rowid FROM gnomad_variants WHERE NOT ((pos+rlen) < ? OR pos > ?)"
61-
control_rowids = set(vt[0] for vt in con.execute(control, (rs671[1], rs671[2])))
60+
control = "SELECT variant_rowid FROM gnomad_variants NATURAL JOIN __gri_refseq WHERE gri_refseq_name = ? AND NOT ((pos+rlen) < ? OR pos > ?)"
61+
control_rowids = set(vt[0] for vt in con.execute(control, rs671))
6262
assert len(control_rowids) == 22
6363
assert results_rowids == control_rowids
6464

0 commit comments

Comments
 (0)