Skip to content

Commit a091dbe

Browse files
committed
tweak vcf schema
1 parent 7009d56 commit a091dbe

4 files changed

Lines changed: 19 additions & 21 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Genomics Extensions for SQLite
1+
# Genomics Extension for SQLite
22

33
![build](https://github.com/mlin/GenomicSQLite/workflows/build/badge.svg?branch=main)
44

loaders/vcf_into_sqlite.cc

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
4949

5050
OStringStream out;
5151
out << "CREATE TABLE " << table_prefix
52-
<< "variants (rowid INTEGER NOT NULL PRIMARY KEY, rid INTEGER NOT NULL REFERENCES __gri_refseq(gri_rid), "
52+
<< "variants (variant_rowid INTEGER NOT NULL PRIMARY KEY, rid INTEGER NOT NULL REFERENCES __gri_refseq(gri_rid), "
5353
"POS INTEGER NOT NULL, rlen INTEGER NOT NULL, ID_jsarray TEXT, REF TEXT NOT NULL, "
5454
"ALT_jsarray TEXT, QUAL REAL, FILTER_jsarray";
5555

@@ -82,11 +82,11 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
8282
if (!format_hrecs.empty()) {
8383
// TODO: include metadata from SAMPLE header lines
8484
out << ";\nCREATE TABLE " << table_prefix
85-
<< "samples (rowid INTEGER NOT NULL PRIMARY KEY, id TEXT NOT NULL)";
85+
<< "samples (sample_id INTEGER NOT NULL PRIMARY KEY, sample_name TEXT NOT NULL)";
8686
out << ";\nCREATE TABLE " << table_prefix
87-
<< "genotypes (variant INTEGER NOT NULL REFERENCES " << table_prefix
88-
<< "variants(rowid), sample INTEGER NOT NULL REFERENCES " << table_prefix
89-
<< "samples(rowid)";
87+
<< "genotypes (variant_rowid INTEGER NOT NULL REFERENCES " << table_prefix
88+
<< "variants(variant_rowid), sample_id INTEGER NOT NULL REFERENCES " << table_prefix
89+
<< "samples(sample_id)";
9090

9191
// FORMAT columns
9292
for (auto &hrec : format_hrecs) {
@@ -120,7 +120,7 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
120120
}
121121
}
122122
}
123-
out << "\n, PRIMARY KEY (variant, sample)) WITHOUT ROWID";
123+
out << "\n, PRIMARY KEY (variant_rowid, sample_id)) WITHOUT ROWID";
124124
}
125125

126126
return string(out.Get());
@@ -156,7 +156,8 @@ unique_ptr<SQLite::Statement> prepare_insert_genotype(const string &table_prefix
156156
}
157157

158158
void insert_samples(bcf_hdr_t *hdr, const string &table_prefix, SQLite::Database &db) {
159-
SQLite::Statement stmt(db, "INSERT INTO " + table_prefix + "samples(rowid,id) VALUES(?,?)");
159+
SQLite::Statement stmt(db, "INSERT INTO " + table_prefix +
160+
"samples(sample_id,sample_name) VALUES(?,?)");
160161

161162
for (sqlite3_int64 i = 0; i < bcf_hdr_nsamples(hdr); ++i) {
162163
stmt.bind(1, i);

test/test_bam.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,7 @@ def test_bam(tmp_path):
2121
bamfile = os.path.join(HERE, "data/NA12878.chr21:20000000-22500000.bam")
2222
dbfile = str(tmp_path / "test.bam.sqlite")
2323

24-
sam_into_sqlite(
25-
bamfile,
26-
str(dbfile),
27-
"--table-prefix",
28-
"NA12878_"
29-
)
24+
sam_into_sqlite(bamfile, str(dbfile), "--table-prefix", "NA12878_")
3025

3126
con = genomicsqlite.connect(dbfile, read_only=True)
3227

@@ -36,10 +31,14 @@ def test_bam(tmp_path):
3631
count = next(con.execute("SELECT COUNT(DISTINCT qname) FROM NA12878_reads_seqs"))[0]
3732
assert count == 299205
3833

39-
mq_hist = dict(con.execute("""
34+
mq_hist = dict(
35+
con.execute(
36+
"""
4037
SELECT mq, COUNT(*) as count FROM
4138
(SELECT ifnull(json_extract(tags_json, '$.MQ'),0) AS mq
4239
FROM NA12878_reads NATURAL JOIN NA12878_reads_tags WHERE (flag & 3840) = 0)
4340
GROUP BY mq ORDER BY mq DESC
44-
"""))
45-
assert (mq_hist[0],mq_hist[60]) == (2734,520522)
41+
"""
42+
)
43+
)
44+
assert (mq_hist[0], mq_hist[60]) == (2734, 520522)

test/test_vcf.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def test_gnomad_sites_small(tmp_path):
3030
con = genomicsqlite.connect(dbfile, read_only=True)
3131

3232
query = (
33-
"SELECT gnomad_variants._rowid_, id_jsarray FROM (SELECT gri_rid FROM __gri_refseq WHERE gri_refseq_name=?1) AS chrom, gnomad_variants WHERE gnomad_variants._rowid_ IN"
33+
"SELECT variant_rowid, id_jsarray FROM (SELECT gri_rid FROM __gri_refseq WHERE gri_refseq_name=?1) AS chrom, gnomad_variants WHERE variant_rowid IN"
3434
+ genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants", "chrom.gri_rid")
3535
)
3636
rs671 = ("chr12", 111803912, 111804012)
@@ -57,9 +57,7 @@ def test_gnomad_sites_small(tmp_path):
5757
results_rowids = set(vt[0] for vt in results)
5858
assert next(vt for vt in results if vt[1] and "rs671" in vt[1])
5959

60-
control = (
61-
"SELECT gnomad_variants.rowid FROM gnomad_variants WHERE NOT ((pos+rlen) < ? OR pos > ?)"
62-
)
60+
control = "SELECT variant_rowid FROM gnomad_variants WHERE NOT ((pos+rlen) < ? OR pos > ?)"
6361
control_rowids = set(vt[0] for vt in con.execute(control, (rs671[1], rs671[2])))
6462
assert len(control_rowids) == 22
6563
assert results_rowids == control_rowids

0 commit comments

Comments
 (0)