Skip to content

Commit 6e67634

Browse files
committed
vcf_into_sqlite: make WITHOUT ROWID optional
1 parent c5db76f commit 6e67634

2 files changed

Lines changed: 39 additions & 15 deletions

File tree

loaders/vcf_into_sqlite.cc

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ vector<map<string, string>> extract_hrecs(bcf_hdr_t *hdr, const char *key,
4545
}
4646

4747
string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_hrecs,
48-
vector<map<string, string>> &format_hrecs, int ploidy) {
48+
vector<map<string, string>> &format_hrecs, int ploidy,
49+
bool genotypes_without_rowid) {
4950

5051
OStringStream out;
5152
out << "CREATE TABLE " << table_prefix
@@ -120,7 +121,10 @@ string schemaDDL(const string &table_prefix, vector<map<string, string>> &info_h
120121
}
121122
}
122123
}
123-
out << "\n, PRIMARY KEY (variant_rowid, sample_id)) WITHOUT ROWID";
124+
out << "\n, PRIMARY KEY (variant_rowid, sample_id))";
125+
if (genotypes_without_rowid) {
126+
out << " WITHOUT ROWID";
127+
}
124128
}
125129

126130
return string(out.Get());
@@ -612,30 +616,33 @@ void help() {
612616
<< '\n'
613617
<< "vcf_into_sqlite [options] in.vcf|- out.db" << '\n'
614618
<< "Options: " << '\n'
615-
<< " --table-prefix PREFIX prefix to the name of each table created" << '\n'
616-
<< " --ploidy N set max ploidy => # GT columns (default 2)" << '\n'
617-
<< " --no-gri skip genomic range indexing" << '\n'
618-
<< " -l,--level LEVEL database compression level (-7 to 22, default 6)" << '\n'
619-
<< " -q,--quiet suppress progress information on standard error" << '\n'
620-
<< " -h,--help show this help message" << '\n'
619+
<< " --table-prefix PREFIX prefix to the name of each table created" << '\n'
620+
<< " --ploidy N set max ploidy => # GT columns (default 2)" << '\n'
621+
<< " --genotypes-without-rowid make the genotypes table WITHOUT ROWID (advantageous if there are few/no FORMAT fields)"
622+
<< '\n'
623+
<< " --no-gri skip genomic range indexing" << '\n'
624+
<< " -l,--level LEVEL database compression level (-7 to 22, default 6)" << '\n'
625+
<< " -q,--quiet suppress progress information on standard error" << '\n'
626+
<< " -h,--help show this help message" << '\n'
621627
<< '\n';
622628
}
623629

624630
int main(int argc, char *argv[]) {
625631
string table_prefix, infilename, outfilename;
626-
bool gri = true, progress = true;
632+
bool gri = true, progress = true, genotypes_without_rowid = false;
627633
int level = 6, ploidy = 2;
628634

629635
static struct option long_options[] = {{"help", no_argument, 0, 'h'},
630636
{"quiet", no_argument, 0, 'q'},
631637
{"level", required_argument, 0, 'l'},
632638
{"ploidy", required_argument, 0, 'p'},
639+
{"genotypes-without-rowid", no_argument, 0, 'R'},
633640
{"table-prefix", required_argument, 0, 't'},
634641
{"no-gri", no_argument, 0, 'G'},
635642
{0, 0, 0, 0}};
636643

637644
int c;
638-
while (-1 != (c = getopt_long(argc, argv, "hqGl:t:p:", long_options, nullptr))) {
645+
while (-1 != (c = getopt_long(argc, argv, "hqGRl:t:p:", long_options, nullptr))) {
639646
switch (c) {
640647
case 'h':
641648
help();
@@ -646,6 +653,9 @@ int main(int argc, char *argv[]) {
646653
case 'G':
647654
gri = false;
648655
break;
656+
case 'R':
657+
genotypes_without_rowid = true;
658+
break;
649659
case 't':
650660
table_prefix = optarg;
651661
break;
@@ -751,7 +761,8 @@ int main(int argc, char *argv[]) {
751761

752762
// formulate & apply DDL
753763
// TODO: allow --append
754-
string ddl = schemaDDL(table_prefix, info_hrecs, format_hrecs, ploidy);
764+
string ddl =
765+
schemaDDL(table_prefix, info_hrecs, format_hrecs, ploidy, genotypes_without_rowid);
755766
progress &&cerr << ddl << endl;
756767
db->exec(ddl);
757768

test/genomicsqlite_big_tests.wdl

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
version 1.0
22

3-
workflow genomicsqlite_integration_tests {
3+
workflow genomicsqlite_big_tests {
44
input {
55
String git_revision = "main"
66

@@ -74,6 +74,7 @@ task build {
7474
task test_sam {
7575
input {
7676
File reads
77+
String cram_ref_fa_gz = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz"
7778

7879
File genomicsqlite_py
7980
File libgenomicsqlite_so
@@ -84,8 +85,9 @@ task test_sam {
8485

8586
command <<<
8687
set -euxo pipefail
88+
TMPDIR=${TMPDIR:-/tmp}
8789
apt-get -qq update
88-
DEBIAN_FRONTEND=noninteractive apt-get -qq install -y sqlite3 samtools tabix libzstd1
90+
DEBIAN_FRONTEND=noninteractive apt-get -qq install -y sqlite3 samtools tabix libzstd1 pigz wget
8991
9092
cp ~{genomicsqlite_py} /usr/lib/python3.8/genomicsqlite.py
9193
cp ~{libgenomicsqlite_so} /usr/local/lib/libgenomicsqlite.so
@@ -94,8 +96,19 @@ task test_sam {
9496
cp ~{sam_into_sqlite} /usr/local/bin/sam_into_sqlite
9597
chmod +x /usr/local/bin/sam_into_sqlite
9698
99+
reads_file='~{reads}'
100+
if [[ $reads_file == *.cram ]]; then
101+
# CRAM given; make BAM to take reference downloads out of the timings
102+
wget -nv -O - '~{cram_ref_fa_gz}' | pigz -dc > "${TMPDIR}/cram_ref.fa"
103+
samtools faidx "${TMPDIR}/cram_ref.fa"
104+
bam_file="${TMPDIR}/$(basename reads_file .cram).bam"
105+
time samtools view -T "${TMPDIR}/cram_ref.fa" -h -O BAM -@ 8 "$reads_file" > "$bam_file"
106+
reads_file=$bam_file
107+
fi
108+
>&2 ls -l "$reads_file"
109+
97110
# load database
98-
time sam_into_sqlite "~{reads}" "~{dbname}"
111+
time sam_into_sqlite "$reads_file" "~{dbname}"
99112
>&2 ls -l "~{dbname}"
100113
101114
# GRI query
@@ -168,7 +181,7 @@ task test_vcf {
168181
chr = genomicsqlite.get_reference_sequences_by_name(dbconn)
169182
query = 'SELECT count(*) FROM ' + genomicsqlite.genomic_range_rowids_sql(dbconn, 'variants')
170183
print(query, file=sys.stderr)
171-
row = next(dbconn.execute(query, (chr['chr12'].rid,111803912,111804012))) #(chr['chr21'].rid, 34787801, 35049344)))
184+
row = next(dbconn.execute(query, (chr['chr21'].rid, 34787801, 35049344))) #(chr['chr12'].rid,111803912,111804012)))
172185
print(f'result = {row[0]}', file=sys.stderr)
173186
print(row[0])
174187
EOF

0 commit comments

Comments
 (0)