Skip to content

Commit f6926eb

Browse files
committed
add genomicsqlite_big_tests.wdl
1 parent 8f3a1ac commit f6926eb

1 file changed

Lines changed: 190 additions & 0 deletions

File tree

test/genomicsqlite_big_tests.wdl

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
version 1.0
2+
3+
workflow genomicsqlite_integration_tests {
    # Builds GenomicSQLite at the requested revision, then feeds the build
    # artifacts to the SAM/CRAM loader test and the VCF loader test.
    input {
        # revision handed to `git checkout` inside the build task
        String git_revision = "main"

        # default test inputs, given as remote URIs
        File reads = "https://s3.amazonaws.com/1000genomes/1000G_2504_high_coverage/data/ERR3239334/NA12878.final.cram" # !FileCoercion
        File variants = "gs://brain-genomics-public/research/cohort/1KGP/cohort_gatk/CCDG_13607_B01_GRM_WGS_2019-02-19_chr21.recalibrated_variants.vcf.gz" # !FileCoercion
    }

    # compile the shared library, the Python module and both loader executables
    call build {
        input:
            git_revision = git_revision
    }

    # load the reads file into a database and query it
    call test_sam {
        input:
            sam_into_sqlite = build.sam_into_sqlite,
            genomicsqlite_py = build.genomicsqlite_py,
            libgenomicsqlite_so = build.libgenomicsqlite_so,
            reads = reads
    }

    # load the variants file into a database and query it
    call test_vcf {
        input:
            vcf_into_sqlite = build.vcf_into_sqlite,
            genomicsqlite_py = build.genomicsqlite_py,
            libgenomicsqlite_so = build.libgenomicsqlite_so,
            variants = variants
    }
}
32+
33+
task build {
    # Clones GenomicSQLite, checks out the requested revision, builds it with
    # CMake in Release mode, runs ctest, and exposes the shared library, the
    # Python module and the two loader executables as outputs.
    input {
        # branch, tag or commit to build
        String git_revision
    }

    command <<<
        set -euxo pipefail

        apt-get -qq update
        DEBIAN_FRONTEND=noninteractive apt-get -qq install -y \
            zip pigz wget build-essential git-core cmake libsqlite3-dev libzstd-dev libhts-dev python3-pip samtools tabix
        pip3 install pytest pytest-xdist

        # build libgenomicsqlite.so and loader executables
        git clone --recursive https://github.com/mlin/GenomicSQLite.git
        cd GenomicSQLite
        git checkout "~{git_revision}"
        # `clone --recursive` populated the submodules for the default branch;
        # re-sync them so they match whatever the checked-out revision pins.
        git submodule update --init --recursive
        cmake -DCMAKE_BUILD_TYPE=Release -B build .
        cmake --build build -j 8
        # expose the Python binding as a single flat module file
        cp bindings/python/genomicsqlite/__init__.py genomicsqlite.py

        # test them
        env -C build "LD_LIBRARY_PATH=$(pwd)" ctest -V >&2

        # sufficient runtime deps: sqlite3 samtools tabix libzstd1
        # (list the loader's shared-library dependencies on stderr for the log)
        ldd build/loaders/sam_into_sqlite >&2
    >>>

    output {
        File libgenomicsqlite_so = "GenomicSQLite/build/libgenomicsqlite.so"
        File genomicsqlite_py = "GenomicSQLite/genomicsqlite.py"
        File sam_into_sqlite = "GenomicSQLite/build/loaders/sam_into_sqlite"
        File vcf_into_sqlite = "GenomicSQLite/build/loaders/vcf_into_sqlite"
    }

    runtime {
        docker: "ubuntu:20.04"
        cpu: 8
    }
}
73+
74+
task test_sam {
    # Installs the build artifacts into the container, loads the reads file
    # into a GenomicSQLite database with sam_into_sqlite, then runs a genomic
    # range query, a page-compression summary, and a create/drop of a
    # QNAME-sorted copy of reads_seqs against it.
    input {
        # alignment file to load
        File reads

        # artifacts produced by the build task
        File genomicsqlite_py
        File libgenomicsqlite_so
        File sam_into_sqlite
    }

    # database file name, derived from the input file name
    String dbname = basename(reads) + ".gsql"

    command <<<
        set -euxo pipefail
        apt-get -qq update
        DEBIAN_FRONTEND=noninteractive apt-get -qq install -y sqlite3 samtools tabix libzstd1

        # install the Python module and shared library; the localized input
        # paths are quoted so that whitespace in them cannot split the arguments
        cp "~{genomicsqlite_py}" /usr/lib/python3.8/genomicsqlite.py
        cp "~{libgenomicsqlite_so}" /usr/local/lib/libgenomicsqlite.so
        ldconfig

        cp "~{sam_into_sqlite}" /usr/local/bin/sam_into_sqlite
        chmod +x /usr/local/bin/sam_into_sqlite

        # load database
        time sam_into_sqlite "~{reads}" "~{dbname}"
        >&2 ls -l "~{dbname}"

        # GRI query
        time python3 - <<"EOF"
        import sys
        import genomicsqlite
        dbconn = genomicsqlite.connect('~{dbname}', read_only=True)
        chr = genomicsqlite.get_reference_sequences_by_name(dbconn)
        query = 'SELECT count(*) FROM ' + genomicsqlite.genomic_range_rowids_sql(dbconn, 'reads')
        print(query, file=sys.stderr)
        row = next(dbconn.execute(query, (chr['chr12'].rid,111803912,111804012)))
        print(f'result = {row[0]}', file=sys.stderr)
        print(row[0])
        EOF

        # page compression stats
        time sqlite3 "~{dbname}" "SELECT meta1, count(*), avg(length(data)) FROM nested_vfs_zstd_pages GROUP BY meta1" >&2

        # add a QNAME-sorted seqs table
        # (the module file is made executable and invoked directly as a script)
        chmod +x /usr/lib/python3.8/genomicsqlite.py
        time /usr/lib/python3.8/genomicsqlite.py "~{dbname}" "CREATE TABLE reads_seqs_by_qname AS SELECT * from reads_seqs ORDER BY qname"
        >&2 ls -l "~{dbname}"
        time /usr/lib/python3.8/genomicsqlite.py "~{dbname}" "DROP TABLE reads_seqs_by_qname"
        >&2 ls -l "~{dbname}"
    >>>

    output {
        # the database file, plus size() of the database and of the original input rounded to Int
        File reads_db = dbname
        Int reads_db_size = round(size(dbname))
        Int reads_original_size = round(size(reads))
    }

    runtime {
        docker: "ubuntu:20.04"
        cpu: 8
    }
}
136+
137+
task test_vcf {
    # Installs the build artifacts into the container, loads the variants file
    # into a GenomicSQLite database with vcf_into_sqlite, then runs a genomic
    # range query and a page-compression summary against it.
    input {
        # variant file to load
        File variants

        # artifacts produced by the build task
        File genomicsqlite_py
        File libgenomicsqlite_so
        File vcf_into_sqlite
    }

    # database file name, derived from the input file name
    String dbname = basename(variants) + ".gsql"

    command <<<
        set -euxo pipefail
        apt-get -qq update
        DEBIAN_FRONTEND=noninteractive apt-get -qq install -y sqlite3 samtools tabix libzstd1

        # install the Python module and shared library; the localized input
        # paths are quoted so that whitespace in them cannot split the arguments
        cp "~{genomicsqlite_py}" /usr/lib/python3.8/genomicsqlite.py
        cp "~{libgenomicsqlite_so}" /usr/local/lib/libgenomicsqlite.so
        ldconfig

        cp "~{vcf_into_sqlite}" /usr/local/bin/vcf_into_sqlite
        chmod +x /usr/local/bin/vcf_into_sqlite

        # load database
        time vcf_into_sqlite "~{variants}" "~{dbname}"

        # GRI query
        # NOTE(review): the interval queried below is on chr12, while the
        # workflow's default variants file name indicates chr21 and a chr21
        # interval is left in the trailing comment -- confirm which is intended.
        time python3 - <<"EOF"
        import sys
        import genomicsqlite
        dbconn = genomicsqlite.connect('~{dbname}', read_only=True)
        chr = genomicsqlite.get_reference_sequences_by_name(dbconn)
        query = 'SELECT count(*) FROM ' + genomicsqlite.genomic_range_rowids_sql(dbconn, 'variants')
        print(query, file=sys.stderr)
        row = next(dbconn.execute(query, (chr['chr12'].rid,111803912,111804012))) #(chr['chr21'].rid, 34787801, 35049344)))
        print(f'result = {row[0]}', file=sys.stderr)
        print(row[0])
        EOF

        # page compression stats
        time sqlite3 "~{dbname}" "SELECT meta1, count(*), avg(length(data)) FROM nested_vfs_zstd_pages GROUP BY meta1" >&2
    >>>

    output {
        # the database file, plus size() of the database and of the original input rounded to Int
        File variants_db = dbname
        Int variants_db_size = round(size(dbname))
        Int variants_original_size = round(size(variants))
    }

    runtime {
        docker: "ubuntu:20.04"
        cpu: 8
    }
}

0 commit comments

Comments
 (0)