|
| 1 | +version 1.0 |
| 2 | + |
# Integration test workflow: builds GenomicSQLite from a git revision, then
# runs the resulting library and loader executables against a reads file and a
# variants file.
workflow genomicsqlite_integration_tests {
    input {
        # revision of the GenomicSQLite repository to check out and build
        String git_revision = "main"

        # default test datasets, given as remote URIs coerced to File
        File reads = "https://s3.amazonaws.com/1000genomes/1000G_2504_high_coverage/data/ERR3239334/NA12878.final.cram" # !FileCoercion
        File variants = "gs://brain-genomics-public/research/cohort/1KGP/cohort_gatk/CCDG_13607_B01_GRM_WGS_2019-02-19_chr21.recalibrated_variants.vcf.gz" # !FileCoercion
    }

    # compile the shared library, Python module, and loader executables
    call build {
        input:
            git_revision = git_revision
    }

    # The two test calls below consume only build's outputs and do not depend
    # on each other.
    call test_vcf {
        input:
            vcf_into_sqlite = build.vcf_into_sqlite,
            genomicsqlite_py = build.genomicsqlite_py,
            libgenomicsqlite_so = build.libgenomicsqlite_so,
            variants = variants
    }

    call test_sam {
        input:
            sam_into_sqlite = build.sam_into_sqlite,
            genomicsqlite_py = build.genomicsqlite_py,
            libgenomicsqlite_so = build.libgenomicsqlite_so,
            reads = reads
    }
}
| 32 | +
|
# Clone GenomicSQLite at the requested revision, build it in Release mode, run
# its ctest suite, and export the shared library, the Python module, and the
# two loader executables for the downstream test tasks.
task build {
    input {
        # git revision (branch, tag, or commit) to check out before building
        String git_revision
    }

    command <<<
        set -euxo pipefail

        apt-get -qq update
        DEBIAN_FRONTEND=noninteractive apt-get -qq install -y \
            zip pigz wget build-essential git-core cmake libsqlite3-dev libzstd-dev libhts-dev python3-pip samtools tabix
        pip3 install pytest pytest-xdist

        # build libgenomicsqlite.so and loader executables
        git clone --recursive https://github.com/mlin/GenomicSQLite.git
        cd GenomicSQLite
        git checkout "~{git_revision}"
        # The recursive clone populated submodules at the default branch's
        # pins; re-sync them to whatever the checked-out revision records.
        git submodule update --init --recursive
        cmake -DCMAKE_BUILD_TYPE=Release -B build .
        cmake --build build -j 8
        # expose the Python package's __init__.py as a single-file module
        cp bindings/python/genomicsqlite/__init__.py genomicsqlite.py

        # test them
        # ($(pwd) is expanded by this shell, before env changes into build/)
        env -C build "LD_LIBRARY_PATH=$(pwd)" ctest -V >&2

        # sufficient runtime deps: sqlite3 samtools tabix libzstd1
        ldd build/loaders/sam_into_sqlite >&2
    >>>

    output {
        File libgenomicsqlite_so = "GenomicSQLite/build/libgenomicsqlite.so"
        File genomicsqlite_py = "GenomicSQLite/genomicsqlite.py"
        File sam_into_sqlite = "GenomicSQLite/build/loaders/sam_into_sqlite"
        File vcf_into_sqlite = "GenomicSQLite/build/loaders/vcf_into_sqlite"
    }

    runtime {
        docker: "ubuntu:20.04"
        cpu: 8
    }
}
| 73 | + |
# Load a reads file into a GenomicSQLite database with sam_into_sqlite, run a
# genomic range query against it through the Python module, report page
# compression statistics, then create and drop a QNAME-sorted copy of the
# sequences table. Outputs the database plus the database and input sizes.
task test_sam {
    input {
        # reads file to load
        File reads

        # build products from the build task
        File genomicsqlite_py
        File libgenomicsqlite_so
        File sam_into_sqlite
    }

    # database file is named after the input reads file
    String dbname = basename(reads) + ".gsql"

    command <<<
        set -euxo pipefail
        apt-get -qq update
        # python3 is invoked below, so install it explicitly instead of
        # depending on it being pulled in by another package
        DEBIAN_FRONTEND=noninteractive apt-get -qq install -y python3 sqlite3 samtools tabix libzstd1

        # install the module where ubuntu:20.04's python3 (3.8) will import it
        genomicsqlite_py_installed=/usr/lib/python3.8/genomicsqlite.py
        cp "~{genomicsqlite_py}" "$genomicsqlite_py_installed"
        cp "~{libgenomicsqlite_so}" /usr/local/lib/libgenomicsqlite.so
        ldconfig

        cp "~{sam_into_sqlite}" /usr/local/bin/sam_into_sqlite
        chmod +x /usr/local/bin/sam_into_sqlite

        # load database
        time sam_into_sqlite "~{reads}" "~{dbname}"
        >&2 ls -l "~{dbname}"

        # GRI query
        time python3 - <<"EOF"
        import sys
        import genomicsqlite
        dbconn = genomicsqlite.connect('~{dbname}', read_only=True)
        refseqs = genomicsqlite.get_reference_sequences_by_name(dbconn)
        query = 'SELECT count(*) FROM ' + genomicsqlite.genomic_range_rowids_sql(dbconn, 'reads')
        print(query, file=sys.stderr)
        row = next(dbconn.execute(query, (refseqs['chr12'].rid,111803912,111804012)))
        print(f'result = {row[0]}', file=sys.stderr)
        print(row[0])
        EOF

        # page compression stats
        time sqlite3 "~{dbname}" "SELECT meta1, count(*), avg(length(data)) FROM nested_vfs_zstd_pages GROUP BY meta1" >&2

        # add a QNAME-sorted seqs table
        # (the module is executed directly as a command-line program here)
        chmod +x "$genomicsqlite_py_installed"
        time "$genomicsqlite_py_installed" "~{dbname}" "CREATE TABLE reads_seqs_by_qname AS SELECT * from reads_seqs ORDER BY qname"
        >&2 ls -l "~{dbname}"
        time "$genomicsqlite_py_installed" "~{dbname}" "DROP TABLE reads_seqs_by_qname"
        >&2 ls -l "~{dbname}"
    >>>

    output {
        File reads_db = dbname
        # sizes in bytes, rounded to Int
        Int reads_db_size = round(size(dbname))
        Int reads_original_size = round(size(reads))
    }

    runtime {
        docker: "ubuntu:20.04"
        cpu: 8
    }
}
| 136 | + |
# Load a variants file into a GenomicSQLite database with vcf_into_sqlite, run
# a genomic range query against it through the Python module, and report page
# compression statistics. Outputs the database plus the database and input
# sizes.
task test_vcf {
    input {
        # variants file to load
        File variants

        # build products from the build task
        File genomicsqlite_py
        File libgenomicsqlite_so
        File vcf_into_sqlite
    }

    # database file is named after the input variants file
    String dbname = basename(variants) + ".gsql"

    command <<<
        set -euxo pipefail
        apt-get -qq update
        # python3 is invoked below, so install it explicitly instead of
        # depending on it being pulled in by another package
        DEBIAN_FRONTEND=noninteractive apt-get -qq install -y python3 sqlite3 samtools tabix libzstd1

        # install the module where ubuntu:20.04's python3 (3.8) will import it
        cp "~{genomicsqlite_py}" /usr/lib/python3.8/genomicsqlite.py
        cp "~{libgenomicsqlite_so}" /usr/local/lib/libgenomicsqlite.so
        ldconfig

        cp "~{vcf_into_sqlite}" /usr/local/bin/vcf_into_sqlite
        chmod +x /usr/local/bin/vcf_into_sqlite

        # load database
        time vcf_into_sqlite "~{variants}" "~{dbname}"

        # GRI query
        # NOTE(review): queries a chr21 region because the workflow's default
        # variants file is a chr21 subset (per its filename); adjust if a
        # different variants file is supplied.
        time python3 - <<"EOF"
        import sys
        import genomicsqlite
        dbconn = genomicsqlite.connect('~{dbname}', read_only=True)
        refseqs = genomicsqlite.get_reference_sequences_by_name(dbconn)
        query = 'SELECT count(*) FROM ' + genomicsqlite.genomic_range_rowids_sql(dbconn, 'variants')
        print(query, file=sys.stderr)
        row = next(dbconn.execute(query, (refseqs['chr21'].rid, 34787801, 35049344)))
        print(f'result = {row[0]}', file=sys.stderr)
        print(row[0])
        EOF

        # page compression stats
        time sqlite3 "~{dbname}" "SELECT meta1, count(*), avg(length(data)) FROM nested_vfs_zstd_pages GROUP BY meta1" >&2
    >>>

    output {
        File variants_db = dbname
        # sizes in bytes, rounded to Int
        Int variants_db_size = round(size(dbname))
        Int variants_original_size = round(size(variants))
    }

    runtime {
        docker: "ubuntu:20.04"
        cpu: 8
    }
}
0 commit comments