Skip to content

Commit adad1df

Browse files
committed
add in-SQL dna_revcomp()
1 parent f8b8e71 commit adad1df

6 files changed

Lines changed: 136 additions & 17 deletions

File tree

docs/guide.md

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,9 +1026,20 @@ But this plan strongly depends on the contiguity assumption.
10261026
/* genomicsqlite_open("compressed.db", ...); */
10271027
```
10281028

1029-
#### Parse genomic range string
1029+
#### DNA reverse complement
10301030

1031-
These SQL functions process a string like "chr1:2,345-6,789" into its three parts (sequence/chromosome name, begin position, and end position).
1031+
Reverse-complements a DNA text value (containing only characters from the set `AGCTagct`), preserving original case.
1032+
1033+
=== "SQL"
1034+
``` sql
1035+
SELECT dna_revcomp('AGCTagct') -- 'agctAGCT'
1036+
```
1037+
1038+
Given NULL, returns NULL. Any other input is an error.
1039+
1040+
#### Parse genomic range text
1041+
1042+
These SQL functions process a text value like `'chr1:2,345-6,789'` into its three parts (sequence/chromosome name, begin position, and end position).
10321043

10331044
=== "SQL"
10341045
``` sql
@@ -1037,9 +1048,9 @@ These SQL functions process a string like "chr1:2,345-6,789" into its three part
10371048
SELECT parse_genomic_range_end('chr1:2,345-6,789', 3) -- 6789
10381049
```
10391050

1040-
<small>
1041-
[The begin position returned is one less than the text number](https://genome.ucsc.edu/FAQ/FAQtracks#tracks1), while the end position is equal to the text number.
1042-
</small>
1051+
❗ Since such text ranges are conventionally one-based and closed, `parse_genomic_range_begin()` effectively converts them to zero-based and half-open by [returning one less than the text begin position](https://genome.ucsc.edu/FAQ/FAQtracks#tracks1).
1052+
1053+
Given NULL, each function returns NULL. An error is raised if the text value can't be parsed, or for any other input type.
10431054

10441055
#### Two-bit encoding for nucleotide sequences
10451056

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This [SQLite3 loadable extension](https://www.sqlite.org/loadext.html) supports
88

99
* genomic range indexing for overlap queries & joins
1010
* streaming storage compression using multithreaded [Zstandard](https://facebook.github.io/zstd/)
11-
* in-SQL utility functions, e.g. parsing "chr1:2,345-6,789"
11+
* in-SQL utility functions, e.g. reverse complement DNA or parse "chr1:2,345-6,789"
1212
* pre-tuned settings for "big data"
1313

1414
This October 2020 poster discusses the context and long-run ambitions:

include/genomicsqlite.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,14 @@ size_t twobit_length(const void *data, size_t sz);
116116
void twobit_dna(const void *data, size_t sz, size_t ofs, size_t len, char *out);
117117
void twobit_rna(const void *data, size_t sz, size_t ofs, size_t len, char *out);
118118

119+
/* Reverse-complement DNA sequence. The out buffer must be preallocated len+1 bytes (a null
120+
* terminator will be affixed).
121+
* Returns:
122+
* 0: success
123+
* -1: encountered non-DNA character
124+
*/
125+
int dna_revcomp(const char *dna, size_t len, char *out);
126+
119127
/*
120128
* C++ bindings: are liable to throw exceptions except where marked noexcept
121129
*/
@@ -173,4 +181,7 @@ GetGenomicReferenceSequencesByRid(sqlite3 *dbconn, const std::string &assembly =
173181
std::map<std::string, gri_refseq_t>
174182
GetGenomicReferenceSequencesByName(sqlite3 *dbconn, const std::string &assembly = "",
175183
const std::string &attached_schema = "");
184+
185+
/* implementation underlying parse_genomic_range_{sequence,begin,end} */
186+
std::tuple<std::string, uint64_t, uint64_t> parse_genomic_range(const std::string &txt);
176187
#endif

src/genomicsqlite.cc

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1148,7 +1148,7 @@ extern "C" int nucleotides_twobit(const char *seq, size_t len, void *out) {
11481148
return -2;
11491149
}
11501150
assert(c_i >= 0 && c_i < 128);
1151-
const unsigned char crumb = dna_crumb_table[c_i];
1151+
const unsigned char crumb = dna_crumb_table[(unsigned char)c_i];
11521152
if (crumb > 3) {
11531153
return -1;
11541154
}
@@ -1432,6 +1432,68 @@ static void sqlfn_twobit_rna(sqlite3_context *ctx, int argc, sqlite3_value **arg
14321432
twobit_nucleotides(ctx, argc, argv, true);
14331433
}
14341434

1435+
/*
1436+
complement = [0xFF for i in range(256)]
1437+
for l,r in (
1438+
('A','T'), ('C','G'), ('G','C'), ('T','A'),
1439+
('a','t'), ('c','g'), ('g','c'), ('t','a'),
1440+
):
1441+
complement[ord(l)] = r
1442+
1443+
print(complement)
1444+
*/
1445+
const unsigned char dna_complement_table[] = {
1446+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1447+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1448+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1449+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1450+
0xFF, 'T', 0xFF, 'G', 0xFF, 0xFF, 0xFF, 'C', 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1451+
0xFF, 0xFF, 0xFF, 0xFF, 'A', 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1452+
0xFF, 't', 0xFF, 'g', 0xFF, 0xFF, 0xFF, 'c', 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1453+
0xFF, 0xFF, 0xFF, 0xFF, 'a', 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1454+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1455+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1456+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1457+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1458+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1459+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1460+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1461+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
1462+
1463+
extern "C" int dna_revcomp(const char *dna, size_t len, char *out) {
1464+
for (; len; --len, ++out)
1465+
if ((*out = dna_complement_table[(unsigned char)dna[len - 1]]) == 0xFF)
1466+
return -1;
1467+
*out = 0;
1468+
return 0;
1469+
}
1470+
1471+
static void sqlfn_dna_revcomp(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
1472+
assert(argc == 1);
1473+
const char *seq = nullptr;
1474+
ARG_TEXT_OPTIONAL(seq, 0);
1475+
if (!seq) {
1476+
return sqlite3_value_type(argv[0]) == SQLITE_NULL ? sqlite3_result_null(ctx)
1477+
: sqlite3_result_error_nomem(ctx);
1478+
}
1479+
1480+
auto seqlen = sqlite3_value_bytes(argv[0]);
1481+
assert(seqlen >= 0);
1482+
if (seqlen <= 0) {
1483+
return sqlite3_result_value(ctx, argv[0]);
1484+
}
1485+
1486+
try {
1487+
std::unique_ptr<char[]> buf(new char[seqlen + 1]);
1488+
if (dna_revcomp(seq, seqlen, buf.get()) < 0) {
1489+
return sqlite3_result_error(ctx, "non-DNA input to dna_revcomp()", -1);
1490+
}
1491+
return sqlite3_result_text(ctx, buf.get(), seqlen, SQLITE_TRANSIENT);
1492+
} catch (std::bad_alloc &) {
1493+
return sqlite3_result_error_nomem(ctx);
1494+
}
1495+
}
1496+
14351497
/**************************************************************************************************
14361498
* parse_genomic_range_*()
14371499
**************************************************************************************************/
@@ -1458,7 +1520,7 @@ static uint64_t parse_genomic_range_pos(const string &txt, size_t ofs1, size_t o
14581520
return ans;
14591521
}
14601522

1461-
static std::tuple<string, uint64_t, uint64_t> parse_genomic_range(const string &txt) {
1523+
std::tuple<string, uint64_t, uint64_t> parse_genomic_range(const string &txt) {
14621524
auto p1 = txt.find(':');
14631525
auto p2 = txt.find('-');
14641526
if (p1 == string::npos || p2 == string::npos || p1 < 1 || p2 < p1 + 2 || p2 >= txt.size() - 1) {
@@ -1482,10 +1544,14 @@ static std::tuple<string, uint64_t, uint64_t> parse_genomic_range(const string &
14821544

14831545
static void sqlfn_parse_genomic_range_sequence(sqlite3_context *ctx, int argc,
14841546
sqlite3_value **argv) {
1485-
string txt;
1486-
ARG_TEXT(txt, 0);
1547+
const char *txt = nullptr;
1548+
ARG_TEXT_OPTIONAL(txt, 0);
1549+
if (!txt) {
1550+
return sqlite3_value_type(argv[0]) == SQLITE_NULL ? sqlite3_result_null(ctx)
1551+
: sqlite3_result_error_nomem(ctx);
1552+
}
14871553
try {
1488-
auto t = parse_genomic_range(txt);
1554+
auto t = parse_genomic_range(string(txt, sqlite3_value_bytes(argv[0])));
14891555
auto &chrom = get<0>(t);
14901556
return sqlite3_result_text(ctx, chrom.c_str(), chrom.size(), SQLITE_TRANSIENT);
14911557
} catch (std::exception &exn) {
@@ -1494,21 +1560,29 @@ static void sqlfn_parse_genomic_range_sequence(sqlite3_context *ctx, int argc,
14941560
}
14951561

14961562
static void sqlfn_parse_genomic_range_begin(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
1497-
string txt;
1498-
ARG_TEXT(txt, 0);
1563+
const char *txt = nullptr;
1564+
ARG_TEXT_OPTIONAL(txt, 0);
1565+
if (!txt) {
1566+
return sqlite3_value_type(argv[0]) == SQLITE_NULL ? sqlite3_result_null(ctx)
1567+
: sqlite3_result_error_nomem(ctx);
1568+
}
14991569
try {
1500-
auto t = parse_genomic_range(txt);
1570+
auto t = parse_genomic_range(string(txt, sqlite3_value_bytes(argv[0])));
15011571
return sqlite3_result_int64(ctx, get<1>(t));
15021572
} catch (std::exception &exn) {
15031573
sqlite3_result_error(ctx, exn.what(), -1);
15041574
}
15051575
}
15061576

15071577
static void sqlfn_parse_genomic_range_end(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
1508-
string txt;
1509-
ARG_TEXT(txt, 0);
1578+
const char *txt = nullptr;
1579+
ARG_TEXT_OPTIONAL(txt, 0);
1580+
if (!txt) {
1581+
return sqlite3_value_type(argv[0]) == SQLITE_NULL ? sqlite3_result_null(ctx)
1582+
: sqlite3_result_error_nomem(ctx);
1583+
}
15101584
try {
1511-
auto t = parse_genomic_range(txt);
1585+
auto t = parse_genomic_range(string(txt, sqlite3_value_bytes(argv[0])));
15121586
return sqlite3_result_int64(ctx, get<2>(t));
15131587
} catch (std::exception &exn) {
15141588
sqlite3_result_error(ctx, exn.what(), -1);
@@ -1563,6 +1637,7 @@ static int register_genomicsqlite_functions(sqlite3 *db, const char **pzErrMsg,
15631637
{FPNM(twobit_rna), 1, SQLITE_DETERMINISTIC},
15641638
{FPNM(twobit_rna), 2, SQLITE_DETERMINISTIC},
15651639
{FPNM(twobit_rna), 3, SQLITE_DETERMINISTIC},
1640+
{FPNM(dna_revcomp), 1, SQLITE_DETERMINISTIC},
15661641
{FPNM(parse_genomic_range_sequence), 1, SQLITE_DETERMINISTIC},
15671642
{FPNM(parse_genomic_range_begin), 1, SQLITE_DETERMINISTIC},
15681643
{FPNM(parse_genomic_range_end), 1, SQLITE_DETERMINISTIC}};

test/test_parse_genomic_range.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,5 @@ def test_parse_genomic_range():
3838
with pytest.raises(sqlite3.OperationalError):
3939
con.execute("SELECT parse_genomic_range_end(?)", (txt,))
4040
assert "parse_genomic_range():" in str(exc.value)
41+
42+
assert next(con.execute("SELECT parse_genomic_range_end(NULL)"))[0] is None

test/test_twobit.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import random
22
import math
3+
import sqlite3
4+
import pytest
35
import genomicsqlite
46

57

@@ -98,3 +100,21 @@ def test_twobit_column():
98100
("bar",),
99101
("GATTACA",),
100102
]
103+
104+
105+
def test_dna_revcomp():
106+
con = genomicsqlite.connect(":memory:")
107+
108+
assert next(con.execute("SELECT dna_revcomp('AGCTagct')"))[0] == "agctAGCT"
109+
assert next(con.execute("SELECT dna_revcomp('gATtaCa')"))[0] == "tGtaATc"
110+
assert next(con.execute("SELECT dna_revcomp('')"))[0] == ""
111+
assert next(con.execute("SELECT dna_revcomp(NULL)"))[0] is None
112+
113+
with pytest.raises(sqlite3.OperationalError):
114+
con.execute("SELECT dna_revcomp('GATTACAb')")
115+
116+
with pytest.raises(sqlite3.OperationalError):
117+
con.execute("SELECT dna_revcomp('GATTACA ')")
118+
119+
with pytest.raises(sqlite3.OperationalError):
120+
con.execute("SELECT dna_revcomp(42)")

0 commit comments

Comments
 (0)