Skip to content

Commit 993e597

Browse files
committed
twobit: don't inflate single-nucleotide inputs
1 parent 76c7a04 commit 993e597

4 files changed

Lines changed: 58 additions & 30 deletions

File tree

docs/guide.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,9 +1037,9 @@ Storing a large database of sequences using such BLOBs instead of TEXT can impro
10371037
SELECT nucleotides_twobit('TCAG')
10381038
```
10391039

1040-
Given any TEXT value matching `[AaCcGgTtUu]+`, compute a two-bit-encoded BLOB value that can later be decoded using `twobit_dna()` or `twobit_rna()`. The two-bit encoding is case-insensitive and considers `T` and `U` equivalent.
1040+
Given a TEXT value consisting only of characters from `ACGTUacgtu`, compute a two-bit-encoded BLOB value that can later be decoded using `twobit_dna()` or `twobit_rna()`. Given any other ASCII TEXT value, pass it through unchanged. The encoding is case-insensitive and considers `T` and `U` equivalent.
10411041

1042-
Given any other ASCII TEXT value, including the empty string, pass it through unchanged. Given a BLOB, first attempt to coerce it to ASCII TEXT. Given NULL, return NULL. Any other input is an error.
1042+
Given a BLOB, first attempt to coerce it to ASCII TEXT. Given NULL, return NULL. Any other input is an error.
10431043

10441044
**↪ Two-bit decoding**
10451045

@@ -1051,7 +1051,7 @@ Given any other ASCII TEXT value, including the empty string, pass it through un
10511051
SELECT twobit_rna(nucleotides_twobit('UCAG'),Y,Z)
10521052
```
10531053

1054-
Given a BLOB value, perform two-bit decoding to produce a nucleotide sequence as uppercased TEXT, with `T`'s for `twobit_dna()` and `U`'s for `twobit_rna()`. Take care to only use BLOBs originally produced by the two-bit encoder, as any BLOB *will* decode to some nucleotide sequence.
1054+
Given a two-bit-encoded BLOB value, decode the nucleotide sequence as uppercased TEXT, with `T`'s for `twobit_dna()` and `U`'s for `twobit_rna()`. Take care to only use BLOBs originally produced by `nucleotides_twobit()`, as other BLOBs may decode to garbage nucleotide sequences.
10551055

10561056
Given a TEXT value, pass it through unchanged. Given NULL, return NULL. Any other first input is an error.
10571057

include/genomicsqlite.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,12 @@ char *put_genomic_reference_sequence_sql(const char *name, sqlite3_int64 length,
9393
*/
9494

9595
/*
96-
* Two-bit encode the nucleotide character sequence of specified length. The output buffer must be
97-
* preallocated (len+7)/4 bytes. Return codes:
98-
* 0 = success; wrote (len+7)/4 bytes
99-
* -1 = encountered non-nucleotide ASCII character
100-
* -2 = encountered non-ASCII character (e.g. UTF-8) or NUL
96+
* Two-bit encode the nucleotide character sequence of specified length. The output byte count is
97+
* a function of len as follows: len==0 => 0, len==1 => 1, else => (len+7)/4.
98+
* Returns:
99+
* n >= 0: success; wrote n bytes
100+
* -1: encountered non-nucleotide ASCII character
101+
* -2: encountered non-ASCII character (e.g. UTF-8) or NUL
101102
*/
102103
int nucleotides_twobit(const char *seq, size_t len, void *out);
103104

@@ -107,13 +108,13 @@ int nucleotides_twobit(const char *seq, size_t len, void *out);
107108
size_t twobit_length(const void *data, size_t sz);
108109

109110
/*
110-
* Given blob pointer, two-bit-decode the nucleotide subsequence [ofs, ofs+len). To get the whole
111+
* Given two-bit-encoded blob, decode the nucleotide subsequence [ofs, ofs+len). To get the whole
111112
* sequence, set ofs=0 & len=twobit_length(data, datasize). Caller must ensure that:
112113
* 1. ofs+len <= twobit_length(data, datasize)
113114
* 2. out is preallocated len+1 bytes (a null terminator will be affixed)
114115
*/
115-
void twobit_dna(const void *data, size_t ofs, size_t len, char *out);
116-
void twobit_rna(const void *data, size_t ofs, size_t len, char *out);
116+
void twobit_dna(const void *data, size_t sz, size_t ofs, size_t len, char *out);
117+
void twobit_rna(const void *data, size_t sz, size_t ofs, size_t len, char *out);
117118

118119
/*
119120
* C++ bindings: are liable to throw exceptions except where marked noexcept

src/genomicsqlite.cc

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,13 +1126,20 @@ const unsigned char dna_crumb_table[] = {
11261126
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
11271127

11281128
extern "C" int nucleotides_twobit(const char *seq, size_t len, void *out) {
1129+
if (len == 0) {
1130+
return 0;
1131+
}
1132+
11291133
unsigned char *outbyte = (unsigned char *)out;
11301134

1131-
// header byte: the low two bits specify how many crumbs at the end of the buffer must be
1132-
// ignored by the decoder (0, 1, 2, or 3)
1135+
// Header byte: the low two bits specify how many crumbs at the end of the buffer must be
1136+
// ignored by the decoder (0, 1, 2, or 3). Exception: if len == 1 then these low two bits
1137+
// encode the nucleotide directly.
11331138
auto trailing_crumbs = (4 - len % 4) % 4;
11341139
assert(trailing_crumbs >= 0 && trailing_crumbs <= 3);
1135-
*(outbyte++) = trailing_crumbs;
1140+
if (len > 1) {
1141+
*(outbyte++) = trailing_crumbs;
1142+
}
11361143

11371144
unsigned char byte = 0;
11381145
for (size_t i = 0; i < len; ++i) {
@@ -1155,13 +1162,15 @@ extern "C" int nucleotides_twobit(const char *seq, size_t len, void *out) {
11551162

11561163
if (trailing_crumbs) {
11571164
assert(len && (byte >> (2 * (4 - trailing_crumbs))) == 0);
1158-
byte <<= (2 * trailing_crumbs);
1159-
*outbyte = byte;
1165+
if (len > 1) {
1166+
byte <<= (2 * trailing_crumbs);
1167+
}
1168+
*(outbyte++) = byte;
11601169
} else {
11611170
assert(byte == 0);
11621171
}
11631172

1164-
return 0;
1173+
return (outbyte - (unsigned char *)out);
11651174
}
11661175

11671176
static void sqlfn_nucleotides_twobit(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
@@ -1179,7 +1188,7 @@ static void sqlfn_nucleotides_twobit(sqlite3_context *ctx, int argc, sqlite3_val
11791188

11801189
auto seqlen = sqlite3_value_bytes(argv[0]);
11811190
assert(seqlen >= 0);
1182-
if (!seqlen) {
1191+
if (seqlen <= 0) {
11831192
return sqlite3_result_value(ctx, argv[0]);
11841193
}
11851194

@@ -1190,23 +1199,22 @@ static void sqlfn_nucleotides_twobit(sqlite3_context *ctx, int argc, sqlite3_val
11901199
}
11911200

11921201
try {
1193-
size_t bufsz = (seqlen + 7) / 4;
1194-
std::unique_ptr<unsigned char[]> buf(new unsigned char[bufsz]);
1202+
std::unique_ptr<unsigned char[]> buf(new unsigned char[(seqlen + 7) / 4]);
11951203
int rc = nucleotides_twobit(seq, (size_t)seqlen, buf.get());
11961204
if (rc == -2) {
11971205
return sqlite3_result_error(ctx, "non-ASCII input to nucleotides_twobit()", -1);
1198-
} else if (rc != 0) {
1206+
} else if (rc < 0) {
11991207
return sqlite3_result_text(ctx, seq, seqlen, SQLITE_TRANSIENT);
12001208
}
1201-
return sqlite3_result_blob64(ctx, buf.get(), bufsz, SQLITE_TRANSIENT);
1209+
return sqlite3_result_blob64(ctx, buf.get(), rc, SQLITE_TRANSIENT);
12021210
} catch (std::bad_alloc &) {
12031211
return sqlite3_result_error_nomem(ctx);
12041212
}
12051213
}
12061214

12071215
extern "C" size_t twobit_length(const void *data, size_t sz) {
12081216
if (sz < 2) {
1209-
return 0;
1217+
return sz;
12101218
}
12111219
const unsigned char *bytes = (const unsigned char *)data;
12121220
unsigned char trailing_crumbs = bytes[0] & 0b11;
@@ -1295,8 +1303,20 @@ const char *twobit_rna4mers[] = {
12951303
"GGUU", "GGUC", "GGUA", "GGUG", "GGCU", "GGCC", "GGCA", "GGCG", "GGAU", "GGAC", "GGAA", "GGAG",
12961304
"GGGU", "GGGC", "GGGA", "GGGG"};
12971305

1298-
static void twobit_nucleotides(const void *data, size_t ofs, size_t len, bool rna, char *out) {
1306+
static void twobit_nucleotides(const void *data, size_t sz, size_t ofs, size_t len, bool rna,
1307+
char *out) {
12991308
const char **table = rna ? twobit_rna4mers : twobit_dna4mers;
1309+
// special cases for length-0 and 1 blobs
1310+
if (sz < 2) {
1311+
if (len == 0) {
1312+
out[0] = 0;
1313+
return;
1314+
}
1315+
assert(ofs == 0 && len == 1 && sz == 1);
1316+
out[0] = table[*(const unsigned char *)data & 0b11][3];
1317+
out[1] = 0;
1318+
return;
1319+
}
13001320
const unsigned char *pbyte = ((const unsigned char *)data) + 1 + ofs / 4;
13011321
size_t out_cursor = 0;
13021322
// decode first payload byte (maybe only part of it) crumb-by-crumb
@@ -1313,15 +1333,16 @@ static void twobit_nucleotides(const void *data, size_t ofs, size_t len, bool rn
13131333
out[out_cursor++] = table[*pbyte][crumb++];
13141334
}
13151335
assert(out_cursor == len);
1336+
assert(pbyte - (const unsigned char *)data <= sz);
13161337
out[out_cursor] = 0;
13171338
}
13181339

1319-
extern "C" void twobit_dna(const void *data, size_t ofs, size_t len, char *out) {
1320-
return twobit_nucleotides(data, ofs, len, false, out);
1340+
extern "C" void twobit_dna(const void *data, size_t sz, size_t ofs, size_t len, char *out) {
1341+
return twobit_nucleotides(data, sz, ofs, len, false, out);
13211342
}
13221343

1323-
extern "C" void twobit_rna(const void *data, size_t ofs, size_t len, char *out) {
1324-
return twobit_nucleotides(data, ofs, len, true, out);
1344+
extern "C" void twobit_rna(const void *data, size_t sz, size_t ofs, size_t len, char *out) {
1345+
return twobit_nucleotides(data, sz, ofs, len, true, out);
13251346
}
13261347

13271348
static void twobit_nucleotides(sqlite3_context *ctx, int argc, sqlite3_value **argv, bool rna) {
@@ -1385,7 +1406,7 @@ static void twobit_nucleotides(sqlite3_context *ctx, int argc, sqlite3_value **a
13851406
if (blob) {
13861407
// decode two-bit-encoded BLOB
13871408
unique_ptr<char[]> buf(new char[sub_len + 1]);
1388-
twobit_nucleotides(sqlite3_value_blob(argv[0]), sub_ofs, sub_len, rna, buf.get());
1409+
twobit_nucleotides(sqlite3_value_blob(argv[0]), sz, sub_ofs, sub_len, rna, buf.get());
13891410
sqlite3_result_text(ctx, buf.get(), sub_len, SQLITE_TRANSIENT);
13901411
} else if (sub_ofs == 0 && sub_len == len) {
13911412
// pass through complete text

test/test_twobit.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ def test_twobit():
77
con = genomicsqlite.connect(":memory:")
88

99
random.seed(42)
10-
for seqlen in (random.randint(1, 1250) for _ in range(5000)):
10+
for seqlen in (random.randint(2, 1250) for _ in range(5000)):
1111
rna = random.choice((False, True))
1212

1313
nucs = (
@@ -51,6 +51,12 @@ def test_twobit():
5151
)[0]
5252
assert decoded == control
5353

54+
for nuc in "AGCTagct":
55+
assert next(con.execute("SELECT length(nucleotides_twobit(?))", (nuc,)))[0] == 1
56+
assert (
57+
next(con.execute("SELECT twobit_dna(nucleotides_twobit(?))", (nuc,)))[0] == nuc.upper()
58+
)
59+
assert next(con.execute("SELECT nucleotides_twobit('')"))[0] == ""
5460
assert next(con.execute("SELECT nucleotides_twobit('acgt 1')"))[0] == "acgt 1"
5561
assert next(con.execute("SELECT twobit_dna('acgt 1')"))[0] == "acgt 1"
5662
assert next(con.execute("SELECT twobit_dna('acgt 1',1,6)"))[0] == "acgt 1"

0 commit comments

Comments
 (0)