add in-SQL parse_genomic_range() (#11)

mlin · web-flow · commit 1fa5d9831b46 · 2020-12-06T17:11:44.000-10:00
diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@ This [SQLite3 loadable extension](https://www.sqlite.org/loadext.html) adds feat
 
 * genomic range indexing for overlap queries & joins
 * streaming storage compression (also available [standalone](https://github.com/mlin/sqlite_zstd_vfs))
+* in-SQL utility functions, e.g. parsing "chr1:2,345-6,789"
 * pre-tuned settings for "big data"
 
 This October 2020 poster discusses the context and long-run ambitions:
diff --git a/docs/guide.md b/docs/guide.md
@@ -1026,14 +1026,25 @@ But this plan strongly depends on the contiguity assumption.
     /* genomicsqlite_open("compressed.db", ...); */
     ```
 
+#### Parse genomic range string
+
+The SQL function `parse_genomic_range(txt, part)` parses a string such as "chr1:2,345-6,789" and returns one of its three parts, selected by `part`: 1 for the chromosome name, 2 for the begin position, or 3 for the end position.
+
+=== "SQL"
+    ``` sql
+    SELECT parse_genomic_range('chr1:2,345-6,789', 1)  -- 'chr1'
+    SELECT parse_genomic_range('chr1:2,345-6,789', 2)  -- 2344
+    SELECT parse_genomic_range('chr1:2,345-6,789', 3)  -- 6789
+    ```
+
+Important: [the begin position returned is one less than the text number](https://genome.ucsc.edu/FAQ/FAQtracks#tracks1), while the end position is equal to the text number.
+
 #### Two-bit encoding for nucleotide sequences
 
 The extension supplies SQL functions to pack a DNA/RNA sequence TEXT value into a smaller BLOB value, using two bits per nucleotide. (Review [SQLite Datatypes](https://www.sqlite.org/datatype3.html) on the important differences between TEXT and BLOB values & columns.) Storing a large database of sequences using such BLOBs instead of TEXT can improve application I/O efficiency, with up to 4X more nucleotides cached in the same memory space. It is not, however, expected to greatly shrink the database file on disk, owing to the automatic storage compression.
 
 The encoding is case-insensitive and considers `T` and `U` equivalent.
 
-**↪ Two-bit encoding**
-
 === "SQL"
     ``` sql
     SELECT nucleotides_twobit('TCAG')
@@ -1043,8 +1054,6 @@ Given a TEXT value consisting of characters from the set `ACGTUacgtu`, compute a
 
 Typically used to populate a BLOB column `C` with `INSERT INTO some_table(...,C) VALUES(...,nucleotides_twobit(?))`. This works even if some of the sequences contain `N`s or other characters, in which case those sequences are stored as the original TEXT values. Make sure the column has schema type `BLOB` to avoid spurious coercions, and by convention, the column should be named *_twobit.
 
-**↪ Two-bit decoding**
-
 === "SQL"
     ``` sql
     SELECT twobit_dna(nucleotides_twobit('TCAG'))
@@ -1076,8 +1085,6 @@ The Genomics Extension bundles the SQLite developers' [JSON1 extension](https://
 
 #### Genomics Extension version
 
-**↪ GenomicSQLite Version**
-
 === "SQL"
     ``` sql
     SELECT genomicsqlite_version()
diff --git a/docs/index.md b/docs/index.md
@@ -8,6 +8,7 @@ This [SQLite3 loadable extension](https://www.sqlite.org/loadext.html) supports
 
 * genomic range indexing for overlap queries & joins
 * streaming storage compression using multithreaded [Zstandard](https://facebook.github.io/zstd/)
+* in-SQL utility functions, e.g. parsing "chr1:2,345-6,789"
 * pre-tuned settings for "big data"
 
 This October 2020 poster discusses the context and long-run ambitions:
diff --git a/src/genomicsqlite.cc b/src/genomicsqlite.cc
@@ -1432,6 +1432,79 @@ static void sqlfn_twobit_rna(sqlite3_context *ctx, int argc, sqlite3_value **arg
     twobit_nucleotides(ctx, argc, argv, true);
 }
 
+/**************************************************************************************************
+ * parse_genomic_range()
+ **************************************************************************************************/
+
+// Parse the digits in txt[ofs1, ofs2) as a decimal integer, skipping ',' separators. Throws
+// std::runtime_error on any other character or if the value would exceed 9223372036854775799.
+static uint64_t parse_genomic_range_pos(const string &txt, size_t ofs1, size_t ofs2) {
+    assert(ofs1 < ofs2 && ofs2 <= txt.size());
+    const uint64_t max_before_digit = 922337203685477579ULL; // (2**63-10)//10
+    uint64_t value = 0;
+    for (size_t pos = ofs1; pos < ofs2; ++pos) {
+        const char ch = txt[pos];
+        if (ch == ',') {
+            continue;
+        }
+        if (ch < '0' || ch > '9') {
+            throw std::runtime_error("parse_genomic_range() can't read `" + txt + "`");
+        }
+        if (value > max_before_digit) {
+            throw std::runtime_error("parse_genomic_range() position overflow in `" + txt + "`");
+        }
+        value = value * 10 + static_cast<uint64_t>(ch - '0');
+    }
+    return value;
+}
+
+// Split "chrom:begin-end" (one-based positions, ',' allowed in the numbers) into the tuple
+// (chrom, begin-1, end). Throws std::runtime_error if txt is malformed or positions are invalid.
+static std::tuple<string, uint64_t, uint64_t> parse_genomic_range(const string &txt) {
+    auto p1 = txt.find(':'), p2 = txt.find('-');
+    if (p1 == string::npos || p2 == string::npos || p1 < 1 || p2 < p1 + 2 || p2 >= txt.size() - 1) {
+        throw std::runtime_error("parse_genomic_range(): can't read `" + txt + "`");
+    }
+    for (size_t i = 0; i < p1; ++i) { // unsigned char cast: a negative char into isspace is UB
+        if (std::isspace(static_cast<unsigned char>(txt[i]))) {
+            throw std::runtime_error(
+                "parse_genomic_range(): invalid sequence/chromosome name in `" + txt + "`");
+        }
+    }
+    auto begin_pos = parse_genomic_range_pos(txt, p1 + 1, p2),
+         end_pos = parse_genomic_range_pos(txt, p2 + 1, txt.size());
+    if (begin_pos < 1 || begin_pos > end_pos) {
+        throw std::runtime_error("parse_genomic_range(): invalid one-based positions in `" + txt +
+                                 "`");
+    }
+    return std::make_tuple(txt.substr(0, p1), begin_pos - 1, end_pos);
+}
+
+// SQL function parse_genomic_range(txt, part): yields the chromosome name (part 1), begin
+// position (part 2) or end position (part 3) of txt. Exceptions become sqlite3_result_error().
+static void sqlfn_parse_genomic_range(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
+    string txt;
+    sqlite3_int64 which_part;
+    ARG_TEXT(txt, 0);
+    ARG(which_part, 1, SQLITE_INTEGER, int64);
+    try {
+        const auto parts = parse_genomic_range(txt);
+        if (which_part == 1) {
+            const string &name = get<0>(parts);
+            sqlite3_result_text(ctx, name.c_str(), name.size(), SQLITE_TRANSIENT);
+        } else if (which_part == 2) {
+            sqlite3_result_int64(ctx, get<1>(parts));
+        } else if (which_part == 3) {
+            sqlite3_result_int64(ctx, get<2>(parts));
+        } else {
+            throw std::runtime_error(
+                "parse_genomic_range(): expected part 1, 2, or 3 (parameter 2)");
+        }
+    } catch (const std::exception &err) {
+        sqlite3_result_error(ctx, err.what(), -1);
+    }
+}
+
 /**************************************************************************************************
  * SQLite loadable extension initialization
  **************************************************************************************************/
@@ -1479,7 +1552,8 @@ static int register_genomicsqlite_functions(sqlite3 *db, const char **pzErrMsg,
                  {FPNM(twobit_dna), 3, SQLITE_DETERMINISTIC},
                  {FPNM(twobit_rna), 1, SQLITE_DETERMINISTIC},
                  {FPNM(twobit_rna), 2, SQLITE_DETERMINISTIC},
-                 {FPNM(twobit_rna), 3, SQLITE_DETERMINISTIC}};
+                 {FPNM(twobit_rna), 3, SQLITE_DETERMINISTIC},
+                 {FPNM(parse_genomic_range), 2, SQLITE_DETERMINISTIC}};
 
     int rc;
     for (int i = 0; i < sizeof(fntab) / sizeof(fntab[0]); ++i) {
diff --git a/test/test_parse_genomic_range.py b/test/test_parse_genomic_range.py
@@ -0,0 +1,37 @@
+import sqlite3
+import pytest
+import genomicsqlite
+
+
+def test_parse_genomic_range():
+    con = genomicsqlite.connect(":memory:")
+    query = "SELECT parse_genomic_range(?,?)"
+    for (txt, chrom, begin_pos, end_pos) in [
+        ("chr1:2,345-06,789", "chr1", 2344, 6789),
+        ("π:1-9,223,372,036,854,775,799", "π", 0, 9223372036854775799),
+    ]:
+        assert next(con.execute(query, (txt, 1)))[0] == chrom
+        assert next(con.execute(query, (txt, 2)))[0] == begin_pos
+        assert next(con.execute(query, (txt, 3)))[0] == end_pos
+
+    for txt in [
+        "",
+        ":",
+        "-",
+        ":-",
+        ":1-2",
+        "chr1",
+        "chr1:0-1",
+        "chr1:1,234",
+        "chr1:1,234-",
+        "chr1:1,234-5,67",
+        "chr1 :2,345-06,789",
+        "chr1:2,345-06,789\t",
+        "chr1:2345-deadbeef",
+        "chr1:1-9,223,372,036,854,775,800",
+    ]:
+        with pytest.raises(sqlite3.OperationalError):
+            con.execute(query, (txt, 1))
+
+    with pytest.raises(sqlite3.OperationalError):
+        con.execute(query, ("chr1:2-3", 0))