Skip to content

Commit 929f468

Browse files
committed
bundle UINT collating sequence
1 parent adad1df commit 929f468

6 files changed

Lines changed: 133 additions & 8 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ repos:
55
name: clang-format
66
language: system
77
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
8-
exclude: json1\.c
8+
exclude: (json1|uint)\.c
99
verbose: true
1010
entry: clang-format
1111
args: [-i,'-style={IndentWidth: 4, ColumnLimit: 100, BreakStringLiterals: false, ReflowComments: false}']

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ execute_process(COMMAND git describe --tags --always --dirty
3636
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGIT_REVISION=\"\\\"${GIT_REVISION}\\\"\"")
3737

3838
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
39-
add_library(genomicsqlite SHARED src/genomicsqlite.cc include/genomicsqlite.h src/json1.c src/hardcoded_refseq.hpp)
39+
add_library(genomicsqlite SHARED src/genomicsqlite.cc include/genomicsqlite.h src/json1.c src/uint.c src/hardcoded_refseq.hpp)
4040
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
4141
find_library(LIBZSTD_A NAMES libzstd.a REQUIRED)
4242
target_link_libraries(genomicsqlite PRIVATE SQLiteCpp ${LIBZSTD_A})

docs/guide.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ Range positions are considered [**zero-based & half-open**](http://www.cs.utexas
184184
The extension provides routines to populate a small `_gri_refseq` table describing the genomic reference sequences, which other tables can reference by integer ID ("rid") instead of storing a column with textual sequence names like 'chr10'. This convention is not required, as the GRI can index either chromosome name or rid columns, but reasons to observe it include:
185185

186186
* Integers are more compact and faster to look up.
187-
* Results sort properly with `ORDER BY rid` instead of considering e.g. `'chr10'` < `'chr2'` lexicographically.
187+
* Results sort properly with `ORDER BY rid` instead of considering e.g. `'chr10'` < `'chr2'` lexicographically. (See also the UINT collating sequence, below)
188188
* A table with chromosome names can be reconstructed easily by joining with `_gri_refseq`.
189189

190190
### Create GRI
@@ -1102,9 +1102,11 @@ Given a two-bit-encoded BLOB value, return the length of the *decoded* sequence
11021102

11031103
Given a TEXT value, return its byte length. Given NULL, return NULL. Any other input is an error.
11041104

1105-
#### JSON functions
1105+
#### JSON1 and UINT extensions
11061106

1107-
The Genomics Extension bundles the SQLite developers' [JSON1 extension](https://www.sqlite.org/json1.html) and enables it automatically. By convention, JSON object columns should be named *_json and JSON array columns should be named *_jsarray. The JSON1 functions can be used with [generated columns](https://sqlite.org/gencol.html) to effectively allow indexing of JSON-embedded fields.
1107+
The Genomics Extension bundles the SQLite developers' [JSON1 extension](https://www.sqlite.org/json1.html) and enables it automatically. By convention, JSON object columns should be named \*_json and JSON array columns should be named \*_jsarray. The JSON1 functions can be used with [generated columns](https://sqlite.org/gencol.html) to effectively allow indexing of JSON-embedded fields.
1108+
1109+
The [UINT collating sequence](https://www.sqlite.org/uintcseq.html) is also bundled. This can be useful to make e.g. `ORDER BY chromosome COLLATE UINT` put 'chr2' before 'chr10'.
11081110

11091111
#### Genomics Extension version
11101112

src/genomicsqlite.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1594,6 +1594,7 @@ static void sqlfn_parse_genomic_range_end(sqlite3_context *ctx, int argc, sqlite
15941594
**************************************************************************************************/
15951595

15961596
extern "C" int genomicsqliteJson1Register(sqlite3 *db);
1597+
extern "C" int genomicsqlite_uint_init(sqlite3 *db);
15971598

15981599
static int register_genomicsqlite_functions(sqlite3 *db, const char **pzErrMsg,
15991600
const sqlite3_api_routines *pApi) {
@@ -1671,16 +1672,22 @@ static int register_genomicsqlite_functions(sqlite3 *db, const char **pzErrMsg,
16711672
}
16721673
return rc;
16731674
}
1674-
// genomicsqliteJson1Register() may return SQLITE_BUSY if JSON1 (possibly another version
1675-
// thereof) is already loaded, and the extension is being loaded by SELECT load_extension().
1676-
// That is tolerable.
1675+
// other extensions may return SQLITE_BUSY if another version is already loaded; that is
1676+
// tolerable.
16771677
rc = genomicsqliteJson1Register(db);
16781678
if (rc != SQLITE_OK && rc != SQLITE_BUSY) {
16791679
if (pzErrMsg) {
16801680
*pzErrMsg = sqlite3_mprintf("Genomics Extension failed to register JSON1");
16811681
}
16821682
return rc;
16831683
}
1684+
rc = genomicsqlite_uint_init(db);
1685+
if (rc != SQLITE_OK && rc != SQLITE_BUSY) {
1686+
if (pzErrMsg) {
1687+
*pzErrMsg = sqlite3_mprintf("Genomics Extension failed to register UINT collation");
1688+
}
1689+
return rc;
1690+
}
16841691
return SQLITE_OK;
16851692
}
16861693

src/uint.c

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
** GenomicSQLite bundles the UINT extension: https://www.sqlite.org/uintcseq.html
3+
**
4+
** This is the (public domain) upstream source file from:
5+
** https://sqlite.org/src/file/ext/misc/uint.c
6+
** with minor edits deferring to genomicsqlite.cc:sqlite3_genomicsqlite_init for extension init
7+
*/
8+
9+
/*
10+
** 2020-04-14
11+
**
12+
** The author disclaims copyright to this source code. In place of
13+
** a legal notice, here is a blessing:
14+
**
15+
** May you do good and not evil.
16+
** May you find forgiveness for yourself and forgive others.
17+
** May you share freely, never taking more than you give.
18+
**
19+
******************************************************************************
20+
**
21+
** This SQLite extension implements the UINT collating sequence.
22+
**
23+
** UINT works like BINARY for text, except that embedded strings
24+
** of digits compare in numeric order.
25+
**
26+
** * Leading zeros are handled properly, in the sense that
27+
** they do not mess up the magnitude comparison of embedded
28+
** strings of digits. "x00123y" is equal to "x123y".
29+
**
30+
** * Only unsigned integers are recognized. Plus and minus
31+
** signs are ignored. Decimal points and exponential notation
32+
** are ignored.
33+
**
34+
** * Embedded integers can be of arbitrary length. Comparison
35+
** is *not* limited to integers that can be expressed as a
36+
** 64-bit machine integer.
37+
*/
38+
#include "sqlite3ext.h"
39+
/* instead of SQLITE_EXTENSION_INIT1, link to the one in genomicsqlite.cc: */
40+
extern const sqlite3_api_routines *sqlite3_api;
41+
#include <assert.h>
42+
#include <string.h>
43+
#include <ctype.h>
44+
45+
/*
46+
** Compare text in lexicographic order, except strings of digits
47+
** compare in numeric order.
**
** Parameters:
**   notUsed      - user-data pointer from the collation callback; ignored.
**   nKey1, pKey1 - byte length and bytes of the first key (A).
**   nKey2, pKey2 - byte length and bytes of the second key (B).
**
** Returns a negative value if A sorts before B, zero if they compare
** equal, and a positive value if A sorts after B.
**
** Bytes are compared as unsigned char. Runs of digits are compared
** without their leading zeros: the longer remaining run is the larger
** number, and runs of equal length are compared with memcmp().
48+
*/
49+
static int uintCollFunc(
50+
void *notUsed,
51+
int nKey1, const void *pKey1,
52+
int nKey2, const void *pKey2
53+
){
54+
const unsigned char *zA = (const unsigned char*)pKey1;
55+
const unsigned char *zB = (const unsigned char*)pKey2;
56+
int i=0, j=0, x;   /* i indexes zA, j indexes zB, x holds the current comparison result */
57+
(void)notUsed;
58+
while( i<nKey1 && j<nKey2 ){
59+
x = zA[i] - zB[j];   /* plain byte difference at the current positions */
60+
if( isdigit(zA[i]) ){
61+
int k;
62+
if( !isdigit(zB[j]) ) return x;   /* digit vs. non-digit: fall back to byte order */
63+
while( i<nKey1 && zA[i]=='0' ){ i++; }   /* skip leading zeros in A */
64+
while( j<nKey2 && zB[j]=='0' ){ j++; }   /* skip leading zeros in B */
65+
k = 0;
66+
while( i+k<nKey1 && isdigit(zA[i+k])   /* k = number of positions where both keys still have a digit */
67+
&& j+k<nKey2 && isdigit(zB[j+k]) ){
68+
k++;
69+
}
70+
if( i+k<nKey1 && isdigit(zA[i+k]) ){   /* A has more significant digits, so A is larger */
71+
return +1;
72+
}else if( j+k<nKey2 && isdigit(zB[j+k]) ){   /* B has more significant digits, so B is larger */
73+
return -1;
74+
}else{
75+
x = memcmp(zA+i, zB+j, k);   /* same digit count: byte order equals numeric order */
76+
if( x ) return x;
77+
i += k;   /* numbers are equal; continue after both digit runs */
78+
j += k;
79+
}
80+
}else if( x ){
81+
return x;   /* differing non-digit bytes (or non-digit in A vs. anything in B) */
82+
}else{
83+
i++;   /* identical non-digit bytes; advance both keys */
84+
j++;
85+
}
86+
}
87+
return (nKey1 - i) - (nKey2 - j);   /* at least one key is exhausted: compare the remaining lengths */
88+
}
89+
90+
/*
** Register the "uint" collating sequence on the given database connection,
** using uintCollFunc as the comparison callback for SQLITE_UTF8 text and
** no user-data pointer. Returns the result code of
** sqlite3_create_collation().
*/
#ifdef _WIN32
91+
__declspec(dllexport)
92+
#endif
93+
int genomicsqlite_uint_init(
94+
sqlite3 *db
95+
){
96+
return sqlite3_create_collation(db, "uint", SQLITE_UTF8, 0, uintCollFunc);
97+
}

test/test_ext.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import genomicsqlite
2+
3+
4+
def test_bundled_extensions():
# Exercises json_extract() and COLLATE UINT together on an in-memory connection.
5+
con = genomicsqlite.connect(":memory:")
6+
7+
con.executescript(
8+
"""
9+
CREATE TABLE test(json TEXT);
10+
INSERT INTO test(json) VALUES('{"chrom":"chr10"}');
11+
INSERT INTO test(json) VALUES('{"chrom":"chr2"}');
12+
INSERT INTO test(json) VALUES('{"chrom":"chrMT"}');
13+
"""
14+
)
15+
16+
# Rows are inserted as chr10, chr2, chrMT; the UINT collation should order chr2 before chr10.
results = con.execute(
17+
"SELECT json_extract(json,'$.chrom') AS chrom FROM test ORDER BY chrom COLLATE UINT"
18+
)
19+
assert list(results) == [("chr2",), ("chr10",), ("chrMT",)]

0 commit comments

Comments
 (0)