Skip to content

Commit 74da991

Browse files
authored
support .dbi helper files to speed up web access (#19)
1 parent 800a996 commit 74da991

8 files changed

Lines changed: 91 additions & 25 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ endif()
1818
FetchContent_Declare(
1919
sqlite_zstd_vfs
2020
GIT_REPOSITORY https://github.com/mlin/sqlite_zstd_vfs.git
21-
GIT_TAG e81ef25
21+
GIT_TAG eadb758
2222
)
2323
FetchContent_MakeAvailable(sqlite_zstd_vfs)
2424
FetchContent_MakeAvailable(sqlitecpp)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ This October 2020 poster discusses the context and long-run ambitions:
1616

1717
Our **[Colab notebook](https://colab.research.google.com/drive/1OlHPOcRQBhDmEnS1wtOdtUGDkcD7LtKx?usp=sharing)** demonstrates key features with Python, one of several language bindings.
1818

19-
**USE AT YOUR OWN RISK:** The extension makes fundamental changes to the database storage layer. While designed to preserve ACID transaction safety, it's young and unlikely to have zero bugs. This project is not associated with the SQLite developers.
19+
**USE AT YOUR OWN RISK:** This project is not associated with the SQLite developers. The database storage extensions are designed to preserve ACID transaction safety, but they're young and unlikely to be totally bug-free.
2020

2121
## [Installation & Programming Guide](https://mlin.github.io/GenomicSQLite/)
2222

bindings/python/genomicsqlite/__init__.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -202,13 +202,20 @@ def get_reference_sequences_by_name(
202202
203203
sqlite3_ARG: passed through to sqlite3 (see `sqlite3 -help`)
204204
205+
--
206+
207+
Other tools:
208+
209+
genomicsqlite DB_FILENAME --compact [options|--help]
210+
[Re]compress and defragment an existing SQLite3 or GenomicSQLite database. See `genomicsqlite --compact --help`
211+
212+
genomicsqlite DB_FILENAME --dbi [options|--help]
213+
Generate a .dbi helper file to optimize web access for a given database. See `genomicsqlite --dbi --help`
205214
206-
Usage: genomicsqlite DB_FILENAME --compact [options|--help]
207-
[Re]compress and defragment an existing SQLite3 or GenomicSQLite database. See --compact --help for options
208215
"""
209216

210217

211-
def _cli():
218+
def _cli(argv=None):
212219
"""
213220
Command-line entry point wrapping the `sqlite3` interactive CLI to open a GenomicSQLite
214221
compressed database file.
@@ -217,17 +224,22 @@ def _cli():
217224
language bindings.
218225
"""
219226

220-
if "--compact" in sys.argv or "-compact" in sys.argv:
221-
_compact(sys.argv[1], sys.argv[2:])
227+
if argv is None:
228+
argv = sys.argv
229+
if "--compact" in argv or "-compact" in argv:
230+
_compact(argv[1], argv[2:])
231+
return
232+
if "--dbi" in argv or "-dbi" in argv:
233+
_dbi(argv[1], argv[2:])
222234
return
223235

224236
if (
225-
len(sys.argv) < 2
237+
len(argv) < 2
226238
or not (
227-
next((True for pfx in ("http:", "https:") if sys.argv[1].startswith(pfx)), False)
228-
or os.path.isfile(sys.argv[1])
239+
next((True for pfx in ("http:", "https:") if argv[1].startswith(pfx)), False)
240+
or os.path.isfile(argv[1])
229241
)
230-
or next((True for a in ("-h", "-help", "--help") if a in sys.argv), False)
242+
or next((True for a in ("-h", "-help", "--help") if a in argv), False)
231243
):
232244
print(
233245
"Usage: genomicsqlite DB_FILENAME [--readonly] [sqlite3_ARG ...]\n",
@@ -250,13 +262,13 @@ def _cli():
250262

251263
cfg = {}
252264
try:
253-
sys.argv[sys.argv.index("--readonly")] = "-readonly"
265+
argv[argv.index("--readonly")] = "-readonly"
254266
except ValueError:
255267
pass
256-
if "-readonly" in sys.argv:
268+
if "-readonly" in argv:
257269
cfg["mode"] = "ro"
258270
cfg = json.dumps(cfg)
259-
uri = _execute1(_MEMCONN, "SELECT genomicsqlite_uri(?,?)", (sys.argv[1], cfg))
271+
uri = _execute1(_MEMCONN, "SELECT genomicsqlite_uri(?,?)", (argv[1], cfg))
260272
tuning_sql = _execute1(_MEMCONN, "SELECT genomicsqlite_tuning_sql(?)", (cfg,))
261273

262274
cmd = [
@@ -274,7 +286,7 @@ def _cli():
274286
'.prompt "GenomicSQLite> "',
275287
]
276288
if sys.stdout.isatty() and not next(
277-
(arg for arg in sys.argv[2:] if not arg.startswith("-")), False
289+
(arg for arg in argv[2:] if not arg.startswith("-")), False
278290
):
279291
# interactive mode:
280292
cmd += [
@@ -284,7 +296,7 @@ def _cli():
284296
".headers on",
285297
]
286298
cmd.append(":memory:") # placeholder so remaining positional args are recognized as such
287-
cmd += sys.argv[2:]
299+
cmd += argv[2:]
288300

289301
if "DEBUG" in os.environ:
290302
print(
@@ -422,5 +434,12 @@ def _compact(dbfilename, argv):
422434
print("OK", file=sys.stderr)
423435

424436

437+
def _dbi(dbfilename, argv):
438+
from .sqlite_web_dbi import main as dbi_main # pylint: disable=E0401,E0611
439+
440+
argv = [elt for elt in argv if elt not in ("-dbi", "--dbi")]
441+
dbi_main(["genomicsqlite DBFILE --dbi", dbfilename] + argv)
442+
443+
425444
if __name__ == "__main__":
426-
_cli()
445+
_cli(sys.argv)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import sys
2+
from . import _cli
3+
4+
if __name__ == "__main__":
5+
_cli(sys.argv)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../build/_deps/sqlite_web_vfs-src/sqlite_web_dbi.py

docs/guide_db.md

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,24 @@ Due to decompression overhead, the compaction procedure may be impractically slo
200200

201201
The **GenomicSQLite Open** routine and the `genomicsqlite` shell also accept http: and https: URLs instead of local filenames, creating a connection to read the compressed file over the web directly. The database connection must be opened read-only in the appropriate manner for your language bindings (such as the flag `SQLITE_OPEN_READONLY`). The URL server must support [HTTP GET range](https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests) requests, and the content must not change for the lifetime of the connection.
202202

203-
Under the hood, the extension uses [libcurl](https://curl.se/libcurl/) to send web requests for necessary portions of the database file as queries proceed, with adaptive batching & prefetching to balance the number and size of these requests. This works well for point lookups and queries that scan largely-contiguous slices of tables and indexes (a modest number thereof). It's less suitable for big multi-way joins and other aggressively random access patterns; in such cases, it'd be better to download the database file upfront to open locally.
203+
Under the hood, the extension uses [libcurl](https://curl.se/libcurl/) to send web requests for necessary portions of the database file as queries proceed, with adaptive batching & prefetching to balance the number and size of these requests. This works well for point lookups and queries that scan largely-contiguous slices of tables and indexes (and a modest number thereof). It's less suitable for big multi-way joins and other aggressively random access patterns; in such cases, it'd be better to download the database file upfront to open locally.
204204

205-
* The above-described `genomicsqlite DB_FILENAME --compact` tool can optimize a file's suitability for web access.
206205
* When reading large databases over the web, budget an additional ~600MiB of memory for HTTP prefetch buffers.
207-
* To disable TLS certificate and hostname verification, set web_insecure = true in the GenomicSQLite configuration, or SQLITE_WEB_INSECURE=1 in the environment.
208206
* The HTTP driver writes log messages to standard error when requests fail or have to be retried. This logging can be disabled by setting configuration web_log = 0 or environment SQLITE_WEB_LOG=0, or increased up to 5 to log every request and other details.
207+
* To disable TLS certificate and hostname verification, set web_insecure = true in the GenomicSQLite configuration, or SQLITE_WEB_INSECURE=1 in the environment.
208+
* The above-described `genomicsqlite DB_FILENAME --compact` optimizes a database for web access by making the request pattern more contiguous.
209+
210+
### Web access optimization with .dbi helper files
211+
212+
*Experimental feature*
213+
214+
Optionally, web access can be further optimized by a small .dbi helper file served alongside the main database file. The client automatically probes for the helper file by appending `.dbi` to the database URL (unless there's a query string). If the helper file isn't usable for any reason, the database falls back to direct access. Set web_log to 3 or higher to see which mode is used.
215+
216+
Use `genomicsqlite DB_FILENAME --dbi` to generate the .dbi helper for an immutable database file, then publish the two files alongside each other. The .dbi must be regenerated if the database subsequently changes.
217+
218+
To override the automatic probe, set configuration web_dbi_url to a different URL for the .dbi file, or to a local `file:/path/to.dbi` downloaded beforehand. Use the latter feature to save multiple connections from each having to fetch the .dbi separately. Lastly, set web_nodbi to true or environment SQLITE_WEB_NODBI=1 to disable dbi mode entirely.
219+
220+
The .dbi helper is optional, but often beneficial for big databases accessed with high-latency requests. It collects bits of the main file that are key for navigating it, but that are typically scattered throughout (even after compaction). Prefetching them in the compact .dbi saves the reader from having to pluck them from all over the main file.
209221

210222
## Advice for big data
211223

src/genomicsqlite.cc

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ std::string GenomicSQLiteDefaultConfigJSON() {
6969
"inner_page_KiB": 16,
7070
"outer_page_KiB": 32,
7171
"web_log": 2,
72-
"web_insecure": false
72+
"web_insecure": false,
73+
"web_dbi_url": "",
74+
"web_nodbi": false
7375
})";
7476
}
7577

@@ -172,8 +174,18 @@ string GenomicSQLiteURI(const string &dbfile, const string &config_json = "") {
172174
uri << "file:" << (web ? "/__web__" : SQLiteNested::urlencode(dbfile, true)) << "?vfs=zstd";
173175
if (web) {
174176
uri << "&mode=ro&immutable=1&web_url=" << SQLiteNested::urlencode(dbfile)
175-
<< "&web_log=" << cfg.GetInt("$.web_log")
176-
<< "&web_insecure=" << (cfg.GetBool("$.web_insecure") ? 1 : 0);
177+
<< "&web_log=" << cfg.GetInt("$.web_log");
178+
if (cfg.GetBool("$.web_insecure")) {
179+
uri << "&web_insecure=1";
180+
}
181+
if (cfg.GetBool("$.web_nodbi")) {
182+
uri << "&web_nodbi=1";
183+
} else {
184+
auto web_dbi_url = cfg.GetString("$.web_dbi_url");
185+
if (!web_dbi_url.empty()) {
186+
uri << "&web_dbi_url=" << SQLiteNested::urlencode(web_dbi_url);
187+
}
188+
}
177189
}
178190
int threads = cfg.GetInt("$.threads");
179191
uri << "&outer_cache_size=-65536"

test/genomicsqlite_big_tests.wdl

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,21 @@ workflow genomicsqlite_big_tests {
2323
sam_into_sqlite = build.sam_into_sqlite
2424
}
2525
26-
call test_sam_web {
26+
call test_sam_web as test_sam_web_nodbi {
2727
input:
2828
reads_db = test_sam.reads_db,
2929
libgenomicsqlite_so = select_first([libgenomicsqlite_so, build.libgenomicsqlite_so]),
3030
genomicsqlite_py = build.genomicsqlite_py
3131
}
3232
33+
call test_sam_web as test_sam_web_dbi {
34+
input:
35+
reads_db = test_sam.reads_db,
36+
libgenomicsqlite_so = select_first([libgenomicsqlite_so, build.libgenomicsqlite_so]),
37+
genomicsqlite_py = build.genomicsqlite_py,
38+
sqlite_web_dbi_py = build.sqlite_web_dbi_py
39+
}
40+
3341
call test_vcf {
3442
input:
3543
variants = variants,
@@ -65,6 +73,7 @@ task build {
6573
output {
6674
File libgenomicsqlite_so = "GenomicSQLite/build/libgenomicsqlite.so"
6775
File genomicsqlite_py = "GenomicSQLite/genomicsqlite.py"
76+
File sqlite_web_dbi_py = "GenomicSQLite/build/_deps/sqlite_web_vfs-src/sqlite_web_dbi.py"
6877
File sam_into_sqlite = "GenomicSQLite/build/loaders/sam_into_sqlite"
6978
File vcf_into_sqlite = "GenomicSQLite/build/loaders/vcf_into_sqlite"
7079
}
@@ -172,6 +181,7 @@ task test_sam_web {
172181
File reads_db
173182
File genomicsqlite_py
174183
File libgenomicsqlite_so
184+
File? sqlite_web_dbi_py
175185
}
176186

177187
command <<<
@@ -183,6 +193,14 @@ task test_sam_web {
183193
cp ~{libgenomicsqlite_so} /usr/local/lib/libgenomicsqlite.so
184194
ldconfig
185195
196+
READS_DB_DIR="$(dirname '~{reads_db}')"
197+
if [ -n '~{sqlite_web_dbi_py}' ]; then
198+
# generate .dbi to serve alongside database
199+
cp '~{sqlite_web_dbi_py}' /usr/local/bin/sqlite_web_dbi.py
200+
chmod +x /usr/local/bin/sqlite_web_dbi.py
201+
sqlite_web_dbi.py '~{reads_db}'
202+
fi
203+
186204
# make self-signed cert
187205
openssl req -new -newkey rsa:4096 -days 365 -nodes -x509 \
188206
-subj "/C=US/ST=Denial/L=Springfield/O=Dis/CN=www.example.com" \
@@ -191,7 +209,6 @@ task test_sam_web {
191209
# write nginx.config as heredoc
192210
# references: https://docs.nginx.com/nginx/admin-guide/web-server/serving-static-content/
193211
# http://nginx.org/en/docs/http/configuring_https_servers.html
194-
READS_DB_DIR="$(dirname '~{reads_db}')"
195212
cat << EOF > nginx.config
196213
worker_processes 8;
197214
error_log stderr warn;

0 commit comments

Comments
 (0)