Skip to content

Commit c717832

Browse files
unamedkr and claude authored
perf: sort vocab before merge parsing + rebuild WASM with ASYNCIFY (#22)
Two changes: 1. Move sorted_indices build before GGUF BPE merge parsing in both tq_tokenizer.c and quant.h. str_lookup() during merge parsing was falling back to O(n) linear scan because sorted_indices wasn't built yet. For Qwen3 (248K vocab × 50K merges × 3 lookups) this was ~10 s of init time. Now uses binary search: ~100 ms. 2. Rebuild quant.js (72K) and quant.wasm (256K) with -sASYNCIFY. The previous binaries were compiled before the ASYNCIFY flags were added to build.sh, so wasm_generate_async() didn't exist and the JS fallback ran the synchronous path (blocking the browser event loop, all tokens appearing at once). The new binary contains asyncify runtime + emscripten_sleep, enabling real-time per-token streaming in the browser demo. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2396027 commit c717832

4 files changed

Lines changed: 23 additions & 23 deletions

File tree

quant.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8033,6 +8033,16 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
80338033
}
80348034
}
80358035

8036+
/* Build sorted indices BEFORE merge parsing so str_lookup() can use
8037+
* binary search instead of O(n) linear scan. For 248K vocab with
8038+
* ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
8039+
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
8040+
if (tok->sorted_indices) {
8041+
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
8042+
g_vocab_for_sort = tok->vocab;
8043+
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
8044+
}
8045+
80368046
/* Load and parse merges if available.
80378047
* GGUF stores merges as a string array of "tok_a tok_b" pairs.
80388048
* We need to look up token IDs and build (id_a, id_b, id_merged) triples
@@ -8072,7 +8082,7 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
80728082
memcpy(merged + la, str_b, (size_t)lb);
80738083
merged[la + lb] = '\0';
80748084

8075-
/* Look up token IDs via linear scan (sorted_indices not built yet) */
8085+
/* Look up token IDs via binary search (sorted_indices built above) */
80768086
int id_a = str_lookup(tok, str_a);
80778087
int id_b = str_lookup(tok, str_b);
80788088
int id_merged = str_lookup(tok, merged);
@@ -8092,16 +8102,6 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
80928102
}
80938103
}
80948104

8095-
/* Build sorted indices for encoding (binary search by string).
8096-
* Use qsort for O(n log n) instead of insertion sort O(n²) — critical
8097-
* for 248K vocab where insertion sort would take minutes. */
8098-
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
8099-
if (tok->sorted_indices) {
8100-
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
8101-
g_vocab_for_sort = tok->vocab;
8102-
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
8103-
}
8104-
81058105
fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
81068106
tok->vocab_size, tok->max_token_len);
81078107
return tok;

src/engine/tq_tokenizer.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,16 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
881881
}
882882
}
883883

884+
/* Build sorted indices BEFORE merge parsing so str_lookup() can use
885+
* binary search instead of O(n) linear scan. For 248K vocab with
886+
* ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
887+
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
888+
if (tok->sorted_indices) {
889+
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
890+
g_vocab_for_sort = tok->vocab;
891+
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
892+
}
893+
884894
/* Load and parse merges if available.
885895
* GGUF stores merges as a string array of "tok_a tok_b" pairs.
886896
* We need to look up token IDs and build (id_a, id_b, id_merged) triples
@@ -920,7 +930,7 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
920930
memcpy(merged + la, str_b, (size_t)lb);
921931
merged[la + lb] = '\0';
922932

923-
/* Look up token IDs via linear scan (sorted_indices not built yet) */
933+
/* Look up token IDs via binary search (sorted_indices built above) */
924934
int id_a = str_lookup(tok, str_a);
925935
int id_b = str_lookup(tok, str_b);
926936
int id_merged = str_lookup(tok, merged);
@@ -940,16 +950,6 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
940950
}
941951
}
942952

943-
/* Build sorted indices for encoding (binary search by string).
944-
* Use qsort for O(n log n) instead of insertion sort O(n²) — critical
945-
* for 248K vocab where insertion sort would take minutes. */
946-
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
947-
if (tok->sorted_indices) {
948-
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
949-
g_vocab_for_sort = tok->vocab;
950-
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
951-
}
952-
953953
fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
954954
tok->vocab_size, tok->max_token_len);
955955
return tok;

wasm/quant.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wasm/quant.wasm

45.6 KB
Binary file not shown.

0 commit comments

Comments (0)