Skip to content

Commit 2396027

Browse files
unamedkr and claude
authored
Fix GGUF BPE merge parsing — fixes Qwen3/Llama3 garbage output (#21)
tq_load_tokenizer_from_gguf() allocated the merge_pairs buffer and set n_merges, but never actually parsed the GGUF merge strings into (id_a, id_b, id_merged) triples. The buffer was zeroed and left unpopulated. BPE tokenizers (Qwen3 248K vocab, Llama 3, GPT-2 style) depend on merge pairs to combine byte tokens into word tokens. Without parsed merges, every byte was emitted as a separate token, producing garbage Unicode output. SentencePiece tokenizers (SmolLM2, Gemma) worked because they use character-level encoding and don't need BPE merges. The fix iterates over the GGUF string array, splits each "tok_a tok_b" merge rule, looks up token IDs, and stores the triple — identical to the existing JSON tokenizer path (tq_tokenizer.c:596-672). Applied to both tq_tokenizer.c (library) and quant.h (single-header / WASM). Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 10c49ff commit 2396027

2 files changed

Lines changed: 98 additions & 12 deletions

File tree

quant.h

Lines changed: 49 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -8033,18 +8033,61 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
80338033
}
80348034
}
80358035

8036-
/* Load merges if available */
8036+
/* Load and parse merges if available.
8037+
* GGUF stores merges as a string array of "tok_a tok_b" pairs.
8038+
* We need to look up token IDs and build (id_a, id_b, id_merged) triples
8039+
* so the BPE encoder can use them. */
80378040
int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
80388041
if (merges_idx >= 0) {
80398042
const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
80408043
if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
80418044
mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
8042-
/* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
8043-
uint64_t n_merges = mkv->value.array.count;
8044-
tok->n_merges = (int)n_merges;
8045-
tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
8045+
uint64_t n_merges_total = mkv->value.array.count;
8046+
tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
8047+
tok->n_merges = 0;
80468048
if (tok->merge_pairs) {
8047-
memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
8049+
tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
8050+
for (uint64_t mi = 0; mi < n_merges_total; mi++) {
8051+
if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
8052+
8053+
/* Copy merge string and split on space: "tok_a tok_b" */
8054+
char buf[2048];
8055+
int slen = (int)merge_strings[mi].len;
8056+
if (slen >= (int)sizeof(buf)) continue;
8057+
memcpy(buf, merge_strings[mi].str, (size_t)slen);
8058+
buf[slen] = '\0';
8059+
8060+
char* sep = strchr(buf, ' ');
8061+
if (!sep) continue;
8062+
*sep = '\0';
8063+
const char* str_a = buf;
8064+
const char* str_b = sep + 1;
8065+
8066+
/* Build merged string: concatenation of tok_a + tok_b */
8067+
char merged[2048];
8068+
int la = (int)strlen(str_a);
8069+
int lb = (int)strlen(str_b);
8070+
if (la + lb >= (int)sizeof(merged)) continue;
8071+
memcpy(merged, str_a, (size_t)la);
8072+
memcpy(merged + la, str_b, (size_t)lb);
8073+
merged[la + lb] = '\0';
8074+
8075+
/* Look up token IDs via linear scan (sorted_indices not built yet) */
8076+
int id_a = str_lookup(tok, str_a);
8077+
int id_b = str_lookup(tok, str_b);
8078+
int id_merged = str_lookup(tok, merged);
8079+
8080+
if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
8081+
tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
8082+
tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
8083+
tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
8084+
/* Priority: earlier merges in GGUF = higher priority */
8085+
tok->scores[id_merged] = (float)(n_merges_total - mi);
8086+
tok->n_merges++;
8087+
}
8088+
}
8089+
fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
8090+
tok->n_merges, (int)n_merges_total);
80488091
}
80498092
}
80508093
}

src/engine/tq_tokenizer.c

Lines changed: 49 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -881,18 +881,61 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
881881
}
882882
}
883883

884-
/* Load merges if available */
884+
/* Load and parse merges if available.
885+
* GGUF stores merges as a string array of "tok_a tok_b" pairs.
886+
* We need to look up token IDs and build (id_a, id_b, id_merged) triples
887+
* so the BPE encoder can use them. */
885888
int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
886889
if (merges_idx >= 0) {
887890
const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
888891
if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
889892
mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
890-
/* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
891-
uint64_t n_merges = mkv->value.array.count;
892-
tok->n_merges = (int)n_merges;
893-
tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
893+
uint64_t n_merges_total = mkv->value.array.count;
894+
tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
895+
tok->n_merges = 0;
894896
if (tok->merge_pairs) {
895-
memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
897+
tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
898+
for (uint64_t mi = 0; mi < n_merges_total; mi++) {
899+
if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
900+
901+
/* Copy merge string and split on space: "tok_a tok_b" */
902+
char buf[2048];
903+
int slen = (int)merge_strings[mi].len;
904+
if (slen >= (int)sizeof(buf)) continue;
905+
memcpy(buf, merge_strings[mi].str, (size_t)slen);
906+
buf[slen] = '\0';
907+
908+
char* sep = strchr(buf, ' ');
909+
if (!sep) continue;
910+
*sep = '\0';
911+
const char* str_a = buf;
912+
const char* str_b = sep + 1;
913+
914+
/* Build merged string: concatenation of tok_a + tok_b */
915+
char merged[2048];
916+
int la = (int)strlen(str_a);
917+
int lb = (int)strlen(str_b);
918+
if (la + lb >= (int)sizeof(merged)) continue;
919+
memcpy(merged, str_a, (size_t)la);
920+
memcpy(merged + la, str_b, (size_t)lb);
921+
merged[la + lb] = '\0';
922+
923+
/* Look up token IDs via linear scan (sorted_indices not built yet) */
924+
int id_a = str_lookup(tok, str_a);
925+
int id_b = str_lookup(tok, str_b);
926+
int id_merged = str_lookup(tok, merged);
927+
928+
if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
929+
tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
930+
tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
931+
tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
932+
/* Priority: earlier merges in GGUF = higher priority */
933+
tok->scores[id_merged] = (float)(n_merges_total - mi);
934+
tok->n_merges++;
935+
}
936+
}
937+
fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
938+
tok->n_merges, (int)n_merges_total);
896939
}
897940
}
898941
}

0 commit comments

Comments (0)