
Commit 1c656b5

unamedkr and claude committed
tokenizer: document O(n^2) BPE merge cap + Llama 3 detection guard
The max_tok cap at max_seq_len protects against the O(n^2) BPE merge on long texts. GPT2-style BPE produces one initial token per byte, so a 17KB text triggers roughly 17K^2 ≈ 289M merge operations, which is impractically slow. Added a sentencepiece detection guard (vocab < 100K) and documented the BPE complexity issue. All PPL measurements at 957 tokens remain the correct maximum for the current tokenizer. S1 correction #9 validated.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1a7fe20 commit 1c656b5

2 files changed

Lines changed: 20 additions & 4 deletions


src/engine/tq_tokenizer.c

Lines changed: 13 additions & 3 deletions
@@ -1152,9 +1152,19 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
     if (*text == '\0') return n_tokens;
 
     /* Detect tokenizer style: Gemma uses ▁ (U+2581) for spaces in vocab,
-     * GPT2/Qwen uses byte-level BPE with Ġ/ĉ encoding.
-     * Check if '▁' exists in vocab as a simple heuristic. */
-    int is_sentencepiece = (str_lookup(tok, "\xe2\x96\x81") >= 0); /* ▁ = U+2581 = 0xE2 0x96 0x81 */
+     * GPT2/Qwen/Llama3 uses byte-level BPE with Ġ/ĉ encoding.
+     * Heuristic: ▁ in vocab AND vocab_size < 100K → SentencePiece.
+     * Llama 3.x (128K vocab) has ▁ from the base model but uses tiktoken
+     * (GPT-style BPE). Using the sentencepiece path for these models drops
+     * most characters and produces far too few tokens. */
+    int has_spm_marker = (str_lookup(tok, "\xe2\x96\x81") >= 0);
+    int is_sentencepiece = has_spm_marker && tok->vocab_size < 100000;
+    static int dbg_once = 0;
+    if (!dbg_once) {
+        fprintf(stderr, "[tokenizer] vocab=%d, spm_marker=%d, is_sentencepiece=%d\n",
+                tok->vocab_size, has_spm_marker, is_sentencepiece);
+        dbg_once = 1;
+    }
 
     int text_len = (int)strlen(text);
tools/quant.c

Lines changed: 7 additions & 1 deletion
@@ -417,7 +417,13 @@ int main(int argc, char** argv) {
     text[nread] = '\0';
     fclose(fp);
 
-    /* Tokenize */
+    /* Tokenize.
+     * NOTE: BPE merge is O(n²) on the initial token count. For GPT2-style
+     * tokenizers, initial count ≈ text_len (one per byte). A 17KB text
+     * produces ~17K initial tokens → ~289M merge operations → minutes.
+     * We cap max_tok at max_seq_len to limit this. The eval thus covers
+     * only the first max_seq_len bytes' worth of text, not the full file.
+     * TODO: implement priority-queue BPE merge (O(n log n)) to remove cap. */
     int max_tok = (int)(nread + 256);
     if (max_tok > c->max_seq_len) max_tok = c->max_seq_len;
     int* tokens = (int*)malloc((size_t)max_tok * sizeof(int));
