
Commit 528582a

unamedkr and claude committed
Llama 3.2 support + thought token filtering + EOS handling
- Fix hybrid attention detection: restrict to Gemma only (was breaking Llama by misdetecting head_dim=64 instead of 128 due to GQA kv_heads)
- Llama 3.2 3B Instruct: verified, 11.6 tok/s, correct code generation
- Filter Gemma 4 thinking tokens: thought, <channel|>, <tool|>, <mask>, <unused*>
- Add Llama 3 EOS tokens: 128001 (<|end_of_text|>), 128009 (<|eot_id|>)
- Clean output: "The capital of France is **Paris**." (no noise tokens)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1037f70 commit 528582a

4 files changed

Lines changed: 49 additions & 22 deletions


README.ko.md

Lines changed: 1 addition & 0 deletions
@@ -173,6 +173,7 @@ cmake --build build -j$(nproc)
 | Qwen3.5-4B | Qwen3.5 (DeltaNet) | 4B | PPL 검증 |
 | Qwen3.5-35B-A3B | Qwen2-MoE | 35B (3B active) | 동작 |
 | Gemma 3 270M | Gemma 3 | 270M | 동작 |
+| **Llama 3.2 3B-Instruct** | **Llama 3** | **3B** | **검증 완료 (11.6 tok/s)** |
 | **Gemma 4 26B-A4B-it** | **Gemma 4 MoE** | **26B (4B active)** | **검증 완료** |
 
 ### Gemma 4 26B-A4B (NEW)

README.md

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ Cross-model (4b K + Q4 V): SmolLM2 1.7B (-1.6%), Qwen3.5 0.8B (+0.9%), Qwen3.5 4
 | Qwen3.5-4B | Qwen3.5 (DeltaNet) | 4B | PPL verified |
 | Qwen3.5-35B-A3B | Qwen2-MoE | 35B (3B active) | Working |
 | Gemma 3 270M | Gemma 3 | 270M | Working |
+| **Llama 3.2 3B-Instruct** | **Llama 3** | **3B** | **Verified (11.6 tok/s)** |
 | **Gemma 4 26B-A4B-it** | **Gemma 4 MoE** | **26B (4B active)** | **Verified** |
 
 ### Gemma 4 26B-A4B (NEW)

src/engine/tq_generate.c

Lines changed: 34 additions & 16 deletions
@@ -209,15 +209,16 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int n_prompt = 0;
 
     if (tokenizer && prompt) {
-        /* Gemma models: prepend BOS=2 (required by both Gemma 3 and 4 architectures).
-         * Qwen3.5: no BOS. */
+        /* BOS token handling:
+         * Gemma 3/4: BOS=2 (required)
+         * LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
+         * Qwen3.5: no BOS needed */
         int add_bos = 0;
         if (model->config.model_type == 1) {
-            add_bos = 1; /* All Gemma models need BOS */
+            add_bos = 1; /* Gemma: always prepend BOS=2 */
         }
         n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
     } else {
-        /* No tokenizer: use BOS only (Gemma=2, Qwen=skip) */
         prompt_tokens[0] = (model->config.model_type == 1) ? 2 : 1;
         n_prompt = 1;
     }
@@ -285,29 +286,46 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int output_pos = 0;
     int prev_token = prompt_tokens[n_prompt - 1];
 
-    /* EOS token IDs — check common values.
-     * Qwen3.5: eos = 248044 (<|endoftext|>), also 248046 (<|im_end|>)
+    /* EOS token IDs — check common values across model families.
+     * Qwen3.5: eos = 248044 (<|endoftext|>), 248046 (<|im_end|>)
      * Gemma3: eos = 1
      * Gemma4: eos = 106 (<end_of_turn>)
-     * LLaMA: eos = 2 */
-    int eos_token1 = 1;      /* Gemma3 <eos>, also common default */
-    int eos_token2 = 248044; /* Qwen <|endoftext|> */
-    int eos_token3 = 248046; /* Qwen <|im_end|> */
-    int eos_token4 = 106;    /* Gemma4 <end_of_turn> */
+     * LLaMA 2: eos = 2
+     * LLaMA 3: eos = 128001 (<|end_of_text|>), 128009 (<|eot_id|>) */
+    int eos_tokens[] = {
+        1,      /* Gemma3 <eos> */
+        2,      /* LLaMA 2 </s> */
+        106,    /* Gemma4 <end_of_turn> */
+        128001, /* LLaMA 3 <|end_of_text|> */
+        128009, /* LLaMA 3 <|eot_id|> */
+        248044, /* Qwen <|endoftext|> */
+        248046, /* Qwen <|im_end|> */
+    };
+    int n_eos = sizeof(eos_tokens) / sizeof(eos_tokens[0]);
 
     /* Generate loop */
     while (generated < config->max_tokens) {
-        if (next_token == eos_token1 || next_token == eos_token2 ||
-            next_token == eos_token3 || next_token == eos_token4) break;
+        int is_eos = 0;
+        for (int e = 0; e < n_eos; e++) {
+            if (next_token == eos_tokens[e]) { is_eos = 1; break; }
+        }
+        if (is_eos) break;
        if (pos >= model->config.max_seq_len) break;
 
         /* Decode token to text */
         if (tokenizer) {
             const char* piece = tq_decode(tokenizer, prev_token, next_token);
 
-            /* Skip thinking tokens (e.g. Qwen3.5 <think>...</think>) */
-            if (piece && (strstr(piece, "<think>") || strstr(piece, "</think>"))) {
-                piece = "";
+            /* Skip special/thinking tokens that shouldn't appear in output.
+             * Qwen3.5: <think>...</think>
+             * Gemma 4: thought, <channel|>, <tool|>, <mask>, <unused*> */
+            if (piece) {
+                if (strstr(piece, "<think>") || strstr(piece, "</think>") ||
+                    strstr(piece, "thought") || strstr(piece, "<channel|>") ||
+                    strstr(piece, "<tool|>") || strstr(piece, "<mask>") ||
+                    strstr(piece, "<unused") || strstr(piece, "<|think")) {
+                    piece = "";
+                }
             }
 
             int piece_len = (int)strlen(piece);

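The EOS change above boils down to replacing four ad-hoc variables with a membership test over a small table. Below is a minimal standalone sketch of that pattern using the same token IDs as the diff; the helper name is illustrative and not part of the repo's API.

/* Sketch only: table-driven EOS check mirroring the eos_tokens[] array added
 * in tq_generate.c. is_eos_token() is a hypothetical helper, not repo API. */
#include <stdio.h>
#include <stddef.h>

static const int k_eos_tokens[] = {
    1,      /* Gemma3 <eos> */
    2,      /* LLaMA 2 </s> */
    106,    /* Gemma4 <end_of_turn> */
    128001, /* LLaMA 3 <|end_of_text|> */
    128009, /* LLaMA 3 <|eot_id|> */
    248044, /* Qwen <|endoftext|> */
    248046, /* Qwen <|im_end|> */
};

static int is_eos_token(int token) {
    for (size_t e = 0; e < sizeof(k_eos_tokens) / sizeof(k_eos_tokens[0]); e++) {
        if (token == k_eos_tokens[e]) return 1;
    }
    return 0;
}

int main(void) {
    /* Expected output: 1 1 0 */
    printf("%d %d %d\n", is_eos_token(128009), is_eos_token(106), is_eos_token(42));
    return 0;
}

The linear scan over seven constants runs once per generated token, so it costs essentially nothing while keeping new model families a one-line addition.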
src/engine/tq_model.c

Lines changed: 13 additions & 6 deletions
@@ -2914,15 +2914,14 @@ tq_model_t* tq_load_gguf(const char* path) {
         c->head_dim = c->hidden_dim / c->n_heads;
     }
 
-    /* For hybrid sliding/full attention (Gemma 4):
+    /* For hybrid sliding/full attention (Gemma 3/4 only):
      * Override head_dim from first layer's K tensor shape (sliding layer),
-     * since sliding layers are the majority and determine KV cache layout. */
-    {
+     * since sliding layers are the majority and determine KV cache layout.
+     * NOTE: only for Gemma family — Llama/Qwen use uniform head_dim. */
+    if (c->model_type == 1 && c->sliding_window > 0) {
         const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
         if (k0 && k0->n_dims >= 2) {
             int k_out = (int)k0->shape[1];
-            /* Try head_dim candidates: check if k_out / head_dim gives integer kv_heads */
-            /* Try from largest to smallest to prefer larger head_dim */
             int sliding_head_dim = c->head_dim;
             for (int hd = 512; hd >= 64; hd /= 2) {
                 if (k_out % hd == 0) {
@@ -2938,9 +2937,17 @@ tq_model_t* tq_load_gguf(const char* path) {
                         "sliding head_dim=%d (metadata: %d)\n", sliding_head_dim, c->head_dim);
                 c->head_dim = sliding_head_dim;
             }
-            /* Infer kv_heads from K tensor shape */
             c->n_kv_heads = k_out / c->head_dim;
         }
+    } else {
+        /* Non-Gemma: infer kv_heads from K tensor shape with metadata head_dim */
+        const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
+        if (k0 && k0->n_dims >= 2) {
+            int k_out = (int)k0->shape[1];
+            if (c->head_dim > 0 && k_out % c->head_dim == 0) {
+                c->n_kv_heads = k_out / c->head_dim;
+            }
+        }
     }
 
     /* MoE configuration */

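Why the Gemma-only guard matters: under grouped-query attention (GQA) the K projection's output size is n_kv_heads * head_dim rather than n_heads * head_dim, so a divisor search over that size alone cannot recover head_dim. A small self-contained illustration, assuming the published Llama 3.2 3B shapes (hidden_dim 3072, 24 heads, 8 KV heads; these numbers are not read from this repo):

/* Illustration only: why inferring head_dim from the K tensor's output size is
 * ambiguous under GQA. The numbers assume the published Llama 3.2 3B config
 * (hidden_dim=3072, n_heads=24, n_kv_heads=8); they are not read from this repo. */
#include <stdio.h>

int main(void) {
    int hidden_dim = 3072, n_heads = 24, n_kv_heads = 8;
    int head_dim = hidden_dim / n_heads;   /* 128 */
    int k_out = n_kv_heads * head_dim;     /* 1024: output size of blk.0.attn_k.weight */

    /* Same candidate loop shape as the override in tq_model.c: every
     * power-of-two divisor of k_out passes the modulo test. */
    for (int hd = 512; hd >= 64; hd /= 2) {
        if (k_out % hd == 0) {
            printf("candidate head_dim=%d -> implied kv_heads=%d\n", hd, k_out / hd);
        }
    }
    /* Prints 512->2, 256->4, 128->8, 64->16. Only head_dim=128 matches the
     * metadata, so non-Gemma models keep the metadata head_dim and only derive
     * n_kv_heads = k_out / head_dim (= 8 here). */
    return 0;
}

Gemma's hybrid sliding/full layout is the one case here where the first layer's K shape is allowed to override the metadata, which is why the override is now gated on model_type == 1 && sliding_window > 0.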