
Commit 926e7c3

unamedkr and claude committed
Fix Gemma 4 NaN regression + clean output filtering
- Fix: hybrid attention detection used model_type before it was set, causing Gemma 4 to skip sliding/full detection → NaN logits. Now uses gguf->arch string directly.
- Filter Gemma 4 noise: thought (exact match), <channel|>, <tool|>, <mask>
- Filter Llama 3 noise: <|reserved_special_token|>, <1st>/<2nd>/<3rd>
- Text-based stop detection: <|start_header_id|>, <|eot_id|>, <|im_end|>
- Accumulated output scan for multi-token turn markers

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b5248cb commit 926e7c3
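
The root cause was an ordering bug: the hybrid-attention override in tq_load_gguf tested c->model_type, but that field is only assigned later in the load path, so for Gemma 4 the check always saw its zero-initialized value, the override never ran, and the mismatched KV cache layout produced NaN logits. A minimal sketch of the failure mode; the struct layout and load order here are invented for illustration, only the field names mirror the diff:

#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified config struct. */
typedef struct {
    int model_type;     /* 0 until classified; 1 = Gemma in this sketch */
    int sliding_window; /* parsed early from GGUF metadata */
} config_t;

int main(void) {
    config_t c = {0};
    const char* arch = "gemma4"; /* stands in for gguf->arch */
    c.sliding_window = 1024;

    /* Buggy order: model_type is still 0 here, so the Gemma-only
     * head_dim override is silently skipped. */
    if (c.model_type == 1 && c.sliding_window > 0)
        puts("override runs");                    /* never printed */

    /* The fix keys off the arch string, which is available this early. */
    if (strstr(arch, "gemma") != NULL && c.sliding_window > 0)
        puts("override runs (arch-based check)"); /* printed */

    c.model_type = 1; /* ...only assigned later in the real load path */
    return 0;
}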

2 files changed: 44 additions & 5 deletions

src/engine/tq_generate.c

Lines changed: 40 additions & 3 deletions
@@ -297,6 +297,9 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         2,      /* LLaMA 2 </s> */
         106,    /* Gemma4 <end_of_turn> */
         128001, /* LLaMA 3 <|end_of_text|> */
+        128006, /* LLaMA 3 <|start_header_id|> (new turn = stop) */
+        128007, /* LLaMA 3 <|end_header_id|> */
+        128008, /* LLaMA 3 <|start_of_role|> */
         128009, /* LLaMA 3 <|eot_id|> */
         248044, /* Qwen <|endoftext|> */
         248046, /* Qwen <|im_end|> */
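
For context, a stop table like this is normally consumed by a simple membership test after each sampling step. A minimal sketch, assuming tq_generate checks the sampled ID against the table in a linear scan (the helper name and loop structure are invented):

#include <stddef.h>

/* Token IDs copied from the diff above; everything else is illustrative. */
static const int STOP_TOKENS[] = {
    2, 106, 128001, 128006, 128007, 128008, 128009, 248044, 248046,
};

static int is_stop_token(int id) {
    for (size_t i = 0; i < sizeof STOP_TOKENS / sizeof STOP_TOKENS[0]; i++)
        if (STOP_TOKENS[i] == id) return 1;
    return 0;
}

Treating <|start_header_id|> (128006) as a stop token follows the comment in the diff: a new header means a new turn, so generation for the current turn is over by definition.
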
@@ -318,14 +321,48 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
 
         /* Skip special/thinking tokens that shouldn't appear in output.
          * Qwen3.5: <think>...</think>
-         * Gemma 4: thought, <channel|>, <tool|>, <mask>, <unused*> */
+         * Gemma 4: thought, <channel|>, <tool|>, <mask>, <unused*>
+         * LLaMA 3: <|start_header_id|>, <|reserved_special_token_*|> */
+        int should_stop = 0;
         if (piece) {
             if (strstr(piece, "<think>") || strstr(piece, "</think>") ||
-                strstr(piece, "thought") || strstr(piece, "<channel|>") ||
-                strstr(piece, "<tool|>") || strstr(piece, "<mask>") ||
+                strstr(piece, "<channel|>") || strstr(piece, "<tool|>") ||
+                strstr(piece, "<mask>") ||
                 strstr(piece, "<unused") || strstr(piece, "<|think")) {
                 piece = "";
             }
+            /* Gemma 4 "thought" token: only filter if it's the EXACT piece
+             * (not a substring of normal text like "thoughtful") */
+            if (piece[0] != '\0' && strcmp(piece, "thought") == 0) {
+                piece = "";
+            }
+            /* Stop generation on turn-boundary tokens (LLaMA 3 / Qwen only).
+             * Gemma uses token ID-based EOS (106), not text-based detection. */
+            if (strstr(piece, "<|start_header_id|>") ||
+                strstr(piece, "<|eot_id|>") ||
+                strstr(piece, "<|im_end|>")) {
+                should_stop = 1;
+                piece = "";
+            }
+            /* Filter reserved special tokens */
+            if (strstr(piece, "<|reserved_special_token") ||
+                strstr(piece, "<1st>") || strstr(piece, "<2nd>") || strstr(piece, "<3rd>")) {
+                piece = "";
+            }
+        }
+        if (should_stop) break;
+
+        /* Also check accumulated output for turn markers that span multiple tokens */
+        if (output && output_pos > 5) {
+            const char* tail = output + (output_pos > 20 ? output_pos - 20 : 0);
+            if (strstr(tail, "<|start_header") || strstr(tail, "<|eot_id") ||
+                strstr(tail, "<end_of_turn") || strstr(tail, "<|im_end")) {
+                /* Trim the marker from output */
+                char* marker = strstr(output + (output_pos > 30 ? output_pos - 30 : 0), "<|");
+                if (!marker) marker = strstr(output + (output_pos > 30 ? output_pos - 30 : 0), "<end");
+                if (marker) { *marker = '\0'; output_pos = (int)(marker - output); }
+                break;
+            }
         }
 
         int piece_len = (int)strlen(piece);
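
The accumulated-output scan exists because BPE tokenizers routinely split a marker such as <|eot_id|> across several pieces, so no single piece ever contains the full string. A self-contained sketch of the trim logic added above; the buffer names match the diff, the surrounding harness is invented:

#include <stdio.h>
#include <string.h>

int main(void) {
    char output[64] = "The answer is 42.<|eot_id"; /* marker split across tokens */
    int output_pos = (int)strlen(output);

    if (output_pos > 5) {
        /* Scan only the last ~20 bytes; older text cannot hold a fresh marker. */
        const char* tail = output + (output_pos > 20 ? output_pos - 20 : 0);
        if (strstr(tail, "<|eot_id")) {
            /* Re-find the marker start in a slightly wider window and cut there. */
            char* marker = strstr(output + (output_pos > 30 ? output_pos - 30 : 0), "<|");
            if (marker) { *marker = '\0'; output_pos = (int)(marker - output); }
        }
    }
    printf("%s (len %d)\n", output, output_pos); /* The answer is 42. (len 17) */
    return 0;
}

One caveat about the committed version: the trim cuts at the first "<|" (or "<end") found in the 30-byte window, so a literal "<|" in ordinary model text near the marker would be trimmed along with it.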

src/engine/tq_model.c

Lines changed: 4 additions & 2 deletions
@@ -2917,8 +2917,10 @@ tq_model_t* tq_load_gguf(const char* path) {
     /* For hybrid sliding/full attention (Gemma 3/4 only):
      * Override head_dim from first layer's K tensor shape (sliding layer),
      * since sliding layers are the majority and determine KV cache layout.
-     * NOTE: only for Gemma family — Llama/Qwen use uniform head_dim. */
-    if (c->model_type == 1 && c->sliding_window > 0) {
+     * NOTE: only for Gemma family — Llama/Qwen use uniform head_dim.
+     * Use arch string directly since model_type hasn't been set yet. */
+    int is_gemma_arch = (strstr(gguf->arch, "gemma") != NULL);
+    if (is_gemma_arch && c->sliding_window > 0) {
         const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
         if (k0 && k0->n_dims >= 2) {
             int k_out = (int)k0->shape[1];
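
Downstream of this check, in the truncated part of the hunk, head_dim is presumably recomputed from the K projection's output width. A hedged sketch of that arithmetic, assuming blk.0.attn_k.weight has shape [n_embd, n_kv_heads * head_dim]; only k_out comes from the diff, the helper and field names are assumptions:

/* Derive the per-head dimension from the K tensor's output size. */
static int derive_head_dim(int k_out, int n_kv_heads, int fallback) {
    if (n_kv_heads > 0 && k_out % n_kv_heads == 0)
        return k_out / n_kv_heads;
    return fallback; /* keep the metadata value if the shapes don't divide */
}

/* e.g. derive_head_dim(1024, 4, 128) == 256 for a Gemma-style K projection */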

0 commit comments
