
Commit 926e7c3

unamedkr and claude committed
Fix Gemma 4 NaN regression + clean output filtering
- Fix: hybrid attention detection used model_type before it was set, causing Gemma 4 to skip sliding/full detection → NaN logits. Now uses gguf->arch string directly.
- Filter Gemma 4 noise: thought (exact match), <channel|>, <tool|>, <mask>
- Filter Llama 3 noise: <|reserved_special_token|>, <1st>/<2nd>/<3rd>
- Text-based stop detection: <|start_header_id|>, <|eot_id|>, <|im_end|>
- Accumulated output scan for multi-token turn markers

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b5248cb commit 926e7c3
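
The root cause was an ordering bug: the hybrid-attention override in tq_load_gguf tested c->model_type, but that field is only assigned later in the load path, so for Gemma 4 the check always saw its zero-initialized value, the override never ran, and the mismatched KV cache layout produced NaN logits. A minimal sketch of the failure mode; the struct layout and load order here are invented for illustration, only the field names mirror the diff:

#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified config struct. */
typedef struct {
    int model_type;     /* 0 until classified; 1 = Gemma in this sketch */
    int sliding_window; /* parsed early from GGUF metadata */
} config_t;

int main(void) {
    config_t c = {0};
    const char* arch = "gemma4"; /* stands in for gguf->arch */
    c.sliding_window = 1024;

    /* Buggy order: model_type is still 0 here, so the Gemma-only
     * head_dim override is silently skipped. */
    if (c.model_type == 1 && c.sliding_window > 0)
        puts("override runs");                    /* never printed */

    /* The fix keys off the arch string, which is available this early. */
    if (strstr(arch, "gemma") != NULL && c.sliding_window > 0)
        puts("override runs (arch-based check)"); /* printed */

    c.model_type = 1; /* ...only assigned later in the real load path */
    return 0;
}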

2 files changed: 44 additions & 5 deletions

src/engine/tq_generate.c

Lines changed: 40 additions & 3 deletions
@@ -297,6 +297,9 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         2,      /* LLaMA 2 </s> */
         106,    /* Gemma4 <end_of_turn> */
         128001, /* LLaMA 3 <|end_of_text|> */
+        128006, /* LLaMA 3 <|start_header_id|> (new turn = stop) */
+        128007, /* LLaMA 3 <|end_header_id|> */
+        128008, /* LLaMA 3 <|start_of_role|> */
         128009, /* LLaMA 3 <|eot_id|> */
         248044, /* Qwen <|endoftext|> */
         248046, /* Qwen <|im_end|> */
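
For context, a stop table like this is normally consumed by a simple membership test after each sampling step. A minimal sketch, assuming tq_generate checks the sampled ID against the table in a linear scan (the helper name and loop structure are invented):

#include <stddef.h>

/* Token IDs copied from the diff above; everything else is illustrative. */
static const int STOP_TOKENS[] = {
    2, 106, 128001, 128006, 128007, 128008, 128009, 248044, 248046,
};

static int is_stop_token(int id) {
    for (size_t i = 0; i < sizeof STOP_TOKENS / sizeof STOP_TOKENS[0]; i++)
        if (STOP_TOKENS[i] == id) return 1;
    return 0;
}

Treating <|start_header_id|> (128006) as a stop token follows the comment in the diff: a new header means a new turn, so generation for the current turn is over by definition.
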
@@ -318,14 +321,48 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
 
         /* Skip special/thinking tokens that shouldn't appear in output.
          * Qwen3.5: <think>...</think>
-         * Gemma 4: thought, <channel|>, <tool|>, <mask>, <unused*> */
+         * Gemma 4: thought, <channel|>, <tool|>, <mask>, <unused*>
+         * LLaMA 3: <|start_header_id|>, <|reserved_special_token_*|> */
+        int should_stop = 0;
         if (piece) {
             if (strstr(piece, "<think>") || strstr(piece, "</think>") ||
-                strstr(piece, "thought") || strstr(piece, "<channel|>") ||
-                strstr(piece, "<tool|>") || strstr(piece, "<mask>") ||
+                strstr(piece, "<channel|>") || strstr(piece, "<tool|>") ||
+                strstr(piece, "<mask>") ||
                 strstr(piece, "<unused") || strstr(piece, "<|think")) {
                 piece = "";
             }
+            /* Gemma 4 "thought" token: only filter if it's the EXACT piece
+             * (not a substring of normal text like "thoughtful") */
+            if (piece[0] != '\0' && strcmp(piece, "thought") == 0) {
+                piece = "";
+            }
+            /* Stop generation on turn-boundary tokens (LLaMA 3 / Qwen only).
+             * Gemma uses token ID-based EOS (106), not text-based detection. */
+            if (strstr(piece, "<|start_header_id|>") ||
+                strstr(piece, "<|eot_id|>") ||
+                strstr(piece, "<|im_end|>")) {
+                should_stop = 1;
+                piece = "";
+            }
+            /* Filter reserved special tokens */
+            if (strstr(piece, "<|reserved_special_token") ||
+                strstr(piece, "<1st>") || strstr(piece, "<2nd>") || strstr(piece, "<3rd>")) {
+                piece = "";
+            }
+        }
+        if (should_stop) break;
+
+        /* Also check accumulated output for turn markers that span multiple tokens */
+        if (output && output_pos > 5) {
+            const char* tail = output + (output_pos > 20 ? output_pos - 20 : 0);
+            if (strstr(tail, "<|start_header") || strstr(tail, "<|eot_id") ||
+                strstr(tail, "<end_of_turn") || strstr(tail, "<|im_end")) {
+                /* Trim the marker from output */
+                char* marker = strstr(output + (output_pos > 30 ? output_pos - 30 : 0), "<|");
+                if (!marker) marker = strstr(output + (output_pos > 30 ? output_pos - 30 : 0), "<end");
+                if (marker) { *marker = '\0'; output_pos = (int)(marker - output); }
+                break;
+            }
         }
 
         int piece_len = (int)strlen(piece);
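
The accumulated-output scan exists because BPE tokenizers routinely split a marker such as <|eot_id|> across several pieces, so no single piece ever contains the full string. A self-contained sketch of the trim logic added above; the buffer names match the diff, the surrounding harness is invented:

#include <stdio.h>
#include <string.h>

int main(void) {
    char output[64] = "The answer is 42.<|eot_id"; /* marker split across tokens */
    int output_pos = (int)strlen(output);

    if (output_pos > 5) {
        /* Scan only the last ~20 bytes; older text cannot hold a fresh marker. */
        const char* tail = output + (output_pos > 20 ? output_pos - 20 : 0);
        if (strstr(tail, "<|eot_id")) {
            /* Re-find the marker start in a slightly wider window and cut there. */
            char* marker = strstr(output + (output_pos > 30 ? output_pos - 30 : 0), "<|");
            if (marker) { *marker = '\0'; output_pos = (int)(marker - output); }
        }
    }
    printf("%s (len %d)\n", output, output_pos); /* The answer is 42. (len 17) */
    return 0;
}

One caveat about the committed version: the trim cuts at the first "<|" (or "<end") found in the 30-byte window, so a literal "<|" in ordinary model text near the marker would be trimmed along with it.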

src/engine/tq_model.c

Lines changed: 4 additions & 2 deletions
@@ -2917,8 +2917,10 @@ tq_model_t* tq_load_gguf(const char* path) {
     /* For hybrid sliding/full attention (Gemma 3/4 only):
      * Override head_dim from first layer's K tensor shape (sliding layer),
      * since sliding layers are the majority and determine KV cache layout.
-     * NOTE: only for Gemma family — Llama/Qwen use uniform head_dim. */
-    if (c->model_type == 1 && c->sliding_window > 0) {
+     * NOTE: only for Gemma family — Llama/Qwen use uniform head_dim.
+     * Use arch string directly since model_type hasn't been set yet. */
+    int is_gemma_arch = (strstr(gguf->arch, "gemma") != NULL);
+    if (is_gemma_arch && c->sliding_window > 0) {
         const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
         if (k0 && k0->n_dims >= 2) {
             int k_out = (int)k0->shape[1];
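
Downstream of this check, in the truncated part of the hunk, head_dim is presumably recomputed from the K projection's output width. A hedged sketch of that arithmetic, assuming blk.0.attn_k.weight has shape [n_embd, n_kv_heads * head_dim]; only k_out comes from the diff, the helper and field names are assumptions:

/* Derive the per-head dimension from the K tensor's output size. */
static int derive_head_dim(int k_out, int n_kv_heads, int fallback) {
    if (n_kv_heads > 0 && k_out % n_kv_heads == 0)
        return k_out / n_kv_heads;
    return fallback; /* keep the metadata value if the shapes don't divide */
}

/* e.g. derive_head_dim(1024, 4, 128) == 256 for a Gemma-style K projection */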

0 commit comments
