
Commit 528582a

unamedkr and claude committed
Llama 3.2 support + thought token filtering + EOS handling
- Fix hybrid attention detection: restrict to Gemma only (was breaking Llama by misdetecting head_dim=64 instead of 128 due to GQA kv_heads)
- Llama 3.2 3B Instruct: verified, 11.6 tok/s, correct code generation
- Filter Gemma 4 thinking tokens: thought, <channel|>, <tool|>, <mask>, <unused*>
- Add Llama 3 EOS tokens: 128001 (<|end_of_text|>), 128009 (<|eot_id|>)
- Clean output: "The capital of France is **Paris**." (no noise tokens)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1037f70 commit 528582a

4 files changed

Lines changed: 49 additions & 22 deletions


README.ko.md

Lines changed: 1 addition & 0 deletions
@@ -173,6 +173,7 @@ cmake --build build -j$(nproc)
 | Qwen3.5-4B | Qwen3.5 (DeltaNet) | 4B | PPL 검증 |
 | Qwen3.5-35B-A3B | Qwen2-MoE | 35B (3B active) | 동작 |
 | Gemma 3 270M | Gemma 3 | 270M | 동작 |
+| **Llama 3.2 3B-Instruct** | **Llama 3** | **3B** | **검증 완료 (11.6 tok/s)** |
 | **Gemma 4 26B-A4B-it** | **Gemma 4 MoE** | **26B (4B active)** | **검증 완료** |
 
 ### Gemma 4 26B-A4B (NEW)

README.md

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ Cross-model (4b K + Q4 V): SmolLM2 1.7B (-1.6%), Qwen3.5 0.8B (+0.9%), Qwen3.5 4
 | Qwen3.5-4B | Qwen3.5 (DeltaNet) | 4B | PPL verified |
 | Qwen3.5-35B-A3B | Qwen2-MoE | 35B (3B active) | Working |
 | Gemma 3 270M | Gemma 3 | 270M | Working |
+| **Llama 3.2 3B-Instruct** | **Llama 3** | **3B** | **Verified (11.6 tok/s)** |
 | **Gemma 4 26B-A4B-it** | **Gemma 4 MoE** | **26B (4B active)** | **Verified** |
 
 ### Gemma 4 26B-A4B (NEW)

src/engine/tq_generate.c

Lines changed: 34 additions & 16 deletions
@@ -209,15 +209,16 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int n_prompt = 0;
 
     if (tokenizer && prompt) {
-        /* Gemma models: prepend BOS=2 (required by both Gemma 3 and 4 architectures).
-         * Qwen3.5: no BOS. */
+        /* BOS token handling:
+         * Gemma 3/4: BOS=2 (required)
+         * LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
+         * Qwen3.5: no BOS needed */
         int add_bos = 0;
         if (model->config.model_type == 1) {
-            add_bos = 1; /* All Gemma models need BOS */
+            add_bos = 1; /* Gemma: always prepend BOS=2 */
         }
         n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
     } else {
-        /* No tokenizer: use BOS only (Gemma=2, Qwen=skip) */
         prompt_tokens[0] = (model->config.model_type == 1) ? 2 : 1;
         n_prompt = 1;
     }
@@ -285,29 +286,46 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int output_pos = 0;
     int prev_token = prompt_tokens[n_prompt - 1];
 
-    /* EOS token IDs — check common values.
-     * Qwen3.5: eos = 248044 (<|endoftext|>), also 248046 (<|im_end|>)
+    /* EOS token IDs — check common values across model families.
+     * Qwen3.5: eos = 248044 (<|endoftext|>), 248046 (<|im_end|>)
      * Gemma3: eos = 1
      * Gemma4: eos = 106 (<end_of_turn>)
-     * LLaMA: eos = 2 */
-    int eos_token1 = 1;      /* Gemma3 <eos>, also common default */
-    int eos_token2 = 248044; /* Qwen <|endoftext|> */
-    int eos_token3 = 248046; /* Qwen <|im_end|> */
-    int eos_token4 = 106;    /* Gemma4 <end_of_turn> */
+     * LLaMA 2: eos = 2
+     * LLaMA 3: eos = 128001 (<|end_of_text|>), 128009 (<|eot_id|>) */
+    int eos_tokens[] = {
+        1,      /* Gemma3 <eos> */
+        2,      /* LLaMA 2 </s> */
+        106,    /* Gemma4 <end_of_turn> */
+        128001, /* LLaMA 3 <|end_of_text|> */
+        128009, /* LLaMA 3 <|eot_id|> */
+        248044, /* Qwen <|endoftext|> */
+        248046, /* Qwen <|im_end|> */
+    };
+    int n_eos = sizeof(eos_tokens) / sizeof(eos_tokens[0]);
 
     /* Generate loop */
     while (generated < config->max_tokens) {
-        if (next_token == eos_token1 || next_token == eos_token2 ||
-            next_token == eos_token3 || next_token == eos_token4) break;
+        int is_eos = 0;
+        for (int e = 0; e < n_eos; e++) {
+            if (next_token == eos_tokens[e]) { is_eos = 1; break; }
+        }
+        if (is_eos) break;
        if (pos >= model->config.max_seq_len) break;
 
         /* Decode token to text */
         if (tokenizer) {
             const char* piece = tq_decode(tokenizer, prev_token, next_token);
 
-            /* Skip thinking tokens (e.g. Qwen3.5 <think>...</think>) */
-            if (piece && (strstr(piece, "<think>") || strstr(piece, "</think>"))) {
-                piece = "";
+            /* Skip special/thinking tokens that shouldn't appear in output.
+             * Qwen3.5: <think>...</think>
+             * Gemma 4: thought, <channel|>, <tool|>, <mask>, <unused*> */
+            if (piece) {
+                if (strstr(piece, "<think>") || strstr(piece, "</think>") ||
+                    strstr(piece, "thought") || strstr(piece, "<channel|>") ||
+                    strstr(piece, "<tool|>") || strstr(piece, "<mask>") ||
+                    strstr(piece, "<unused") || strstr(piece, "<|think")) {
+                    piece = "";
+                }
             }
 
             int piece_len = (int)strlen(piece);

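The EOS change above boils down to replacing four ad-hoc variables with a membership test over a small table. Below is a minimal standalone sketch of that pattern using the same token IDs as the diff; the helper name is illustrative and not part of the repo's API.

/* Sketch only: table-driven EOS check mirroring the eos_tokens[] array added
 * in tq_generate.c. is_eos_token() is a hypothetical helper, not repo API. */
#include <stdio.h>
#include <stddef.h>

static const int k_eos_tokens[] = {
    1,      /* Gemma3 <eos> */
    2,      /* LLaMA 2 </s> */
    106,    /* Gemma4 <end_of_turn> */
    128001, /* LLaMA 3 <|end_of_text|> */
    128009, /* LLaMA 3 <|eot_id|> */
    248044, /* Qwen <|endoftext|> */
    248046, /* Qwen <|im_end|> */
};

static int is_eos_token(int token) {
    for (size_t e = 0; e < sizeof(k_eos_tokens) / sizeof(k_eos_tokens[0]); e++) {
        if (token == k_eos_tokens[e]) return 1;
    }
    return 0;
}

int main(void) {
    /* Expected output: 1 1 0 */
    printf("%d %d %d\n", is_eos_token(128009), is_eos_token(106), is_eos_token(42));
    return 0;
}

The linear scan over seven constants runs once per generated token, so it costs essentially nothing while keeping new model families a one-line addition.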
src/engine/tq_model.c

Lines changed: 13 additions & 6 deletions
@@ -2914,15 +2914,14 @@ tq_model_t* tq_load_gguf(const char* path) {
         c->head_dim = c->hidden_dim / c->n_heads;
     }
 
-    /* For hybrid sliding/full attention (Gemma 4):
+    /* For hybrid sliding/full attention (Gemma 3/4 only):
      * Override head_dim from first layer's K tensor shape (sliding layer),
-     * since sliding layers are the majority and determine KV cache layout. */
-    {
+     * since sliding layers are the majority and determine KV cache layout.
+     * NOTE: only for Gemma family — Llama/Qwen use uniform head_dim. */
+    if (c->model_type == 1 && c->sliding_window > 0) {
         const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
         if (k0 && k0->n_dims >= 2) {
             int k_out = (int)k0->shape[1];
-            /* Try head_dim candidates: check if k_out / head_dim gives integer kv_heads */
-            /* Try from largest to smallest to prefer larger head_dim */
             int sliding_head_dim = c->head_dim;
             for (int hd = 512; hd >= 64; hd /= 2) {
                 if (k_out % hd == 0) {
@@ -2938,9 +2937,17 @@ tq_model_t* tq_load_gguf(const char* path) {
                         "sliding head_dim=%d (metadata: %d)\n", sliding_head_dim, c->head_dim);
                 c->head_dim = sliding_head_dim;
             }
-            /* Infer kv_heads from K tensor shape */
             c->n_kv_heads = k_out / c->head_dim;
         }
+    } else {
+        /* Non-Gemma: infer kv_heads from K tensor shape with metadata head_dim */
+        const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
+        if (k0 && k0->n_dims >= 2) {
+            int k_out = (int)k0->shape[1];
+            if (c->head_dim > 0 && k_out % c->head_dim == 0) {
+                c->n_kv_heads = k_out / c->head_dim;
+            }
+        }
     }
 
     /* MoE configuration */

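Why the Gemma-only guard matters: under grouped-query attention (GQA) the K projection's output size is n_kv_heads * head_dim rather than n_heads * head_dim, so a divisor search over that size alone cannot recover head_dim. A small self-contained illustration, assuming the published Llama 3.2 3B shapes (hidden_dim 3072, 24 heads, 8 KV heads; these numbers are not read from this repo):

/* Illustration only: why inferring head_dim from the K tensor's output size is
 * ambiguous under GQA. The numbers assume the published Llama 3.2 3B config
 * (hidden_dim=3072, n_heads=24, n_kv_heads=8); they are not read from this repo. */
#include <stdio.h>

int main(void) {
    int hidden_dim = 3072, n_heads = 24, n_kv_heads = 8;
    int head_dim = hidden_dim / n_heads;   /* 128 */
    int k_out = n_kv_heads * head_dim;     /* 1024: output size of blk.0.attn_k.weight */

    /* Same candidate loop shape as the override in tq_model.c: every
     * power-of-two divisor of k_out passes the modulo test. */
    for (int hd = 512; hd >= 64; hd /= 2) {
        if (k_out % hd == 0) {
            printf("candidate head_dim=%d -> implied kv_heads=%d\n", hd, k_out / hd);
        }
    }
    /* Prints 512->2, 256->4, 128->8, 64->16. Only head_dim=128 matches the
     * metadata, so non-Gemma models keep the metadata head_dim and only derive
     * n_kv_heads = k_out / head_dim (= 8 here). */
    return 0;
}

Gemma's hybrid sliding/full layout is the one case here where the first layer's K shape is allowed to override the metadata, which is why the override is now gated on model_type == 1 && sliding_window > 0.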