Commit 900d6bc

unamedkr and claude committed
feat: Gemma 4 26B-A4B — model loads, tokens generated, hybrid attention WIP
Gemma 4 progress:
- GGUF architecture 'gemma4' correctly detected as Gemma family (model_type=1)
- sliding_window=1024 read from GGUF metadata (uint32 type)
- layer_is_sliding array populated from Q tensor shapes (25 sliding + 5 full)
- Sliding head_dim=256 auto-detected from blk.0.attn_k shape
- EOS fix: Gemma EOS=1, not 2 (2 is BOS for Gemma)
- Forward pass produces valid logits (no NaN)
- Token generation works but repeats (per-layer head_dim for full layers incomplete)

Remaining: full attention layers (5, 11, 17, 23, 29) need head_dim=512 + kv_heads=2 instead of sliding's head_dim=256 + kv_heads=8.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 04acadb commit 900d6bc

3 files changed: 106 additions & 8 deletions
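
For orientation, here is a minimal standalone sketch of the per-layer attention geometry this commit works toward, assembled purely from the numbers in the commit message (30 layers; full attention on layers 5, 11, 17, 23, 29 with head_dim=512 and kv_heads=2; sliding attention elsewhere with head_dim=256, kv_heads=8, window=1024). The program is illustrative, not engine code:

    #include <stdio.h>

    int main(void) {
        /* Full-attention layers 5, 11, 17, 23, 29 (every 6th), per the commit message */
        for (int l = 0; l < 30; l++) {
            if (l % 6 == 5)
                printf("layer %2d: full     head_dim=512  kv_heads=2\n", l);
            else
                printf("layer %2d: sliding  head_dim=256  kv_heads=8  window=1024\n", l);
        }
        return 0;
    }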

src/engine/tq_generate.c

Lines changed: 2 additions & 1 deletion

@@ -262,7 +262,8 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     /* EOS token IDs — check common values.
      * Qwen3.5: eos = 248044 (<|endoftext|>), also 248046 (<|im_end|>)
      * LLaMA: eos = 2 */
-    int eos_token1 = 2;      /* LLaMA convention */
+    /* EOS tokens — Gemma=1, Qwen=248044/248046 */
+    int eos_token1 = 1;      /* Gemma <eos>, also common default */
     int eos_token2 = 248044; /* Qwen <|endoftext|> */
     int eos_token3 = 248046; /* Qwen <|im_end|> */
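
The diff above only changes which IDs are treated as stop tokens. As a hedged illustration of how such a list is typically consumed (the actual sampling loop is not shown in this diff, so the loop shape and helper name here are assumptions):

    #include <stdio.h>

    /* Illustrative only: the IDs come from the diff; is_eos() is not an engine function. */
    static int is_eos(int tok) {
        return tok == 1        /* Gemma <eos>; Gemma's BOS is 2, which LLaMA uses as EOS */
            || tok == 248044   /* Qwen <|endoftext|> */
            || tok == 248046;  /* Qwen <|im_end|> */
    }

    int main(void) {
        /* Hypothetical stream: starts with Gemma BOS=2, ends with Gemma EOS=1.
         * Under the old check (eos_token1 = 2) generation would stop at position 0. */
        int stream[] = {2, 5432, 901, 1};
        for (int i = 0; i < 4; i++) {
            if (is_eos(stream[i])) { printf("stop at position %d\n", i); break; }
        }
        return 0;
    }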

src/engine/tq_model.c

Lines changed: 78 additions & 6 deletions

@@ -2829,18 +2829,53 @@ tq_model_t* tq_load_gguf(const char* path) {
     c->rope_freq_base = tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 1000000.0f);
     c->rms_norm_eps = tq_gguf_get_f32(gguf, GGUF_KEY("attention.layer_norm_rms_epsilon"), 1e-6f);

+    /* Sliding window + local RoPE base */
+    c->sliding_window = (int)tq_gguf_get_u32(gguf, GGUF_KEY("attention.sliding_window"), 0);
+    c->rope_local_base_freq = tq_gguf_get_f32(gguf, GGUF_KEY("rope.local.freq_base"),
+        tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 10000.0f));
+
     /* Cap context for memory safety on small machines.
      * GGUF models often claim 262K context but we cap at 4096 by default.
      * Users can override with --ctx flag in tq_run. */
     if (c->max_seq_len > 4096) c->max_seq_len = 4096;

-    /* Compute head_dim — prefer explicit key_length from metadata (Qwen3.5 has
-     * head_dim > hidden_dim/n_heads because attention expands the dimension) */
+    /* Compute head_dim — prefer explicit key_length from metadata.
+     * For Gemma 4: key_length=512 is for full attention layers,
+     * but sliding layers use 256. Detect from first layer's K tensor shape. */
     c->head_dim = tq_gguf_get_i32(gguf, GGUF_KEY("attention.key_length"), 0);
     if (c->head_dim == 0 && c->n_heads > 0) {
         c->head_dim = c->hidden_dim / c->n_heads;
     }

+    /* For hybrid sliding/full attention (Gemma 4):
+     * Override head_dim from first layer's K tensor shape (sliding layer),
+     * since sliding layers are the majority and determine KV cache layout. */
+    {
+        const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
+        if (k0 && k0->n_dims >= 2) {
+            int k_out = (int)k0->shape[1];
+            /* Try head_dim candidates: check if k_out / head_dim gives integer kv_heads */
+            /* Try from largest to smallest to prefer larger head_dim */
+            int sliding_head_dim = c->head_dim;
+            for (int hd = 512; hd >= 64; hd /= 2) {
+                if (k_out % hd == 0) {
+                    int kv = k_out / hd;
+                    if (kv >= 1 && kv <= c->n_heads && hd < c->head_dim) {
+                        sliding_head_dim = hd;
+                        break;
+                    }
+                }
+            }
+            if (sliding_head_dim != c->head_dim) {
+                fprintf(stderr, "tq_load_gguf: hybrid attention detected — "
+                        "sliding head_dim=%d (metadata: %d)\n", sliding_head_dim, c->head_dim);
+                c->head_dim = sliding_head_dim;
+            }
+            /* Infer kv_heads from K tensor shape */
+            c->n_kv_heads = k_out / c->head_dim;
+        }
+    }
+
     /* MoE configuration */
     c->num_experts = tq_gguf_get_i32(gguf, GGUF_KEY("expert_count"), 0);
     c->num_active_experts = tq_gguf_get_i32(gguf, GGUF_KEY("expert_used_count"), 0);
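
To see how the candidate scan lands on the sliding geometry, here is a standalone walk-through with the values implied by this commit (blk.0.attn_k output = 8 kv_heads * 256 = 2048, metadata key_length = 512, n_heads = 16 per the tq_transformer.c comment; the 2048 is inferred, not read from the GGUF):

    #include <stdio.h>

    int main(void) {
        int k_out = 2048, meta_head_dim = 512, n_heads = 16;
        int sliding_head_dim = meta_head_dim;
        for (int hd = 512; hd >= 64; hd /= 2) {
            if (k_out % hd == 0) {
                int kv = k_out / hd;
                /* hd=512 gives kv=4 but fails hd < meta_head_dim; hd=256 gives kv=8 */
                if (kv >= 1 && kv <= n_heads && hd < meta_head_dim) {
                    sliding_head_dim = hd;
                    break;
                }
            }
        }
        printf("sliding head_dim=%d, kv_heads=%d\n",
               sliding_head_dim, k_out / sliding_head_dim); /* prints 256, 8 */
        return 0;
    }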
@@ -2873,11 +2908,15 @@ tq_model_t* tq_load_gguf(const char* path) {
                 c->expert_intermediate_dim, c->has_shared_expert);
     }

-    /* Model type detection */
-    if (c->is_moe) {
-        c->model_type = 2; /* qwen2moe / qwen3.5 moe */
+    /* Model type detection — Gemma takes priority (Gemma 4 is both Gemma AND MoE) */
+    if (strstr(gguf->arch, "gemma") != NULL) {
+        c->model_type = 1; /* gemma family */
+        c->n_norms_per_block = 4;
+        fprintf(stderr, "tq_load_gguf: Gemma family detected (sliding_window=%d)\n", c->sliding_window);
+    } else if (c->is_moe) {
+        c->model_type = 2; /* qwen moe */
     } else {
-        c->model_type = 0; /* default qwen35 */
+        c->model_type = 0; /* qwen35 */
     }

     fprintf(stderr, "tq_load_gguf: config — layers=%d, dim=%d, heads=%d/%d, head_dim=%d, vocab=%d\n",
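
One line worth flagging: n_norms_per_block = 4. The diff does not explain it, but Gemma-family blocks (as in Gemma 2/3) normalize before and after both sublayers, so four RMSNorm weights per block is the expected count. A hypothetical layout, with illustrative field names rather than the engine's actual struct:

    /* Hypothetical: four per-block RMSNorm weights, matching the Gemma 2/3
     * convention of pre- and post-norms around attention and the MLP. */
    typedef struct {
        const float* attn_pre_norm;   /* before self-attention */
        const float* attn_post_norm;  /* after self-attention  */
        const float* ffn_pre_norm;    /* before the MLP        */
        const float* ffn_post_norm;   /* after the MLP         */
    } gemma_block_norms_t;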
@@ -3206,6 +3245,39 @@ tq_model_t* tq_load_gguf(const char* path) {
                 n_attn_layers, c->n_layers);
     }

+    /* Set up layer_is_sliding for Gemma hybrid attention.
+     * Detect from Q tensor shape: sliding layers have smaller Q output dim. */
+    if (c->sliding_window > 0 && c->model_type == 1) {
+        model->layer_is_sliding = (int*)calloc((size_t)c->n_layers, sizeof(int));
+        if (model->layer_is_sliding) {
+            /* Find the smallest Q output dim (sliding) */
+            int min_q = 999999;
+            for (int l = 0; l < c->n_layers; l++) {
+                char tname[128];
+                snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l);
+                const tq_gguf_tensor_t* qt = tq_gguf_find_tensor(gguf, tname);
+                if (qt && (int)qt->shape[1] < min_q) min_q = (int)qt->shape[1];
+            }
+            int n_sliding = 0, n_full = 0;
+            for (int l = 0; l < c->n_layers; l++) {
+                char tname[128];
+                snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l);
+                const tq_gguf_tensor_t* qt = tq_gguf_find_tensor(gguf, tname);
+                if (qt && (int)qt->shape[1] == min_q) {
+                    model->layer_is_sliding[l] = 1;
+                    n_sliding++;
+                } else {
+                    model->layer_is_sliding[l] = 0;
+                    n_full++;
+                }
+            }
+            if (n_full > 0) {
+                fprintf(stderr, "tq_load_gguf: Gemma hybrid — %d sliding + %d full attention layers\n",
+                        n_sliding, n_full);
+            }
+        }
+    }
+
     /* Load embedding + output weights */
     const tq_gguf_tensor_t* emb_t = find_gguf_tensor(gguf, "token_embd.weight");
     if (emb_t) {
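
To sanity-check the min-Q classification, here is a standalone walk-through under the shapes implied elsewhere in this commit (the tq_transformer.c comment gives full-layer Q output as 16 * 512 = 8192; sliding layers at head_dim=256 with the same 16 heads would give 4096; both numbers are assumptions, not read from the GGUF):

    #include <stdio.h>

    int main(void) {
        int q_out[30], min_q = 999999;
        for (int l = 0; l < 30; l++)
            q_out[l] = (l % 6 == 5) ? 8192 : 4096; /* full layers 5,11,17,23,29 */
        for (int l = 0; l < 30; l++)
            if (q_out[l] < min_q) min_q = q_out[l];
        int n_sliding = 0, n_full = 0;
        for (int l = 0; l < 30; l++) {
            if (q_out[l] == min_q) n_sliding++;
            else                   n_full++;
        }
        printf("%d sliding + %d full\n", n_sliding, n_full); /* prints: 25 sliding + 5 full */
        return 0;
    }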

src/engine/tq_transformer.c

Lines changed: 26 additions & 1 deletion

@@ -860,9 +860,34 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     int head_dim = c->head_dim;
     int n_heads = c->n_heads;
     int n_kv_heads = c->n_kv_heads;
+
+    /* Gemma 4 hybrid: full attention layers have different head_dim and kv_heads.
+     * Detect from GGUF weight shapes: if Q output > n_heads * head_dim, it's a full layer. */
+    if (model->layer_is_sliding && !model->layer_is_sliding[l] && layer->gguf_wq) {
+        /* Full attention layer: infer head_dim from Q tensor.
+         * Q shape = [hidden_dim, n_heads * full_head_dim * (1 + gate)] */
+        int q_out = 0;
+        /* Get Q output dim from GGUF tensor — stored at load time in gguf_wq_type's neighbor.
+         * Simpler: compute from expected: global_head_dim = metadata key_length */
+        int global_head_dim = tq_gguf_get_i32((const tq_gguf_ctx_t*)model->gguf_ctx,
+                                              "gemma4.attention.key_length", head_dim);
+        if (global_head_dim > head_dim) {
+            head_dim = global_head_dim;
+            /* For full layers, kv_heads is typically smaller */
+            /* K shape for full: [dim, kv_heads_full * global_head_dim]
+             * We know K_out from sliding kv_dim * (global/sliding) ratio... or just compute:
+             * Total Q = n_heads * global_head_dim = 16 * 512 = 8192
+             * Total K = ? from tensor. For now, infer: */
+            n_kv_heads = c->n_kv_heads * c->head_dim / global_head_dim;
+            if (n_kv_heads < 1) n_kv_heads = 1;
+        }
+    }
+
     int kv_dim = n_kv_heads * head_dim;
     int kv_mul = n_heads / n_kv_heads;
-    size_t kv_layer_stride = (size_t)c->max_seq_len * kv_dim;
+    /* KV cache stride uses the global (sliding) config for uniform allocation */
+    int cache_kv_dim = c->n_kv_heads * c->head_dim;
+    size_t kv_layer_stride = (size_t)c->max_seq_len * cache_kv_dim;

     /* Pre-quantize activation to Q8 once for all Q2/Q4 projections in this layer.
      * This eliminates redundant tq_quantize_row_q8 + malloc/free in each matmul call. */
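
The fallback at the heart of this hunk preserves total KV width (kv_heads * head_dim stays constant), which is exactly what the commit message flags as incomplete: with the sliding config kv_heads=8, head_dim=256 and full head_dim=512, the heuristic yields 8 * 256 / 512 = 4 KV heads, while the message says full layers actually need 2. A standalone check of that arithmetic:

    #include <stdio.h>

    int main(void) {
        int sliding_kv_heads = 8, sliding_head_dim = 256, full_head_dim = 512;
        /* Same formula as n_kv_heads = c->n_kv_heads * c->head_dim / global_head_dim */
        int inferred = sliding_kv_heads * sliding_head_dim / full_head_dim;
        printf("inferred kv_heads=%d, needed per commit message=%d\n", inferred, 2);
        return 0; /* 4 vs 2: K/V on full layers get the wrong geometry */
    }

This is presumably why tokens repeat: the five full-attention layers read K/V with the wrong layout.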
