Commit 900d6bc

unamedkr and claude committed
feat: Gemma 4 26B-A4B — model loads, tokens generated, hybrid attention WIP
Gemma 4 progress:
- GGUF architecture 'gemma4' correctly detected as Gemma family (model_type=1)
- sliding_window=1024 read from GGUF metadata (uint32 type)
- layer_is_sliding array populated from Q tensor shapes (25 sliding + 5 full)
- Sliding head_dim=256 auto-detected from blk.0.attn_k shape
- EOS fix: Gemma EOS=1, not 2 (2 is BOS for Gemma)
- Forward pass produces valid logits (no NaN)
- Token generation works but repeats (per-layer head_dim for full layers incomplete)

Remaining: full attention layers (5, 11, 17, 23, 29) need head_dim=512 + kv_heads=2 instead of sliding's head_dim=256 + kv_heads=8.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 04acadb commit 900d6bc

3 files changed: 106 additions & 8 deletions
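
For orientation, here is a minimal standalone sketch of the per-layer attention geometry this commit works toward, assembled purely from the numbers in the commit message (30 layers; full attention on layers 5, 11, 17, 23, 29 with head_dim=512 and kv_heads=2; sliding attention elsewhere with head_dim=256, kv_heads=8, window=1024). The program is illustrative, not engine code:

    #include <stdio.h>

    int main(void) {
        /* Full-attention layers 5, 11, 17, 23, 29 (every 6th), per the commit message */
        for (int l = 0; l < 30; l++) {
            if (l % 6 == 5)
                printf("layer %2d: full     head_dim=512  kv_heads=2\n", l);
            else
                printf("layer %2d: sliding  head_dim=256  kv_heads=8  window=1024\n", l);
        }
        return 0;
    }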

src/engine/tq_generate.c

Lines changed: 2 additions & 1 deletion

@@ -262,7 +262,8 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     /* EOS token IDs — check common values.
      * Qwen3.5: eos = 248044 (<|endoftext|>), also 248046 (<|im_end|>)
      * LLaMA: eos = 2 */
-    int eos_token1 = 2;      /* LLaMA convention */
+    /* EOS tokens — Gemma=1, Qwen=248044/248046 */
+    int eos_token1 = 1;      /* Gemma <eos>, also common default */
     int eos_token2 = 248044; /* Qwen <|endoftext|> */
     int eos_token3 = 248046; /* Qwen <|im_end|> */
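
The diff above only changes which IDs are treated as stop tokens. As a hedged illustration of how such a list is typically consumed (the actual sampling loop is not shown in this diff, so the loop shape and helper name here are assumptions):

    #include <stdio.h>

    /* Illustrative only: the IDs come from the diff; is_eos() is not an engine function. */
    static int is_eos(int tok) {
        return tok == 1        /* Gemma <eos>; Gemma's BOS is 2, which LLaMA uses as EOS */
            || tok == 248044   /* Qwen <|endoftext|> */
            || tok == 248046;  /* Qwen <|im_end|> */
    }

    int main(void) {
        /* Hypothetical stream: starts with Gemma BOS=2, ends with Gemma EOS=1.
         * Under the old check (eos_token1 = 2) generation would stop at position 0. */
        int stream[] = {2, 5432, 901, 1};
        for (int i = 0; i < 4; i++) {
            if (is_eos(stream[i])) { printf("stop at position %d\n", i); break; }
        }
        return 0;
    }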

src/engine/tq_model.c

Lines changed: 78 additions & 6 deletions

@@ -2829,18 +2829,53 @@ tq_model_t* tq_load_gguf(const char* path) {
     c->rope_freq_base = tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 1000000.0f);
     c->rms_norm_eps = tq_gguf_get_f32(gguf, GGUF_KEY("attention.layer_norm_rms_epsilon"), 1e-6f);

+    /* Sliding window + local RoPE base */
+    c->sliding_window = (int)tq_gguf_get_u32(gguf, GGUF_KEY("attention.sliding_window"), 0);
+    c->rope_local_base_freq = tq_gguf_get_f32(gguf, GGUF_KEY("rope.local.freq_base"),
+        tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 10000.0f));
+
     /* Cap context for memory safety on small machines.
      * GGUF models often claim 262K context but we cap at 4096 by default.
      * Users can override with --ctx flag in tq_run. */
     if (c->max_seq_len > 4096) c->max_seq_len = 4096;

-    /* Compute head_dim — prefer explicit key_length from metadata (Qwen3.5 has
-     * head_dim > hidden_dim/n_heads because attention expands the dimension) */
+    /* Compute head_dim — prefer explicit key_length from metadata.
+     * For Gemma 4: key_length=512 is for full attention layers,
+     * but sliding layers use 256. Detect from first layer's K tensor shape. */
     c->head_dim = tq_gguf_get_i32(gguf, GGUF_KEY("attention.key_length"), 0);
     if (c->head_dim == 0 && c->n_heads > 0) {
         c->head_dim = c->hidden_dim / c->n_heads;
     }

+    /* For hybrid sliding/full attention (Gemma 4):
+     * Override head_dim from first layer's K tensor shape (sliding layer),
+     * since sliding layers are the majority and determine KV cache layout. */
+    {
+        const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
+        if (k0 && k0->n_dims >= 2) {
+            int k_out = (int)k0->shape[1];
+            /* Try head_dim candidates: check if k_out / head_dim gives integer kv_heads */
+            /* Try from largest to smallest to prefer larger head_dim */
+            int sliding_head_dim = c->head_dim;
+            for (int hd = 512; hd >= 64; hd /= 2) {
+                if (k_out % hd == 0) {
+                    int kv = k_out / hd;
+                    if (kv >= 1 && kv <= c->n_heads && hd < c->head_dim) {
+                        sliding_head_dim = hd;
+                        break;
+                    }
+                }
+            }
+            if (sliding_head_dim != c->head_dim) {
+                fprintf(stderr, "tq_load_gguf: hybrid attention detected — "
+                        "sliding head_dim=%d (metadata: %d)\n", sliding_head_dim, c->head_dim);
+                c->head_dim = sliding_head_dim;
+            }
+            /* Infer kv_heads from K tensor shape */
+            c->n_kv_heads = k_out / c->head_dim;
+        }
+    }
+
     /* MoE configuration */
     c->num_experts = tq_gguf_get_i32(gguf, GGUF_KEY("expert_count"), 0);
     c->num_active_experts = tq_gguf_get_i32(gguf, GGUF_KEY("expert_used_count"), 0);
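
To see how the candidate scan lands on the sliding geometry, here is a standalone walk-through with the values implied by this commit (blk.0.attn_k output = 8 kv_heads * 256 = 2048, metadata key_length = 512, n_heads = 16 per the tq_transformer.c comment; the 2048 is inferred, not read from the GGUF):

    #include <stdio.h>

    int main(void) {
        int k_out = 2048, meta_head_dim = 512, n_heads = 16;
        int sliding_head_dim = meta_head_dim;
        for (int hd = 512; hd >= 64; hd /= 2) {
            if (k_out % hd == 0) {
                int kv = k_out / hd;
                /* hd=512 gives kv=4 but fails hd < meta_head_dim; hd=256 gives kv=8 */
                if (kv >= 1 && kv <= n_heads && hd < meta_head_dim) {
                    sliding_head_dim = hd;
                    break;
                }
            }
        }
        printf("sliding head_dim=%d, kv_heads=%d\n",
               sliding_head_dim, k_out / sliding_head_dim); /* prints 256, 8 */
        return 0;
    }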
@@ -2873,11 +2908,15 @@ tq_model_t* tq_load_gguf(const char* path) {
                 c->expert_intermediate_dim, c->has_shared_expert);
     }

-    /* Model type detection */
-    if (c->is_moe) {
-        c->model_type = 2; /* qwen2moe / qwen3.5 moe */
+    /* Model type detection — Gemma takes priority (Gemma 4 is both Gemma AND MoE) */
+    if (strstr(gguf->arch, "gemma") != NULL) {
+        c->model_type = 1; /* gemma family */
+        c->n_norms_per_block = 4;
+        fprintf(stderr, "tq_load_gguf: Gemma family detected (sliding_window=%d)\n", c->sliding_window);
+    } else if (c->is_moe) {
+        c->model_type = 2; /* qwen moe */
     } else {
-        c->model_type = 0; /* default qwen35 */
+        c->model_type = 0; /* qwen35 */
     }

     fprintf(stderr, "tq_load_gguf: config — layers=%d, dim=%d, heads=%d/%d, head_dim=%d, vocab=%d\n",
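
One line worth flagging: n_norms_per_block = 4. The diff does not explain it, but Gemma-family blocks (as in Gemma 2/3) normalize before and after both sublayers, so four RMSNorm weights per block is the expected count. A hypothetical layout, with illustrative field names rather than the engine's actual struct:

    /* Hypothetical: four per-block RMSNorm weights, matching the Gemma 2/3
     * convention of pre- and post-norms around attention and the MLP. */
    typedef struct {
        const float* attn_pre_norm;   /* before self-attention */
        const float* attn_post_norm;  /* after self-attention  */
        const float* ffn_pre_norm;    /* before the MLP        */
        const float* ffn_post_norm;   /* after the MLP         */
    } gemma_block_norms_t;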
@@ -3206,6 +3245,39 @@ tq_model_t* tq_load_gguf(const char* path) {
                 n_attn_layers, c->n_layers);
     }

+    /* Set up layer_is_sliding for Gemma hybrid attention.
+     * Detect from Q tensor shape: sliding layers have smaller Q output dim. */
+    if (c->sliding_window > 0 && c->model_type == 1) {
+        model->layer_is_sliding = (int*)calloc((size_t)c->n_layers, sizeof(int));
+        if (model->layer_is_sliding) {
+            /* Find the smallest Q output dim (sliding) */
+            int min_q = 999999;
+            for (int l = 0; l < c->n_layers; l++) {
+                char tname[128];
+                snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l);
+                const tq_gguf_tensor_t* qt = tq_gguf_find_tensor(gguf, tname);
+                if (qt && (int)qt->shape[1] < min_q) min_q = (int)qt->shape[1];
+            }
+            int n_sliding = 0, n_full = 0;
+            for (int l = 0; l < c->n_layers; l++) {
+                char tname[128];
+                snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l);
+                const tq_gguf_tensor_t* qt = tq_gguf_find_tensor(gguf, tname);
+                if (qt && (int)qt->shape[1] == min_q) {
+                    model->layer_is_sliding[l] = 1;
+                    n_sliding++;
+                } else {
+                    model->layer_is_sliding[l] = 0;
+                    n_full++;
+                }
+            }
+            if (n_full > 0) {
+                fprintf(stderr, "tq_load_gguf: Gemma hybrid — %d sliding + %d full attention layers\n",
+                        n_sliding, n_full);
+            }
+        }
+    }
+
     /* Load embedding + output weights */
     const tq_gguf_tensor_t* emb_t = find_gguf_tensor(gguf, "token_embd.weight");
     if (emb_t) {
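
To sanity-check the min-Q classification, here is a standalone walk-through under the shapes implied elsewhere in this commit (the tq_transformer.c comment gives full-layer Q output as 16 * 512 = 8192; sliding layers at head_dim=256 with the same 16 heads would give 4096; both numbers are assumptions, not read from the GGUF):

    #include <stdio.h>

    int main(void) {
        int q_out[30], min_q = 999999;
        for (int l = 0; l < 30; l++)
            q_out[l] = (l % 6 == 5) ? 8192 : 4096; /* full layers 5,11,17,23,29 */
        for (int l = 0; l < 30; l++)
            if (q_out[l] < min_q) min_q = q_out[l];
        int n_sliding = 0, n_full = 0;
        for (int l = 0; l < 30; l++) {
            if (q_out[l] == min_q) n_sliding++;
            else                   n_full++;
        }
        printf("%d sliding + %d full\n", n_sliding, n_full); /* prints: 25 sliding + 5 full */
        return 0;
    }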

src/engine/tq_transformer.c

Lines changed: 26 additions & 1 deletion

@@ -860,9 +860,34 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     int head_dim = c->head_dim;
     int n_heads = c->n_heads;
     int n_kv_heads = c->n_kv_heads;
+
+    /* Gemma 4 hybrid: full attention layers have different head_dim and kv_heads.
+     * Detect from GGUF weight shapes: if Q output > n_heads * head_dim, it's a full layer. */
+    if (model->layer_is_sliding && !model->layer_is_sliding[l] && layer->gguf_wq) {
+        /* Full attention layer: infer head_dim from Q tensor.
+         * Q shape = [hidden_dim, n_heads * full_head_dim * (1 + gate)] */
+        int q_out = 0;
+        /* Get Q output dim from GGUF tensor — stored at load time in gguf_wq_type's neighbor.
+         * Simpler: compute from expected: global_head_dim = metadata key_length */
+        int global_head_dim = tq_gguf_get_i32((const tq_gguf_ctx_t*)model->gguf_ctx,
+                                              "gemma4.attention.key_length", head_dim);
+        if (global_head_dim > head_dim) {
+            head_dim = global_head_dim;
+            /* For full layers, kv_heads is typically smaller */
+            /* K shape for full: [dim, kv_heads_full * global_head_dim]
+             * We know K_out from sliding kv_dim * (global/sliding) ratio... or just compute:
+             * Total Q = n_heads * global_head_dim = 16 * 512 = 8192
+             * Total K = ? from tensor. For now, infer: */
+            n_kv_heads = c->n_kv_heads * c->head_dim / global_head_dim;
+            if (n_kv_heads < 1) n_kv_heads = 1;
+        }
+    }
+
     int kv_dim = n_kv_heads * head_dim;
     int kv_mul = n_heads / n_kv_heads;
-    size_t kv_layer_stride = (size_t)c->max_seq_len * kv_dim;
+    /* KV cache stride uses the global (sliding) config for uniform allocation */
+    int cache_kv_dim = c->n_kv_heads * c->head_dim;
+    size_t kv_layer_stride = (size_t)c->max_seq_len * cache_kv_dim;

     /* Pre-quantize activation to Q8 once for all Q2/Q4 projections in this layer.
      * This eliminates redundant tq_quantize_row_q8 + malloc/free in each matmul call. */
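
The fallback at the heart of this hunk preserves total KV width (kv_heads * head_dim stays constant), which is exactly what the commit message flags as incomplete: with the sliding config kv_heads=8, head_dim=256 and full head_dim=512, the heuristic yields 8 * 256 / 512 = 4 KV heads, while the message says full layers actually need 2. A standalone check of that arithmetic:

    #include <stdio.h>

    int main(void) {
        int sliding_kv_heads = 8, sliding_head_dim = 256, full_head_dim = 512;
        /* Same formula as n_kv_heads = c->n_kv_heads * c->head_dim / global_head_dim */
        int inferred = sliding_kv_heads * sliding_head_dim / full_head_dim;
        printf("inferred kv_heads=%d, needed per commit message=%d\n", inferred, 2);
        return 0; /* 4 vs 2: K/V on full layers get the wrong geometry */
    }

This is presumably why tokens repeat: the five full-attention layers read K/V with the wrong layout.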
