
Commit f6f513b

unamedkr and claude committed
fix(gemma4): p-RoPE timing fix — apply AFTER hybrid detection
Critical bug: the proportional RoPE adjustment for full attention layers
(512 -> 128 dims) was placed BEFORE hybrid attention detection, so
c->full_head_dim was still 0 at that point → the adjustment never ran.
Moved the p-RoPE adjustment to after c->full_head_dim is set (~line 12240).
It now correctly logs: "Gemma4 p-RoPE — full layer RoPE dims 512 -> 128".

Also confirmed: the previous "server crashes" were actually curl timeouts
(262K vocab + FP32 weights = very slow lm_head matmul on CPU).

Status: Gemma 4 E2B still produces garbage with all fixes applied:
- RoPE dims: swa=256, full=128 (p-RoPE) ✅
- Attention softcap: disabled for Gemma 4 ✅
- layer_output_scale: simple multiply ✅
- Chat template: Gemma format ✅
- KV sharing: framework ready (off by default) ✅

Remaining hypotheses: residual connection order, sliding window masking, or
a weight loading issue in the attention projection matrices.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4ce525b commit f6f513b
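
To make the ordering issue concrete, here is a minimal standalone sketch. The field names come from the diff below, but the struct, the values, and the apply_prope helper are illustrative only, not quant.h's real code. It shows why the guard silently no-ops when it runs before hybrid attention detection has set full_head_dim:

#include <stdio.h>

/* Minimal repro of the ordering bug (field names from the diff; values
 * assumed from the commit message). The guard itself is fine; the bug
 * was purely about when it was evaluated. */
typedef struct { int is_gemma4, full_head_dim, rope_n_dims_full; } cfg_t;

static void apply_prope(cfg_t* c) {
    if (c->is_gemma4 && c->rope_n_dims_full > 0 && c->full_head_dim > 0)
        c->rope_n_dims_full = c->full_head_dim / 4;   /* 512/4 = 128 */
}

int main(void) {
    cfg_t c = { .is_gemma4 = 1, .full_head_dim = 0, .rope_n_dims_full = 512 };

    apply_prope(&c);                        /* BEFORE hybrid detection */
    printf("%d\n", c.rope_n_dims_full);     /* still 512: guard failed, adjustment skipped */

    c.full_head_dim = 512;                  /* hybrid attention detection runs */
    apply_prope(&c);                        /* AFTER hybrid detection */
    printf("%d\n", c.rope_n_dims_full);     /* 128: correct p-RoPE dims */
    return 0;
}

The fix in the diff below changes only the call order; the adjustment logic is unchanged.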

1 file changed: quant.h (14 additions, 18 deletions)
@@ -11597,24 +11597,9 @@ tq_model_t* tq_load_gguf(const char* path) {
     /* Gemma 4 (STEP35) detection: architecture string is "gemma4" */
     if (strstr(gguf->arch, "gemma4") != NULL) {
       c->is_gemma4 = 1;
-      /* Gemma 4 proportional RoPE for full attention layers:
-       * HuggingFace config has partial_rotary_factor=0.25 for full layers.
-       * GGUF rope.dimension_count=512 is the full head_dim, NOT the RoPE dim.
-       * Actual RoPE dims for full layers = full_head_dim * 0.25 = 128.
-       *
-       * Sliding layers: rope.dimension_count_swa=256 = full head_dim(256) → all rotated.
-       *
-       * We adjust rope_n_dims_full to reflect the partial rotation. */
-      if (c->rope_n_dims_full > 0 && c->full_head_dim > 0) {
-        /* partial_rotary_factor = 0.25 for Gemma 4 E2B/E4B */
-        int partial_rope = c->full_head_dim / 4; /* 512/4 = 128 */
-        fprintf(stderr, "tq_load_gguf: Gemma4 p-RoPE — full layer RoPE dims %d -> %d "
-                "(partial_rotary_factor=0.25)\n", c->rope_n_dims_full, partial_rope);
-        c->rope_n_dims_full = partial_rope;
-      }
-      fprintf(stderr, "tq_load_gguf: Gemma4 — RoPE dims swa=%d full=%d, "
-              "GeGLU FFN, rope_freqs for full layers only\n",
-              c->rope_n_dims, c->rope_n_dims_full);
+      /* Gemma 4 proportional RoPE: deferred to after hybrid attention
+       * detection sets full_head_dim (see below, ~line 12238). */
+      fprintf(stderr, "tq_load_gguf: Gemma4 detected (p-RoPE will be applied after hybrid detection)\n");
     }
     fprintf(stderr, "tq_load_gguf: Gemma family detected (sliding_window=%d)\n", c->sliding_window);
   } else if (c->is_moe) {

@@ -12253,6 +12238,17 @@ tq_model_t* tq_load_gguf(const char* path) {
     }
   }
 
+  /* Gemma 4 proportional RoPE: NOW apply, after hybrid detection set full_head_dim.
+   * HuggingFace config: partial_rotary_factor=0.25 for full attention layers.
+   * GGUF rope.dimension_count=512 is the full head_dim, NOT the rotated dim.
+   * Actual RoPE dims for full layers = full_head_dim / 4 = 128. */
+  if (c->is_gemma4 && c->rope_n_dims_full > 0 && c->full_head_dim > 0) {
+    int partial_rope = c->full_head_dim / 4; /* 512/4 = 128 */
+    fprintf(stderr, "tq_load_gguf: Gemma4 p-RoPE — full layer RoPE dims %d -> %d "
+            "(partial_rotary_factor=0.25)\n", c->rope_n_dims_full, partial_rope);
+    c->rope_n_dims_full = partial_rope;
+  }
+
   /* Load embedding + output weights */
   const tq_gguf_tensor_t* emb_t = find_gguf_tensor(gguf, "token_embd.weight");
   if (emb_t) {
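
For context, a sketch of what the corrected rope_n_dims_full means at inference time: for a Gemma 4 full attention layer the head is 512-dim, but only the first 128 dims get rotated and the remaining dims pass through unchanged, matching partial_rotary_factor=0.25 from the HF config. This is NOT quant.h's actual RoPE kernel; the function name, the half-split pairing convention (dim i paired with i + rope_dims/2), and theta_base are assumptions.

#include <math.h>
#include <stdio.h>

/* Illustrative partial-RoPE sketch: rotate only the first rope_dims of a
 * head_dim-sized query/key vector; dims [rope_dims, head_dim) are untouched. */
static void rope_apply_partial(float* v, int head_dim, int rope_dims,
                               int pos, float theta_base) {
    int half = rope_dims / 2;
    for (int i = 0; i < half; i++) {
        float freq  = powf(theta_base, -2.0f * (float)i / (float)rope_dims);
        float angle = (float)pos * freq;
        float c = cosf(angle), s = sinf(angle);
        float x0 = v[i], x1 = v[i + half];
        v[i]        = x0 * c - x1 * s;
        v[i + half] = x0 * s + x1 * c;
    }
    (void)head_dim; /* higher dims are intentionally left position-agnostic */
}

int main(void) {
    float head[512];
    for (int i = 0; i < 512; i++) head[i] = 1.0f;
    rope_apply_partial(head, 512, 128, /*pos=*/7, /*theta_base=*/10000.0f);
    printf("head[0]=%f (rotated), head[300]=%f (untouched)\n", head[0], head[300]);
    return 0;
}

The design point is that with a 0.25 partial rotary factor, three quarters of each full-attention head carries no positional rotation at all, which is exactly what the 512 -> 128 adjustment in this commit encodes.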
