
Commit 4ce525b

unamedkr and claude committed
fix(gemma4): proportional RoPE + layer_output_scale correction
R8: Proportional RoPE for full attention layers
- GGUF rope.dimension_count=512 is the full head_dim, NOT the RoPE dim
- Gemma 4 uses partial_rotary_factor=0.25 for full layers
- Actual RoPE dims = full_head_dim * 0.25 = 128 (not 512)
- Adjusted rope_n_dims_full accordingly

R10: layer_output_scale — simple multiply (llama.cpp reference)
- Previous: x = residual + los * (x - residual) — separated residual
- Correct (llama.cpp gemma4-iswa.cpp): x *= los — simple elementwise
- Added TQ_MAX_LAYERS debug env for per-layer diagnosis

Still produces garbage — remaining candidates:
- Residual connection order (pre-norm vs post-norm flow)
- PLE gating uses gelu, not silu (llama.cpp confirms LLM_FFN_GELU); see the gating sketch after this message
- output_gguf Q5_0 matmul accuracy for 262K vocab

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
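A minimal standalone sketch of the SiLU-vs-GELU gating difference flagged above as a remaining candidate. The function names are illustrative, not quant.h symbols, and the tanh form of GELU is an assumption (llama.cpp's LLM_FFN_GELU uses an equivalent formulation):

#include <math.h>

/* tanh approximation of GELU (assumed form) */
static float gelu_tanh(float x) {
    return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
}

/* SiLU, the activation the commit message says the code currently applies */
static float silu(float x) {
    return x / (1.0f + expf(-x));
}

/* Gated FFN activation: out[i] = act(gate[i]) * up[i].
 * Swapping `act` from silu to gelu_tanh is the candidate fix. */
static void gated_activate(float* out, const float* gate, const float* up,
                           int n, float (*act)(float)) {
    for (int i = 0; i < n; i++)
        out[i] = act(gate[i]) * up[i];
}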
1 parent dfd0a44 commit 4ce525b

1 file changed: quant.h

Lines changed: 27 additions & 9 deletions
@@ -11597,11 +11597,23 @@ tq_model_t* tq_load_gguf(const char* path) {
     /* Gemma 4 (STEP35) detection: architecture string is "gemma4" */
     if (strstr(gguf->arch, "gemma4") != NULL) {
         c->is_gemma4 = 1;
-        /* Gemma 4: full attention layers use rope.dimension_count directly.
-         * Do NOT halve — split-source (tq_model.c) correctly keeps full=512.
-         * The /2 was a misport that caused garbage output. */
+        /* Gemma 4 proportional RoPE for full attention layers:
+         * HuggingFace config has partial_rotary_factor=0.25 for full layers.
+         * GGUF rope.dimension_count=512 is the full head_dim, NOT the RoPE dim.
+         * Actual RoPE dims for full layers = full_head_dim * 0.25 = 128.
+         *
+         * Sliding layers: rope.dimension_count_swa=256 = full head_dim(256) → all rotated.
+         *
+         * We adjust rope_n_dims_full to reflect the partial rotation. */
+        if (c->rope_n_dims_full > 0 && c->full_head_dim > 0) {
+            /* partial_rotary_factor = 0.25 for Gemma 4 E2B/E4B */
+            int partial_rope = c->full_head_dim / 4; /* 512/4 = 128 */
+            fprintf(stderr, "tq_load_gguf: Gemma4 p-RoPE — full layer RoPE dims %d -> %d "
+                            "(partial_rotary_factor=0.25)\n", c->rope_n_dims_full, partial_rope);
+            c->rope_n_dims_full = partial_rope;
+        }
         fprintf(stderr, "tq_load_gguf: Gemma4 — RoPE dims swa=%d full=%d, "
-                        "SiLU FFN, rope_freqs for full layers only\n",
+                        "GeGLU FFN, rope_freqs for full layers only\n",
                         c->rope_n_dims, c->rope_n_dims_full);
     }
     fprintf(stderr, "tq_load_gguf: Gemma family detected (sliding_window=%d)\n", c->sliding_window);
@@ -15321,7 +15333,12 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
     /* Step 2: Transformer layers */
     int is_gemma3 = (c->model_type == 1);
 
-    for (int l = 0; l < c->n_layers; l++) {
+    /* Debug: limit number of layers for diagnosing per-layer issues */
+    int max_layers = c->n_layers;
+    { const char* ml = getenv("TQ_MAX_LAYERS");
+      if (ml) { int v = atoi(ml); if (v > 0 && v < max_layers) max_layers = v; } }
+
+    for (int l = 0; l < max_layers; l++) {
         tq_layer_weights_t* layer = &model->layers[l];
 
         /* Save input residual for layer_output_scale (Gemma 4).
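The TQ_MAX_LAYERS override above is inlined in tq_forward; the same clamp could be factored into a small helper if it needs reuse. A hypothetical sketch, not part of quant.h:

#include <stdlib.h>

/* Return the smaller of `dflt` and a positive integer read from env var `name`.
 * Mirrors the inline TQ_MAX_LAYERS clamp in tq_forward. */
static int env_layer_limit(const char* name, int dflt) {
    const char* v = getenv(name);
    if (!v) return dflt;
    int n = atoi(v);
    return (n > 0 && n < dflt) ? n : dflt;
}

/* e.g.  int max_layers = env_layer_limit("TQ_MAX_LAYERS", c->n_layers); */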
@@ -15584,11 +15601,12 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
             tq_add(s->x, s->x, ple_proj_out, dim);
         }
 
-        /* Gemma 4: layer_output_scale scales the layer's CONTRIBUTIONS (attn + ffn).
-         * Essential for controlling gradient flow — model was trained with these scales. */
+        /* Gemma 4: layer_output_scale — simple multiplication of entire output.
+         * llama.cpp reference (gemma4-iswa.cpp): cur = ggml_mul(cur, out_scale)
+         * Previous implementation incorrectly separated residual contribution.
+         * The correct approach is a straight elementwise multiply. */
         if (layer->layer_output_scale != 0.0f) {
             float los = layer->layer_output_scale;
-            /* Debug: print pre-scale values */
             if (pos == 0 && getenv("TQ_DEBUG") && l < 3) {
                 float maxv = 0, minv = 0;
                 for (int i = 0; i < dim; i++) {
@@ -15598,7 +15616,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
                 fprintf(stderr, "[DEBUG] layer%d pre_scale min=%.3f max=%.3f (los=%.4f)\n", l, minv, maxv, los);
             }
             for (int i = 0; i < dim; i++) {
-                s->x[i] = layer_residual_buf[i] + los * (s->x[i] - layer_residual_buf[i]);
+                s->x[i] *= los;
             }
         }
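Isolated from the surrounding layer code, the two behaviours this hunk swaps look like the following minimal sketch (`res` stands in for layer_residual_buf; function names are illustrative, not quant.h symbols):

#include <stddef.h>

/* Previous behaviour: scale only the layer's contribution on top of the residual. */
static void scale_contribution(float* x, const float* res, size_t n, float los) {
    for (size_t i = 0; i < n; i++)
        x[i] = res[i] + los * (x[i] - res[i]);
}

/* New behaviour, matching the llama.cpp reference (cur = ggml_mul(cur, out_scale)):
 * multiply the entire post-residual output elementwise. */
static void scale_output(float* x, size_t n, float los) {
    for (size_t i = 0; i < n; i++)
        x[i] *= los;
}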
