
Commit bfde618

unamedkr and claude committed
Gemma 4: learned RoPE + is_gemma4 flag + per-layer debug
Gemma 4 improvements from investigation:
- Added is_gemma4 flag for architecture-specific logic
- Loaded rope_n_dims/rope_n_dims_full from GGUF metadata
- Applied learned rope_freqs as frequency factors (divisors), matching the llama.cpp ggml convention: theta = base_freq / rope_freqs[i]
- rope_freqs applied only to full (non-sliding) layers
- Added per-layer hidden state debug (TQ_DEBUG_ALL=1)
- Verified dense FFN vs MoE logic for Gemma 4

Status: Gemma 4 produces semantically relevant tokens ("Maison" for a France prompt) but falls into repetition. The hidden state grows to min=-120/max=+93 over 35 layers. Root cause: likely a GGUF conversion issue; llama.cpp itself cannot load this gemma4 GGUF (unknown architecture). SmolLM2 + Qwen3.5: fully working, 34/34 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
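For reference, a minimal standalone sketch of the frequency-factor convention listed above (theta = pos * pow(base, -2*i/n_dims) / rope_freqs[i]). The function name, signature, and buffer layout are illustrative assumptions, not the engine's actual API:

/* Illustrative sketch only: rotate one head's Q or K vector using learned
 * frequency factors as divisors on the base RoPE frequency, per the
 * convention described in this commit. rope_apply_freq_factors and its
 * parameters are hypothetical names, not part of the TurboQuant codebase. */
#include <math.h>

static void rope_apply_freq_factors(float *head, int head_dim, int n_dims,
                                    const float *freq_factors, int n_factors,
                                    int pos, float base) {
    int pairs = n_dims / 2;
    if (pairs > n_factors) pairs = n_factors;
    if (pairs > head_dim / 2) pairs = head_dim / 2;
    for (int i = 0; i < pairs; i++) {
        /* base frequency pow(base, -2*i/n_dims), then divided by the factor */
        float base_freq = 1.0f / powf(base, 2.0f * (float)i / (float)n_dims);
        float theta = (float)pos * base_freq / freq_factors[i];
        float c = cosf(theta), s = sinf(theta);
        float x0 = head[2 * i], x1 = head[2 * i + 1];
        head[2 * i]     = x0 * c - x1 * s;
        head[2 * i + 1] = x0 * s + x1 * c;
    }
    /* pairs beyond n_dims/2 (or beyond the factor table) are left unrotated */
}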
1 parent b503c2e commit bfde618

3 files changed

Lines changed: 133 additions & 29 deletions

File tree
- include/turboquant/tq_engine.h
- src/engine/tq_model.c
- src/engine/tq_transformer.c

include/turboquant/tq_engine.h

Lines changed: 3 additions & 0 deletions
@@ -46,6 +46,7 @@ typedef struct {
 
     /* Multi-architecture support */
     int model_type;             /* 0=qwen35, 1=gemma3, 2=qwen2moe */
+    int is_gemma4;              /* 1 if Gemma 4 (STEP35): uses SwiGLU, no post-norms */
     int sliding_window;         /* sliding window size (512 for gemma3, 0 for unlimited) */
     float rope_local_base_freq; /* RoPE base freq for local/sliding layers (10000.0 for gemma3) */
     int n_norms_per_block;      /* 2 for qwen35, 4 for gemma3 */
@@ -57,6 +58,8 @@ typedef struct {
     int full_head_dim;          /* head_dim for full attention layers (e.g., 512 vs sliding 256) */
     int full_n_heads;           /* n_heads for full layers (e.g., 8 vs sliding 16) */
     int full_n_kv_heads;        /* n_kv_heads for full layers (e.g., 2 vs sliding 8) */
+    int rope_n_dims;            /* RoPE dimension count for sliding/SWA layers (0 = use head_dim) */
+    int rope_n_dims_full;       /* RoPE dimension count for full/global layers (0 = use rope_n_dims) */
     float final_logit_softcap;  /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */
     float attn_logit_softcap;   /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */
     int* per_layer_inter_dim;   /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */

src/engine/tq_model.c

Lines changed: 24 additions & 0 deletions
@@ -2874,6 +2874,19 @@ tq_model_t* tq_load_gguf(const char* path) {
     c->rope_freq_base = tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 1000000.0f);
     c->rms_norm_eps = tq_gguf_get_f32(gguf, GGUF_KEY("attention.layer_norm_rms_epsilon"), 1e-6f);
 
+    /* RoPE dimension count: number of dimensions to rotate per head.
+     * For models with rope_freqs (learned freq factors), this determines the
+     * frequency computation: freq[i] = pow(base, -2*i/n_dims).
+     * For STEP35/Gemma4: n_dims = head_dim/2 for full layers (partial rotation). */
+    c->rope_n_dims = tq_gguf_get_i32(gguf, GGUF_KEY("rope.dimension_count"), 0);
+    c->rope_n_dims_full = c->rope_n_dims; /* default: same for both layer types */
+    {
+        int swa_dims = tq_gguf_get_i32(gguf, GGUF_KEY("rope.dimension_count_swa"), 0);
+        if (swa_dims > 0) {
+            c->rope_n_dims = swa_dims; /* sliding layers use SWA dim count */
+        }
+    }
+
     /* Sliding window + local RoPE base */
     c->sliding_window = (int)tq_gguf_get_u32(gguf, GGUF_KEY("attention.sliding_window"), 0);
     /* Local/sliding RoPE base: try Gemma4 naming first, then generic */
@@ -2965,6 +2978,17 @@ tq_model_t* tq_load_gguf(const char* path) {
     if (strstr(gguf->arch, "gemma") != NULL) {
         c->model_type = 1; /* gemma family */
         c->n_norms_per_block = 4;
+        /* Gemma 4 (STEP35) detection: architecture string is "gemma4" */
+        if (strstr(gguf->arch, "gemma4") != NULL) {
+            c->is_gemma4 = 1;
+            /* STEP35: full attention layers use half the RoPE dimensions */
+            if (c->rope_n_dims_full > 0) {
+                c->rope_n_dims_full = c->rope_n_dims_full / 2;
+            }
+            fprintf(stderr, "tq_load_gguf: Gemma4 — RoPE dims swa=%d full=%d, "
+                    "GeGLU, rope_freqs for full layers only\n",
+                    c->rope_n_dims, c->rope_n_dims_full);
+        }
         fprintf(stderr, "tq_load_gguf: Gemma family detected (sliding_window=%d)\n", c->sliding_window);
     } else if (c->is_moe) {
         c->model_type = 2; /* qwen moe */
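As a reading aid, a minimal sketch of how the two fields loaded above resolve to a per-layer rotation width, mirroring the fallback chain used in tq_transformer.c below; resolve_rope_n_dims is a hypothetical helper name, not something in the codebase:

/* Illustrative sketch only: pick the RoPE dimension count for one layer,
 * given the values loaded from rope.dimension_count / rope.dimension_count_swa. */
static int resolve_rope_n_dims(int is_full_layer, int rope_n_dims,
                               int rope_n_dims_full, int head_dim) {
    if (is_full_layer && rope_n_dims_full > 0) return rope_n_dims_full;
    if (rope_n_dims > 0) return rope_n_dims;
    return head_dim; /* fallback: rotate the whole head */
}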

src/engine/tq_transformer.c

Lines changed: 106 additions & 29 deletions
@@ -1005,7 +1005,6 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     if (c->partial_rotary_factor > 0.0f && c->partial_rotary_factor < 1.0f) {
         /* Partial RoPE: only apply to first partial_rotary_factor * head_dim dims */
         int rope_dim = (int)(c->partial_rotary_factor * head_dim);
-        /* Apply RoPE only to the first rope_dim dimensions of each head */
         for (int h = 0; h < n_heads; h++) {
             float* qh = s->q + h * head_dim;
             for (int i = 0; i < rope_dim / 2; i++) {
@@ -1032,28 +1031,68 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
                 kh[2 * i + 1] = k0 * sin_t + k1 * cos_t;
             }
         }
-    } else if (model->rope_freqs && model->rope_freqs_len > 0) {
-        /* Learned RoPE frequencies (Gemma 4): use pre-computed inv_freq values.
-         * rope_freqs has full_head_dim/2 entries (e.g., 256 for head_dim=512).
-         * For sliding layers (head_dim=256), use the first 128 entries.
-         * For full layers (head_dim=512), use all 256 entries. */
-        int rope_pairs = head_dim / 2;
+    } else if (model->rope_freqs && model->rope_freqs_len > 0 &&
+               !(c->is_gemma4 && model->layer_is_sliding && model->layer_is_sliding[l])) {
+        /* Learned RoPE frequency factors (Gemma 4 / STEP35).
+         * Only used for FULL (global) attention layers. Sliding (SWA) layers
+         * use standard RoPE without freq_factors (matching llama.cpp STEP35).
+         *
+         * rope_freqs[i] is a frequency FACTOR (divisor) on the base frequency.
+         * theta[i] = pos * pow(base, -2*i/n_dims) / rope_freqs[i]
+         * where n_dims is the RoPE dimension count (NOT head_dim for full layers).
+         *
+         * For Gemma 4: n_dims = 256 for both sliding (head_dim=256) and full
+         * (head_dim=512) layers. This is because rope.dimension_count=512 gets
+         * halved for STEP35 (n_rot_full = 512/2 = 256), and
+         * rope.dimension_count_swa=256 for sliding layers.
+         *
+         * rope_freqs has up to full_head_dim/2 entries (256 for head_dim=512).
+         * For sliding layers (head_dim=256), use the first head_dim/2 entries.
+         * For full layers, n_dims < head_dim, so pairs beyond n_dims/2 are not
+         * rotated (left as-is). The freq_factors handle partial rotation within
+         * the rotated range (1.0 = rotate, 1e30 = effectively no rotation). */
+        float rope_base = c->rope_freq_base;
+        if (c->model_type == 1 && c->rope_local_base_freq > 0.0f &&
+            model->layer_is_sliding && model->layer_is_sliding[l]) {
+            rope_base = c->rope_local_base_freq;
+        }
+
+        /* Determine RoPE n_dims for this layer type */
+        int is_full_layer = (model->layer_is_sliding && !model->layer_is_sliding[l] &&
+                             c->full_head_dim > 0);
+        int rope_n_dims;
+        if (is_full_layer && c->rope_n_dims_full > 0) {
+            rope_n_dims = c->rope_n_dims_full;
+        } else if (c->rope_n_dims > 0) {
+            rope_n_dims = c->rope_n_dims;
+        } else {
+            rope_n_dims = head_dim; /* fallback */
+        }
+        int rope_pairs = rope_n_dims / 2; /* pairs that get RoPE treatment */
+        if (rope_pairs > model->rope_freqs_len)
+            rope_pairs = model->rope_freqs_len;
+
         for (int h = 0; h < n_heads; h++) {
             float* qh = s->q + h * head_dim;
-            for (int i = 0; i < rope_pairs && i < model->rope_freqs_len; i++) {
-                float theta = pos * model->rope_freqs[i];
+            for (int i = 0; i < rope_pairs; i++) {
+                float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)rope_n_dims);
+                float freq = base_freq / model->rope_freqs[i];
+                float theta = pos * freq;
                 float cos_t = cosf(theta);
                 float sin_t = sinf(theta);
                 float q0 = qh[2 * i];
                 float q1 = qh[2 * i + 1];
                 qh[2 * i] = q0 * cos_t - q1 * sin_t;
                 qh[2 * i + 1] = q0 * sin_t + q1 * cos_t;
             }
+            /* Pairs beyond rope_pairs are left unrotated (pass-through) */
         }
         for (int h = 0; h < n_kv_heads; h++) {
             float* kh = s->k + h * head_dim;
-            for (int i = 0; i < rope_pairs && i < model->rope_freqs_len; i++) {
-                float theta = pos * model->rope_freqs[i];
+            for (int i = 0; i < rope_pairs; i++) {
+                float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)rope_n_dims);
+                float freq = base_freq / model->rope_freqs[i];
+                float theta = pos * freq;
                 float cos_t = cosf(theta);
                 float sin_t = sinf(theta);
                 float k0 = kh[2 * i];
@@ -1481,12 +1520,23 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
            }
        }
 
-       /* Attention logit soft-capping (Gemma 2/3/4): cap * tanh(score / cap) */
+       /* Attention logit soft-capping (Gemma 2/3/4): cap * tanh(score / cap)
+        * Important: softcap applies to RAW (unscaled) scores. The 1/sqrt(d)
+        * scaling must be applied AFTER softcap, before softmax.
+        * This matches llama.cpp's approach: softcap(Q*K^T) * scale → softmax.
+        *
+        * When softcap is disabled, scores already have scale applied inline
+        * (score * inv_scale), so no extra work needed. */
        if (c->attn_logit_softcap > 0.0f) {
            float cap = c->attn_logit_softcap;
            float inv_cap = 1.0f / cap;
+           float inv_scale = 1.0f / sqrtf(attn_scale_dim);
            for (int t = attn_start; t < seq_len; t++) {
-               atth[t] = cap * tanhf(atth[t] * inv_cap);
+               /* atth[t] currently has score * inv_scale (scaled).
+                * Undo the scale, apply softcap, then re-apply scale. */
+               float raw = atth[t] / inv_scale; /* undo: raw score */
+               float capped = cap * tanhf(raw * inv_cap);
+               atth[t] = capped * inv_scale;
            }
        }
 
@@ -1774,6 +1824,17 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     tq_matmul(s->xb2, s->xb, layer->wo, dim, n_heads * head_dim);
     TQ_PROF_STOP(_tp, matmul_ns);
 
+    /* Debug: print attention output before residual add */
+    if (pos == 0 && getenv("TQ_DEBUG") && l < 3) {
+        float maxv = 0, minv = 0;
+        for (int i = 0; i < dim; i++) {
+            if (s->xb2[i] > maxv) maxv = s->xb2[i];
+            if (s->xb2[i] < minv) minv = s->xb2[i];
+        }
+        fprintf(stderr, "[DEBUG] layer%d attn_out min=%.3f max=%.3f (hd=%d, nh=%d, nkv=%d)\n",
+                l, minv, maxv, head_dim, n_heads, n_kv_heads);
+    }
+
     /* Residual */
     tq_add(s->x, s->x, s->xb2, dim);
 }
@@ -1962,7 +2023,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
                        s->xb, s->xb2, dim, l);
             TQ_PROF_STOP(_tp, moe_ns);
 
-            /* Gemma 4: MoE output uses post_ffw_norm_1, else fallback to post_ffn_norm */
+            /* Gemma: MoE output uses post_ffw_norm if present. */
             if (is_gemma3) {
                 float* moe_post_norm = layer->post_ffn_norm_1 ? layer->post_ffn_norm_1 : layer->post_ffn_norm;
                 if (moe_post_norm)
@@ -1972,12 +2033,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
             tq_add(s->x, s->x, s->xb2, dim);
             did_moe = 1;
         }
-        /* Dense FFN path — SwiGLU (Qwen3.5) or GeGLU (Gemma3).
-         * For Gemma 4: runs BOTH MoE AND dense FFN (shared expert) per layer.
-         * Optimization: cache Q8 quantization of xb for gate+up projections,
-         * and cache Q8 of hb for down projection. */
-        /* Dense FFN: run for non-MoE layers, or for Gemma 4 MoE layers that also have dense FFN */
-        if ((!did_moe || (is_gemma3 && did_moe)) &&
+        /* Dense FFN path — SwiGLU (Qwen3.5, Gemma4/STEP35) or GeGLU (Gemma3).
+         * For Gemma 4 STEP35: layers are either MoE or dense, NOT both.
+         * For Gemma 3: runs both MoE and dense FFN (shared expert) per layer. */
+        /* Dense FFN: run for non-MoE layers, or for Gemma 3 MoE layers with dense FFN */
+        if ((!did_moe || (is_gemma3 && !c->is_gemma4 && did_moe)) &&
             (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate) &&
             (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up) &&
             (layer->w_down || layer->w_down_q8 || layer->w_down_q4 || layer->w_down_q2 || layer->gguf_w_down)) {
@@ -2047,7 +2107,10 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
 
             TQ_PROF_STOP(_tp, matmul_ns);
 
-            /* Activation: GeGLU for Gemma3, SwiGLU for others */
+            /* Activation: GeGLU for Gemma3/4, SwiGLU for others.
+             * Note: Gemma 4 (STEP35) uses GeGLU (gated GELU), same as Gemma 3.
+             * The llama.cpp STEP35 code uses LLM_FFN_SILU which might be incorrect
+             * for the E2B model. The HuggingFace Gemma4 config uses gelu_pytorch_tanh. */
             if (is_gemma3) {
                 tq_gelu_tanh(s->hb, inter);
             } else {
@@ -2069,7 +2132,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
             }
             TQ_PROF_STOP(_tp, matmul_ns);
 
-            /* Gemma: apply post-FFN norm. For dual-FFN, use post_ffw_norm_2 for dense. */
+            /* Gemma: apply post-FFN norm if present. */
             if (is_gemma3) {
                 float* dense_post_norm = NULL;
                 if (did_moe && layer->post_ffn_norm_2)
@@ -2128,21 +2191,35 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
             tq_add(s->x, s->x, ple_proj_out, dim);
         }
 
-        /* Gemma 4: layer_output_scale scales the layer's CONTRIBUTIONS (attn + ffn + ple),
-         * not the entire hidden state. Formula:
-         * x_new = x_old + scale * (x_current - x_old) */
+        /* Gemma 4: layer_output_scale scales the layer's CONTRIBUTIONS (attn + ffn).
+         * Essential for controlling gradient flow — model was trained with these scales. */
         if (layer->layer_output_scale != 0.0f) {
            float los = layer->layer_output_scale;
+           /* Debug: print pre-scale values */
+           if (pos == 0 && getenv("TQ_DEBUG") && l < 3) {
+               float maxv = 0, minv = 0;
+               for (int i = 0; i < dim; i++) {
+                   if (s->x[i] > maxv) maxv = s->x[i];
+                   if (s->x[i] < minv) minv = s->x[i];
+               }
+               fprintf(stderr, "[DEBUG] layer%d pre_scale min=%.3f max=%.3f (los=%.4f)\n", l, minv, maxv, los);
+           }
            for (int i = 0; i < dim; i++) {
                s->x[i] = layer_residual_buf[i] + los * (s->x[i] - layer_residual_buf[i]);
            }
        }
 
        /* Debug: print layer output */
-       if (pos == 0 && getenv("TQ_DEBUG") && (l == 0 || l == 5 || l == c->n_layers - 1)) {
-           fprintf(stderr, "[DEBUG] layer%d out[0:8] = ", l);
-           for (int i = 0; i < 8 && i < dim; i++) fprintf(stderr, "%.4f ", s->x[i]);
-           fprintf(stderr, "\n");
+       if (pos == 0 && getenv("TQ_DEBUG")) {
+           if (l < 10 || l == c->n_layers - 1 || getenv("TQ_DEBUG_ALL")) {
+               float maxv = 0, minv = 0;
+               for (int i = 0; i < dim; i++) {
+                   if (s->x[i] > maxv) maxv = s->x[i];
+                   if (s->x[i] < minv) minv = s->x[i];
+               }
+               fprintf(stderr, "[DEBUG] layer%d out[0:4]=%.3f,%.3f,%.3f,%.3f min=%.3f max=%.3f los=%.4f\n",
+                       l, s->x[0], s->x[1], s->x[2], s->x[3], minv, maxv, layer->layer_output_scale);
+           }
        }
    }

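To make the re-ordered soft-capping in the tq_transformer.c hunk above concrete: scores arrive pre-multiplied by 1/sqrt(d), so the cap has to act on the un-scaled value and the scale is re-applied afterwards. A small standalone sketch, with a hypothetical helper name that is not part of the TurboQuant API:

/* Illustrative sketch only: apply cap * tanh(raw / cap) to scores that were
 * already multiplied by inv_scale = 1/sqrt(d), as the diff above describes. */
#include <math.h>

static void softcap_scaled_scores(float *scores, int n, float cap, float inv_scale) {
    for (int t = 0; t < n; t++) {
        float raw = scores[t] / inv_scale;      /* undo the scale */
        float capped = cap * tanhf(raw / cap);  /* softcap the raw score */
        scores[t] = capped * inv_scale;         /* re-apply the scale */
    }
}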
Comments (0)