@@ -1005,7 +1005,6 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     if (c->partial_rotary_factor > 0.0f && c->partial_rotary_factor < 1.0f) {
         /* Partial RoPE: only apply to first partial_rotary_factor * head_dim dims */
         int rope_dim = (int)(c->partial_rotary_factor * head_dim);
-        /* Apply RoPE only to the first rope_dim dimensions of each head */
         for (int h = 0; h < n_heads; h++) {
             float* qh = s->q + h * head_dim;
             for (int i = 0; i < rope_dim / 2; i++) {
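A quick worked example of the partial-RoPE split performed above (the numbers are illustrative assumptions, not taken from any particular model config): with head_dim = 128 and partial_rotary_factor = 0.25, only the first 32 dimensions (16 cos/sin pairs) of each head are rotated and the remaining 96 pass through unchanged.

/* Standalone sketch of the partial-RoPE dimension split (illustrative values). */
#include <stdio.h>

int main(void) {
    int head_dim = 128;                   /* hypothetical head size */
    float partial_rotary_factor = 0.25f;  /* hypothetical fraction */
    int rope_dim = (int)(partial_rotary_factor * head_dim);
    printf("rotate pairs 0..%d, leave dims %d..%d untouched\n",
           rope_dim / 2 - 1, rope_dim, head_dim - 1);
    return 0;
}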
@@ -1032,28 +1031,68 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
                 kh[2 * i + 1] = k0 * sin_t + k1 * cos_t;
             }
         }
-    } else if (model->rope_freqs && model->rope_freqs_len > 0) {
-        /* Learned RoPE frequencies (Gemma 4): use pre-computed inv_freq values.
-         * rope_freqs has full_head_dim/2 entries (e.g., 256 for head_dim=512).
-         * For sliding layers (head_dim=256), use the first 128 entries.
-         * For full layers (head_dim=512), use all 256 entries. */
-        int rope_pairs = head_dim / 2;
+    } else if (model->rope_freqs && model->rope_freqs_len > 0 &&
+               !(c->is_gemma4 && model->layer_is_sliding && model->layer_is_sliding[l])) {
+        /* Learned RoPE frequency factors (Gemma 4 / STEP35).
+         * Only used for FULL (global) attention layers. Sliding (SWA) layers
+         * use standard RoPE without freq_factors (matching llama.cpp STEP35).
+         *
+         * rope_freqs[i] is a frequency FACTOR (divisor) on the base frequency.
+         *     theta[i] = pos * pow(base, -2*i/n_dims) / rope_freqs[i]
+         * where n_dims is the RoPE dimension count (NOT head_dim for full layers).
+         *
+         * For Gemma 4: n_dims = 256 for both sliding (head_dim=256) and full
+         * (head_dim=512) layers. This is because rope.dimension_count=512 gets
+         * halved for STEP35 (n_rot_full = 512/2 = 256), and
+         * rope.dimension_count_swa=256 for sliding layers.
+         *
+         * rope_freqs has up to full_head_dim/2 entries (256 for head_dim=512).
+         * For sliding layers (head_dim=256), use the first head_dim/2 entries.
+         * For full layers, n_dims < head_dim, so pairs beyond n_dims/2 are not
+         * rotated (left as-is). The freq_factors handle partial rotation within
+         * the rotated range (1.0 = rotate, 1e30 = effectively no rotation). */
+        float rope_base = c->rope_freq_base;
+        if (c->model_type == 1 && c->rope_local_base_freq > 0.0f &&
+            model->layer_is_sliding && model->layer_is_sliding[l]) {
+            rope_base = c->rope_local_base_freq;
+        }
+
+        /* Determine RoPE n_dims for this layer type */
+        int is_full_layer = (model->layer_is_sliding && !model->layer_is_sliding[l] &&
+                             c->full_head_dim > 0);
+        int rope_n_dims;
+        if (is_full_layer && c->rope_n_dims_full > 0) {
+            rope_n_dims = c->rope_n_dims_full;
+        } else if (c->rope_n_dims > 0) {
+            rope_n_dims = c->rope_n_dims;
+        } else {
+            rope_n_dims = head_dim; /* fallback */
+        }
+        int rope_pairs = rope_n_dims / 2; /* pairs that get RoPE treatment */
+        if (rope_pairs > model->rope_freqs_len)
+            rope_pairs = model->rope_freqs_len;
+
         for (int h = 0; h < n_heads; h++) {
             float* qh = s->q + h * head_dim;
-            for (int i = 0; i < rope_pairs && i < model->rope_freqs_len; i++) {
-                float theta = pos * model->rope_freqs[i];
+            for (int i = 0; i < rope_pairs; i++) {
+                float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)rope_n_dims);
+                float freq = base_freq / model->rope_freqs[i];
+                float theta = pos * freq;
                 float cos_t = cosf(theta);
                 float sin_t = sinf(theta);
                 float q0 = qh[2 * i];
                 float q1 = qh[2 * i + 1];
                 qh[2 * i] = q0 * cos_t - q1 * sin_t;
                 qh[2 * i + 1] = q0 * sin_t + q1 * cos_t;
             }
+            /* Pairs beyond rope_pairs are left unrotated (pass-through) */
         }
         for (int h = 0; h < n_kv_heads; h++) {
             float* kh = s->k + h * head_dim;
-            for (int i = 0; i < rope_pairs && i < model->rope_freqs_len; i++) {
-                float theta = pos * model->rope_freqs[i];
+            for (int i = 0; i < rope_pairs; i++) {
+                float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)rope_n_dims);
+                float freq = base_freq / model->rope_freqs[i];
+                float theta = pos * freq;
                 float cos_t = cosf(theta);
                 float sin_t = sinf(theta);
                 float k0 = kh[2 * i];
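For reference, a minimal standalone sketch of the theta computation introduced above, showing how a freq factor of 1.0 rotates normally while 1e30 effectively suppresses rotation. The rope base, n_dims, position, and factor values here are assumptions for illustration only (compile with -lm):

/* Sketch of theta[i] = pos * base^(-2*i/n_dims) / freq_factor[i] with toy values. */
#include <math.h>
#include <stdio.h>

int main(void) {
    float rope_base = 1000000.0f;        /* assumed base frequency */
    int   rope_n_dims = 256;             /* assumed RoPE dimension count */
    int   pos = 42;                      /* assumed token position */
    float factors[2] = { 1.0f, 1e30f };  /* 1.0 = rotate, 1e30 ~ no rotation */
    for (int i = 0; i < 2; i++) {
        float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)rope_n_dims);
        float theta = pos * base_freq / factors[i];
        printf("pair %d: theta = %g (factor %g)\n", i, theta, factors[i]);
    }
    return 0;
}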
@@ -1481,12 +1520,23 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
             }
         }

-        /* Attention logit soft-capping (Gemma 2/3/4): cap * tanh(score / cap) */
+        /* Attention logit soft-capping (Gemma 2/3/4): cap * tanh(score / cap)
+         * Important: softcap applies to RAW (unscaled) scores. The 1/sqrt(d)
+         * scaling must be applied AFTER softcap, before softmax.
+         * This matches llama.cpp's approach: softcap(Q*K^T) * scale → softmax.
+         *
+         * When softcap is disabled, scores already have scale applied inline
+         * (score * inv_scale), so no extra work needed. */
         if (c->attn_logit_softcap > 0.0f) {
             float cap = c->attn_logit_softcap;
             float inv_cap = 1.0f / cap;
+            float inv_scale = 1.0f / sqrtf(attn_scale_dim);
             for (int t = attn_start; t < seq_len; t++) {
-                atth[t] = cap * tanhf(atth[t] * inv_cap);
+                /* atth[t] currently has score * inv_scale (scaled).
+                 * Undo the scale, apply softcap, then re-apply scale. */
+                float raw = atth[t] / inv_scale; /* undo: raw score */
+                float capped = cap * tanhf(raw * inv_cap);
+                atth[t] = capped * inv_scale;
             }
         }

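A small numerical check of the ordering argument in that hunk: soft-capping the raw score and then scaling gives the same result as un-scaling an already-scaled score, capping, and re-scaling. The cap, head dimension, and score below are illustrative assumptions:

/* Sketch verifying cap*tanh(raw/cap)*scale == re-derived value from a pre-scaled score. */
#include <math.h>
#include <stdio.h>

int main(void) {
    float cap = 50.0f, d = 256.0f;     /* assumed softcap and head dim */
    float raw = 37.5f;                 /* assumed raw Q*K^T score */
    float inv_scale = 1.0f / sqrtf(d);
    float direct = cap * tanhf(raw / cap) * inv_scale;
    float scaled = raw * inv_scale;    /* what atth[t] holds inline */
    float redone = cap * tanhf((scaled / inv_scale) / cap) * inv_scale;
    printf("direct=%.6f redone=%.6f\n", direct, redone);
    return 0;
}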
@@ -1774,6 +1824,17 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     tq_matmul(s->xb2, s->xb, layer->wo, dim, n_heads * head_dim);
     TQ_PROF_STOP(_tp, matmul_ns);

+    /* Debug: print attention output before residual add */
+    if (pos == 0 && getenv("TQ_DEBUG") && l < 3) {
+        float maxv = 0, minv = 0;
+        for (int i = 0; i < dim; i++) {
+            if (s->xb2[i] > maxv) maxv = s->xb2[i];
+            if (s->xb2[i] < minv) minv = s->xb2[i];
+        }
+        fprintf(stderr, "[DEBUG] layer%d attn_out min=%.3f max=%.3f (hd=%d, nh=%d, nkv=%d)\n",
+                l, minv, maxv, head_dim, n_heads, n_kv_heads);
+    }
+
     /* Residual */
     tq_add(s->x, s->x, s->xb2, dim);
 }
@@ -1962,7 +2023,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
                            s->xb, s->xb2, dim, l);
         TQ_PROF_STOP(_tp, moe_ns);

-        /* Gemma 4: MoE output uses post_ffw_norm_1, else fallback to post_ffn_norm */
+        /* Gemma: MoE output uses post_ffw_norm if present. */
         if (is_gemma3) {
             float* moe_post_norm = layer->post_ffn_norm_1 ? layer->post_ffn_norm_1 : layer->post_ffn_norm;
             if (moe_post_norm)
@@ -1972,12 +2033,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
             tq_add(s->x, s->x, s->xb2, dim);
             did_moe = 1;
         }
-        /* Dense FFN path — SwiGLU (Qwen3.5) or GeGLU (Gemma3).
-         * For Gemma 4: runs BOTH MoE AND dense FFN (shared expert) per layer.
-         * Optimization: cache Q8 quantization of xb for gate+up projections,
-         * and cache Q8 of hb for down projection. */
-        /* Dense FFN: run for non-MoE layers, or for Gemma 4 MoE layers that also have dense FFN */
-        if ((!did_moe || (is_gemma3 && did_moe)) &&
+        /* Dense FFN path — SwiGLU (Qwen3.5, Gemma4/STEP35) or GeGLU (Gemma3).
+         * For Gemma 4 STEP35: layers are either MoE or dense, NOT both.
+         * For Gemma 3: runs both MoE and dense FFN (shared expert) per layer. */
+        /* Dense FFN: run for non-MoE layers, or for Gemma 3 MoE layers with dense FFN */
+        if ((!did_moe || (is_gemma3 && !c->is_gemma4 && did_moe)) &&
             (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate) &&
             (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up) &&
             (layer->w_down || layer->w_down_q8 || layer->w_down_q4 || layer->w_down_q2 || layer->gguf_w_down)) {
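The gating condition above can be read as a small truth table. The sketch below restates only the did_moe/model-family part (the weight-pointer checks are omitted), and it assumes, as the surrounding code suggests, that is_gemma3 acts as a Gemma-family flag that is also set for Gemma 4 while c->is_gemma4 distinguishes the two:

/* Condensed restatement of the dense-FFN routing above (assumption-labeled sketch). */
#include <stdio.h>

static int run_dense_ffn(int did_moe, int is_gemma3, int is_gemma4) {
    return !did_moe || (is_gemma3 && !is_gemma4 && did_moe);
}

int main(void) {
    printf("plain dense layer:  %d\n", run_dense_ffn(0, 0, 0)); /* 1: dense FFN runs */
    printf("Gemma 3 MoE layer:  %d\n", run_dense_ffn(1, 1, 0)); /* 1: shared dense expert too */
    printf("Gemma 4 MoE layer:  %d\n", run_dense_ffn(1, 1, 1)); /* 0: MoE only, never both */
    return 0;
}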
@@ -2047,7 +2107,10 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {

             TQ_PROF_STOP(_tp, matmul_ns);

-            /* Activation: GeGLU for Gemma3, SwiGLU for others */
+            /* Activation: GeGLU for Gemma3/4, SwiGLU for others.
+             * Note: Gemma 4 (STEP35) uses GeGLU (gated GELU), same as Gemma 3.
+             * The llama.cpp STEP35 code uses LLM_FFN_SILU which might be incorrect
+             * for the E2B model. The HuggingFace Gemma4 config uses gelu_pytorch_tanh. */
             if (is_gemma3) {
                 tq_gelu_tanh(s->hb, inter);
             } else {
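For clarity on the two activations chosen in that branch, here is a scalar sketch of the standard formulas: GeGLU applies the tanh-approximated GELU to the gate projection, SwiGLU applies SiLU, and both are then multiplied element-wise by the up projection. The gate/up values are illustrative, and the helpers are standalone (not the project's tq_gelu_tanh/silu kernels):

/* Scalar GeGLU vs SwiGLU comparison with toy inputs (compile with -lm). */
#include <math.h>
#include <stdio.h>

static float gelu_tanh(float x) {
    return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
}
static float silu(float x) {
    return x / (1.0f + expf(-x));
}

int main(void) {
    float gate = 1.3f, up = 0.8f;  /* illustrative pre-activation values */
    printf("GeGLU:  %.6f\n", gelu_tanh(gate) * up);
    printf("SwiGLU: %.6f\n", silu(gate) * up);
    return 0;
}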
@@ -2069,7 +2132,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
                 }
             }
             TQ_PROF_STOP(_tp, matmul_ns);
-            /* Gemma: apply post-FFN norm. For dual-FFN, use post_ffw_norm_2 for dense. */
+            /* Gemma: apply post-FFN norm if present. */
             if (is_gemma3) {
                 float* dense_post_norm = NULL;
                 if (did_moe && layer->post_ffn_norm_2)
@@ -2128,21 +2191,35 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
             tq_add(s->x, s->x, ple_proj_out, dim);
         }

-        /* Gemma 4: layer_output_scale scales the layer's CONTRIBUTIONS (attn + ffn + ple),
-         * not the entire hidden state. Formula:
-         *     x_new = x_old + scale * (x_current - x_old) */
+        /* Gemma 4: layer_output_scale scales the layer's CONTRIBUTIONS (attn + ffn).
+         * Essential for controlling gradient flow — model was trained with these scales. */
         if (layer->layer_output_scale != 0.0f) {
             float los = layer->layer_output_scale;
+            /* Debug: print pre-scale values */
+            if (pos == 0 && getenv("TQ_DEBUG") && l < 3) {
+                float maxv = 0, minv = 0;
+                for (int i = 0; i < dim; i++) {
+                    if (s->x[i] > maxv) maxv = s->x[i];
+                    if (s->x[i] < minv) minv = s->x[i];
+                }
+                fprintf(stderr, "[DEBUG] layer%d pre_scale min=%.3f max=%.3f (los=%.4f)\n", l, minv, maxv, los);
+            }
             for (int i = 0; i < dim; i++) {
                 s->x[i] = layer_residual_buf[i] + los * (s->x[i] - layer_residual_buf[i]);
             }
         }

         /* Debug: print layer output */
-        if (pos == 0 && getenv("TQ_DEBUG") && (l == 0 || l == 5 || l == c->n_layers - 1)) {
-            fprintf(stderr, "[DEBUG] layer%d out[0:8] = ", l);
-            for (int i = 0; i < 8 && i < dim; i++) fprintf(stderr, "%.4f ", s->x[i]);
-            fprintf(stderr, "\n");
+        if (pos == 0 && getenv("TQ_DEBUG")) {
+            if (l < 10 || l == c->n_layers - 1 || getenv("TQ_DEBUG_ALL")) {
+                float maxv = 0, minv = 0;
+                for (int i = 0; i < dim; i++) {
+                    if (s->x[i] > maxv) maxv = s->x[i];
+                    if (s->x[i] < minv) minv = s->x[i];
+                }
+                fprintf(stderr, "[DEBUG] layer%d out[0:4]=%.3f,%.3f,%.3f,%.3f min=%.3f max=%.3f los=%.4f\n",
+                        l, s->x[0], s->x[1], s->x[2], s->x[3], minv, maxv, layer->layer_output_scale);
+            }
         }
     }

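One worked example of the layer_output_scale interpolation in that hunk: because the current hidden state is residual + contribution, the update x_new = residual + los * (x - residual) scales only the layer's own contribution while leaving the carried-over residual intact. The values below are illustrative assumptions:

/* Sketch of the layer_output_scale interpolation with toy numbers. */
#include <stdio.h>

int main(void) {
    float residual = 2.0f, contribution = 0.5f, los = 0.125f; /* assumed values */
    float x     = residual + contribution;                    /* state after attn + ffn */
    float x_new = residual + los * (x - residual);            /* scales only the contribution */
    printf("x_new = %.4f (residual %.4f + scaled contribution %.4f)\n",
           x_new, residual, los * contribution);
    return 0;
}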