@@ -1926,15 +1926,22 @@ static size_t calc_q4_buffer_size(const tq_model_t* model) {
     int delta_z_dim = c->delta_n_heads * c->delta_value_head_dim;
     int delta_dn = c->delta_n_heads;

+    int full_q_dim = (c->full_n_heads > 0 && c->full_head_dim > 0)
+        ? c->full_n_heads * c->full_head_dim : q_dim;
+
     for (int l = 0; l < c->n_layers; l++) {
         const tq_layer_weights_t* layer = &model->layers[l];
-        int lkv = (model->layer_is_sliding && !model->layer_is_sliding[l]) ? full_kv_dim : kv_dim;
+        int is_full = (model->layer_is_sliding && !model->layer_is_sliding[l]);
+        int lkv = is_full ? full_kv_dim : kv_dim;
+        int lq = is_full ? full_q_dim : q_dim;
+        int lqg = qg_dim;  /* with gate: lq*2, without: lq */
+        if (is_full) lqg = c->attn_output_gate ? lq * 2 : lq;

         /* Self-attention weights */
         if (layer->wq) {
             int nb = (dim + 31) / 32;
-            total += (size_t)qg_dim * nb * 16;  /* packed Q4 data */
-            total += (size_t)qg_dim * nb * 4;   /* float scales */
+            total += (size_t)lqg * nb * 16;  /* packed Q4 data */
+            total += (size_t)lqg * nb * 4;   /* float scales */
         }
         if (layer->wk) {
             int nb = (dim + 31) / 32;
@@ -1947,7 +1954,7 @@ static size_t calc_q4_buffer_size(const tq_model_t* model) {
             total += (size_t)lkv * nb * 4;
         }
         if (layer->wo) {
-            int nb = (q_dim + 31) / 32;
+            int nb = (lq + 31) / 32;
             total += (size_t)dim * nb * 16;
             total += (size_t)dim * nb * 4;
         }
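
For reference, the Q4 layout this sizing assumes: each weight row is split into
32-value blocks, and every block packs into 16 bytes of 4-bit values plus one
4-byte float scale. A minimal sketch of the per-matrix byte count (the helper
name q4_matrix_bytes is illustrative, not part of this patch):

    /* Bytes to store a rows x cols FP32 matrix in this Q4 format:
     * cols rounds up to a multiple of 32; each 32-value block becomes
     * 16 bytes of packed nibbles plus a 4-byte float scale. */
    static size_t q4_matrix_bytes(size_t rows, size_t cols) {
        size_t nb = (cols + 31) / 32;   /* blocks per row */
        return rows * nb * 16           /* packed Q4 data */
             + rows * nb * 4;           /* float scales */
    }

A 2048x2048 projection thus needs 2048 * 64 * 20 = 2.5 MiB instead of 16 MiB
of FP32, a 6.4x reduction.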
@@ -2026,12 +2033,18 @@ void tq_quantize_weights_q4(tq_model_t* model) {
     }
     size_t used = 0;

+    int full_q_dim = (c->full_n_heads > 0 && c->full_head_dim > 0)
+        ? c->full_n_heads * c->full_head_dim : q_dim;
+
     for (int l = 0; l < c->n_layers; l++) {
         tq_layer_weights_t* layer = &model->layers[l];
-        int lkv = (model->layer_is_sliding && !model->layer_is_sliding[l]) ? full_kv_dim : kv_dim;
+        int is_full = (model->layer_is_sliding && !model->layer_is_sliding[l]);
+        int lkv = is_full ? full_kv_dim : kv_dim;
+        int lq = is_full ? full_q_dim : q_dim;
+        int lqg = c->attn_output_gate ? lq * 2 : lq;

         /* Self-attention */
-        quantize_matrix_q4(layer->wq, qg_dim, dim,
+        quantize_matrix_q4(layer->wq, lqg, dim,
                            &layer->wq_q4, &layer->wq_q4s, &buf, &used);
         if (layer->wq_q4) layer->wq = NULL;

@@ -2043,7 +2056,7 @@ void tq_quantize_weights_q4(tq_model_t* model) {
                            &layer->wv_q4, &layer->wv_q4s, &buf, &used);
         if (layer->wv_q4) layer->wv = NULL;

-        quantize_matrix_q4(layer->wo, dim, q_dim,
+        quantize_matrix_q4(layer->wo, dim, lq,
                            &layer->wo_q4, &layer->wo_q4s, &buf, &used);
         if (layer->wo_q4) layer->wo = NULL;

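
To see what lq and lqg mean in practice, here is a worked dimension check with
assumed (illustrative) hybrid-Gemma numbers; only the formulas come from the
code above:

    int n_heads = 16, head_dim = 128, full_head_dim = 256, attn_output_gate = 1;
    int q_dim      = n_heads * head_dim;       /* 2048: Q rows, sliding layers */
    int full_q_dim = n_heads * full_head_dim;  /* 4096: Q rows, full layers */
    int lqg_slide  = attn_output_gate ? 2 * q_dim : q_dim;            /* 4096 */
    int lqg_full   = attn_output_gate ? 2 * full_q_dim : full_q_dim;  /* 8192 */

With the output gate enabled, wq stacks query and gate projections, so its row
count doubles to lqg, while wo stays [dim x lq]: the gate half is consumed
inside the attention block and never reaches the output projection.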
@@ -2841,6 +2854,7 @@ tq_model_t* tq_load_gguf(const char* path) {
     c->rope_local_base_freq = tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base_swa"),
         tq_gguf_get_f32(gguf, GGUF_KEY("rope.local.freq_base"),
             tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 10000.0f)));
+    c->final_logit_softcap = tq_gguf_get_f32(gguf, GGUF_KEY("final_logit_softcapping"), 0.0f);

     /* Cap context for memory safety on small machines.
      * GGUF models often claim 262K context but we cap at 4096 by default.
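
This key only loads the value; for context, a non-zero softcap is
conventionally applied to the output logits as cap * tanh(logit / cap). A
sketch of how the forward pass might consume it (logits and vocab_size are
assumed names, not from this patch):

    if (c->final_logit_softcap > 0.0f) {
        float cap = c->final_logit_softcap;
        for (int i = 0; i < vocab_size; i++)
            logits[i] = cap * tanhf(logits[i] / cap);   /* needs <math.h> */
    }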
@@ -3039,6 +3053,26 @@ tq_model_t* tq_load_gguf(const char* path) {
             for (int i = 0; i < c->hidden_dim; i++) layer->pre_ffn_norm[i] += 1.0f;
         }

+        /* Gemma 4 dual-FFN extra norms */
+        snprintf(tname, sizeof(tname), "blk.%d.post_ffw_norm_1.weight", l);
+        t = find_gguf_tensor(gguf, tname);
+        if (t) {
+            layer->post_ffn_norm_1 = dequant_tensor_fp32(t);
+            for (int i = 0; i < c->hidden_dim; i++) layer->post_ffn_norm_1[i] += 1.0f;
+        }
+        snprintf(tname, sizeof(tname), "blk.%d.pre_ffw_norm_2.weight", l);
+        t = find_gguf_tensor(gguf, tname);
+        if (t) {
+            layer->pre_ffn_norm_2 = dequant_tensor_fp32(t);
+            for (int i = 0; i < c->hidden_dim; i++) layer->pre_ffn_norm_2[i] += 1.0f;
+        }
+        snprintf(tname, sizeof(tname), "blk.%d.post_ffw_norm_2.weight", l);
+        t = find_gguf_tensor(gguf, tname);
+        if (t) {
+            layer->post_ffn_norm_2 = dequant_tensor_fp32(t);
+            for (int i = 0; i < c->hidden_dim; i++) layer->post_ffn_norm_2[i] += 1.0f;
+        }
+
         /* Gemma 4: layer_output_scale (scalar per layer) */
         snprintf(tname, sizeof(tname), "blk.%d.layer_output_scale.weight", l);
         t = find_gguf_tensor(gguf, tname);
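
The repeated "+= 1.0f" exists because Gemma-family checkpoints store RMSNorm
gains as (g - 1); adding 1 at load time lets the hot loop use a plain multiply.
A minimal sketch of the norm that consumes these adjusted weights (helper name
and epsilon are assumptions, not from this patch):

    #include <math.h>

    static void rmsnorm(float* out, const float* x, const float* w, int n) {
        float ss = 0.0f;
        for (int i = 0; i < n; i++) ss += x[i] * x[i];
        float inv = 1.0f / sqrtf(ss / n + 1e-6f);  /* epsilon assumed */
        for (int i = 0; i < n; i++) out[i] = x[i] * inv * w[i];
    }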
@@ -3215,6 +3249,13 @@ tq_model_t* tq_load_gguf(const char* path) {
         /* Router weights (small, always dequant to FP32) */
         moe->router_weight = dequant_tensor_fp32(t);

+        /* Router input scale (Gemma 4): per-feature scaling before routing */
+        snprintf(tname, sizeof(tname), "blk.%d.ffn_gate_inp.scale", l);
+        t = find_gguf_tensor(gguf, tname);
+        if (t && t->type == TQ_GGML_TYPE_F32) {
+            moe->router_input_scale = (const float*)t->data;
+        }
+
         /* Expert weights: shape [num_experts, expert_dim, hidden_dim]
          * For GGUF, these are stored as 3D tensors. Each expert's
          * weights are a contiguous slice within the tensor. */
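
Given that layout, expert e's [expert_dim x hidden_dim] weight matrix starts at
a fixed offset inside the tensor. A sketch of the index math (assumes the
tensor was dequantized to one contiguous row-major FP32 buffer):

    static const float* expert_slice(const float* base, int e,
                                     int expert_dim, int hidden_dim) {
        /* Experts are stacked along the first axis, so each slice begins
         * expert_dim * hidden_dim floats past the previous one. */
        return base + (size_t)e * expert_dim * hidden_dim;
    }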
@@ -3394,8 +3435,9 @@ tq_model_t* tq_load_gguf(const char* path) {
             c->full_n_kv_heads = c->n_kv_heads;
         }
     }
-    /* Q dim is n_heads * head_dim (NOT hidden_dim). It's constant across layers. */
-    c->full_n_heads = (c->n_heads * c->head_dim) / c->full_head_dim;
+    /* n_heads is constant across layers (16 for Gemma 4).
+     * Full layers: same n_heads but larger head_dim → Q dim doubles. */
+    c->full_n_heads = c->n_heads;
     fprintf(stderr, "tq_load_gguf: Gemma hybrid — %d sliding (hd=%d, kv=%d) + "
             "%d full (hd=%d, kv=%d, heads=%d) attention layers\n",
             n_sliding, c->head_dim, c->n_kv_heads,
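
The old formula silently shrank the head count on full-attention layers. With
assumed numbers (n_heads = 16, head_dim = 128, full_head_dim = 256):

    /* old: full_n_heads = (16 * 128) / 256 = 8
     *      -> full-layer Q dim = 8 * 256 = 2048, same as sliding (wrong)
     * new: full_n_heads = 16
     *      -> full-layer Q dim = 16 * 256 = 4096, doubled (intended) */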
@@ -3479,15 +3521,18 @@ tq_model_t* tq_load_gguf(const char* path) {
     size_t est_fp32 = 0;
     for (int l = 0; l < c->n_layers; l++) {
         const tq_layer_weights_t* layer = &model->layers[l];
-        int lkv = (model->layer_is_sliding && !model->layer_is_sliding[l]) ? full_kv_dim : kv_dim;
+        int is_full_l = (model->layer_is_sliding && !model->layer_is_sliding[l]);
+        int lkv = is_full_l ? full_kv_dim : kv_dim;
+        int lq = is_full_l ? (c->full_n_heads * c->full_head_dim) : q_dim;
+        int lqg = c->attn_output_gate ? lq * 2 : lq;
         if (layer->gguf_wq)
-            est_fp32 += (size_t)qg_dim * dim * sizeof(float);
+            est_fp32 += (size_t)lqg * dim * sizeof(float);
         if (layer->gguf_wk)
             est_fp32 += (size_t)lkv * dim * sizeof(float);
         if (layer->gguf_wv)
             est_fp32 += (size_t)lkv * dim * sizeof(float);
         if (layer->gguf_wo)
-            est_fp32 += (size_t)dim * q_dim * sizeof(float);
+            est_fp32 += (size_t)dim * lq * sizeof(float);
         /* Dense FFN weights (not present in MoE layers) */
         if (layer->gguf_w_gate)
             est_fp32 += (size_t)inter * dim * sizeof(float);
@@ -3509,6 +3554,11 @@ tq_model_t* tq_load_gguf(const char* path) {
     }

     const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL;  /* 16 GB */
+    /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
+    if (getenv("TQ_NO_Q4")) {
+        fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
+        goto skip_q4_conversion;
+    }
     int has_gguf_weights = 0;
     for (int l = 0; l < c->n_layers && !has_gguf_weights; l++) {
         if (model->layers[l].gguf_wq || model->layers[l].gguf_w_gate
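
Usage note for the new escape hatch (binary name illustrative):

    /* TQ_NO_Q4=1 ./tq model.gguf
     * Jumps straight to skip_q4_conversion below, keeping the GGUF tensors
     * in their original quantization and dequantizing rows on the fly:
     * slower matmuls, but no second lossy Q4 round-trip. */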
@@ -3532,8 +3582,11 @@ tq_model_t* tq_load_gguf(const char* path) {
         tq_layer_weights_t* layer = &model->layers[l];

         /* Self-attention weights: dequant GGUF -> FP32 */
+        int is_full = (model->layer_is_sliding && !model->layer_is_sliding[l]);
+        int lq = is_full ? (c->full_n_heads * c->full_head_dim) : q_dim;
+        int lqg = c->attn_output_gate ? lq * 2 : lq;
         if (layer->gguf_wq) {
-            int n = qg_dim * dim;
+            int n = lqg * dim;
             float* fp = (float*)malloc((size_t)n * sizeof(float));
             if (fp) {
                 tq_dequant_row_gguf(layer->gguf_wq_type, layer->gguf_wq, fp, n);
@@ -3565,7 +3618,7 @@ tq_model_t* tq_load_gguf(const char* path) {
             }
         }
         if (layer->gguf_wo) {
-            int n = dim * q_dim;
+            int n = dim * lq;
             float* fp = (float*)malloc((size_t)n * sizeof(float));
             if (fp) {
                 tq_dequant_row_gguf(layer->gguf_wo_type, layer->gguf_wo, fp, n);
@@ -3673,6 +3726,7 @@ tq_model_t* tq_load_gguf(const char* path) {
         fprintf(stderr, "tq_load_gguf: Q4 conversion complete — fast matmul path active\n");
     }

+skip_q4_conversion: ;
     /* ============================================================
      * MoE shared expert Q4 conversion + LRU cache init
      *