@@ -2829,18 +2829,53 @@ tq_model_t* tq_load_gguf(const char* path) {
28292829 c -> rope_freq_base = tq_gguf_get_f32 (gguf , GGUF_KEY ("rope.freq_base" ), 1000000.0f );
28302830 c -> rms_norm_eps = tq_gguf_get_f32 (gguf , GGUF_KEY ("attention.layer_norm_rms_epsilon" ), 1e-6f );
28312831
2832+ /* Sliding window + local RoPE base */
2833+ c -> sliding_window = (int )tq_gguf_get_u32 (gguf , GGUF_KEY ("attention.sliding_window" ), 0 );
2834+ c -> rope_local_base_freq = tq_gguf_get_f32 (gguf , GGUF_KEY ("rope.local.freq_base" ),
2835+ tq_gguf_get_f32 (gguf , GGUF_KEY ("rope.freq_base" ), 10000.0f ));
2836+
28322837 /* Cap context for memory safety on small machines.
28332838 * GGUF models often claim 262K context but we cap at 4096 by default.
28342839 * Users can override with --ctx flag in tq_run. */
28352840 if (c -> max_seq_len > 4096 ) c -> max_seq_len = 4096 ;
28362841
2837- /* Compute head_dim — prefer explicit key_length from metadata (Qwen3.5 has
2838- * head_dim > hidden_dim/n_heads because attention expands the dimension) */
2842+ /* Compute head_dim — prefer explicit key_length from metadata.
2843+ * For Gemma 4: key_length=512 is for full attention layers,
2844+ * but sliding layers use 256. Detect from first layer's K tensor shape. */
28392845 c -> head_dim = tq_gguf_get_i32 (gguf , GGUF_KEY ("attention.key_length" ), 0 );
28402846 if (c -> head_dim == 0 && c -> n_heads > 0 ) {
28412847 c -> head_dim = c -> hidden_dim / c -> n_heads ;
28422848 }
28432849
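2850+ /* For hybrid sliding/full attention (Gemma 4):
2851+ * Override head_dim from the first layer's K tensor shape (layer 0 is assumed to be a sliding layer),
2852+ * since sliding layers are the majority and determine KV cache layout. NOTE(review): no architecture check is visible around this block, so it appears to run for non-Gemma models too — confirm. */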
2853+ {
2854+ const tq_gguf_tensor_t * k0 = tq_gguf_find_tensor (gguf , "blk.0.attn_k.weight" );
2855+ if (k0 && k0 -> n_dims >= 2 ) {
2856+ int k_out = (int )k0 -> shape [1 ];
2857+ /* Try power-of-two head_dim candidates from 512 down to 64 and take the first that divides k_out, */
2858+ /* yields between 1 and n_heads KV heads, and is strictly smaller than the current head_dim. */
2859+ int sliding_head_dim = c -> head_dim ;
2860+ for (int hd = 512 ; hd >= 64 ; hd /= 2 ) {
2861+ if (k_out % hd == 0 ) {
2862+ int kv = k_out / hd ;
2863+ if (kv >= 1 && kv <= c -> n_heads && hd < c -> head_dim ) {
2864+ sliding_head_dim = hd ;
2865+ break ;
2866+ }
2867+ }
2868+ }
2869+ if (sliding_head_dim != c -> head_dim ) {
2870+ fprintf (stderr , "tq_load_gguf: hybrid attention detected — "
2871+ "sliding head_dim=%d (metadata: %d)\n" , sliding_head_dim , c -> head_dim );
2872+ c -> head_dim = sliding_head_dim ;
2873+ }
2874+ /* Infer kv_heads from K tensor shape */
2875+ c -> n_kv_heads = k_out / c -> head_dim ;
2876+ }
2877+ }
2878+
28442879 /* MoE configuration */
28452880 c -> num_experts = tq_gguf_get_i32 (gguf , GGUF_KEY ("expert_count" ), 0 );
28462881 c -> num_active_experts = tq_gguf_get_i32 (gguf , GGUF_KEY ("expert_used_count" ), 0 );
@@ -2873,11 +2908,15 @@ tq_model_t* tq_load_gguf(const char* path) {
28732908 c -> expert_intermediate_dim , c -> has_shared_expert );
28742909 }
28752910
2876- /* Model type detection */
2877- if (c -> is_moe ) {
2878- c -> model_type = 2 ; /* qwen2moe / qwen3.5 moe */
2911+ /* Model type detection — Gemma takes priority (Gemma 4 is both Gemma AND MoE) */
2912+ if (strstr (gguf -> arch , "gemma" ) != NULL ) {
2913+ c -> model_type = 1 ; /* gemma family */
2914+ c -> n_norms_per_block = 4 ;
2915+ fprintf (stderr , "tq_load_gguf: Gemma family detected (sliding_window=%d)\n" , c -> sliding_window );
2916+ } else if (c -> is_moe ) {
2917+ c -> model_type = 2 ; /* qwen moe */
28792918 } else {
2880- c -> model_type = 0 ; /* default qwen35 */
2919+ c -> model_type = 0 ; /* qwen35 */
28812920 }
28822921
28832922 fprintf (stderr , "tq_load_gguf: config — layers=%d, dim=%d, heads=%d/%d, head_dim=%d, vocab=%d\n" ,
@@ -3206,6 +3245,39 @@ tq_model_t* tq_load_gguf(const char* path) {
32063245 n_attn_layers , c -> n_layers );
32073246 }
32083247
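3248+ /* Set up layer_is_sliding for Gemma hybrid attention.
3249+ * Detect from Q tensor shape: layers whose Q output dim equals the minimum across all layers are marked sliding; all others (including layers with no attn_q tensor) count as full attention. */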
3250+ if (c -> sliding_window > 0 && c -> model_type == 1 ) {
3251+ model -> layer_is_sliding = (int * )calloc ((size_t )c -> n_layers , sizeof (int ));
3252+ if (model -> layer_is_sliding ) {
3253+ /* Find the smallest Q output dim (sliding) */
3254+ int min_q = 999999 ;
3255+ for (int l = 0 ; l < c -> n_layers ; l ++ ) {
3256+ char tname [128 ];
3257+ snprintf (tname , sizeof (tname ), "blk.%d.attn_q.weight" , l );
3258+ const tq_gguf_tensor_t * qt = tq_gguf_find_tensor (gguf , tname );
3259+ if (qt && (int )qt -> shape [1 ] < min_q ) min_q = (int )qt -> shape [1 ];
3260+ }
3261+ int n_sliding = 0 , n_full = 0 ;
3262+ for (int l = 0 ; l < c -> n_layers ; l ++ ) {
3263+ char tname [128 ];
3264+ snprintf (tname , sizeof (tname ), "blk.%d.attn_q.weight" , l );
3265+ const tq_gguf_tensor_t * qt = tq_gguf_find_tensor (gguf , tname );
3266+ if (qt && (int )qt -> shape [1 ] == min_q ) {
3267+ model -> layer_is_sliding [l ] = 1 ;
3268+ n_sliding ++ ;
3269+ } else {
3270+ model -> layer_is_sliding [l ] = 0 ;
3271+ n_full ++ ;
3272+ }
3273+ }
3274+ if (n_full > 0 ) {
3275+ fprintf (stderr , "tq_load_gguf: Gemma hybrid — %d sliding + %d full attention layers\n" ,
3276+ n_sliding , n_full );
3277+ }
3278+ }
3279+ }
3280+
32093281 /* Load embedding + output weights */
32103282 const tq_gguf_tensor_t * emb_t = find_gguf_tensor (gguf , "token_embd.weight" );
32113283 if (emb_t ) {
0 commit comments