
Commit 7d10b63

unamedkr authored and claude committed
fix: support head_dim > TQ_BK (Qwen3.5 head_dim=256)
Qwen3.5 uses head_dim=256 while TQ_BK=128. Two fixes:

1. KV quantize: loop over TQ_BK-sized blocks per head instead of truncating to 128 elements. The remaining 128 dims were being zeroed, causing catastrophic PPL (1.6M → garbage output).

2. Fused attention: guard with head_dim <= TQ_BK. For head_dim=256, fall back to the dequant → FP32 attention path. This is slower but correct; fused multi-block attention is a future optimization.

Before: Qwen3.5 + turbo_kv_4b → PPL 1,663,480 (garbage)
After:  Qwen3.5 + turbo_kv_4b → coherent text at 12.2 tok/s

35/35 tests pass. Llama models (head_dim=128) unaffected.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
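For context, a minimal standalone sketch of the failure mode and the blocked fix. Everything below is illustrative: quantize_block is a hypothetical stand-in for traits->quantize, and the toy 1-byte-per-value format stands in for the real quantized block format. Only TQ_BK=128, head_dim=256, and the loop structure come from the commit itself.

#include <stdint.h>
#include <stdio.h>

#define TQ_BK       128     /* quantization block size (from the commit) */
#define HEAD_DIM    256     /* Qwen3.5 head dimension (from the commit) */
#define BLOCK_BYTES TQ_BK   /* toy format: 1 byte per value per block */

/* Hypothetical stand-in for traits->quantize: consumes at most one
 * TQ_BK-sized block per call, which is why a single call with n=256
 * quantized only dims 0..127 and left dims 128..255 as zeros. */
static void quantize_block(const float* src, uint8_t* dst, int n) {
    if (n > TQ_BK) n = TQ_BK;                 /* the old silent truncation */
    for (int i = 0; i < n; i++)
        dst[i] = (uint8_t)(src[i] * 255.0f);  /* toy 8-bit quantization */
}

int main(void) {
    float key[HEAD_DIM];
    uint8_t cache[((HEAD_DIM + TQ_BK - 1) / TQ_BK) * BLOCK_BYTES] = {0};
    for (int i = 0; i < HEAD_DIM; i++) key[i] = (float)i / HEAD_DIM;

    /* Buggy path: one call for the whole head; upper half never written. */
    quantize_block(key, cache, HEAD_DIM);
    printf("dim 200 after single call:  %d (stuck at zero)\n", (int)cache[200]);

    /* Fixed path, mirroring the commit: one call per TQ_BK block, with
     * the destination advanced by one block's storage per iteration. */
    for (int blk = 0; blk < HEAD_DIM; blk += TQ_BK) {
        int blen = HEAD_DIM - blk;
        if (blen > TQ_BK) blen = TQ_BK;
        quantize_block(key + blk, cache + (blk / TQ_BK) * BLOCK_BYTES, blen);
    }
    printf("dim 200 after blocked loop: %d\n", (int)cache[200]);
    return 0;
}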
1 parent bc44a6a commit 7d10b63

1 file changed (12 additions, 4 deletions): src/engine/tq_transformer.c
@@ -1440,8 +1440,16 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
                 traits->quantize(delta_buf, quant_dst, head_dim);
             }
         } else {
-            /* Non-delta mode: quantize absolute key */
-            traits->quantize(key_src, quant_dst, head_dim);
+            /* Non-delta mode: quantize absolute key.
+             * For head_dim > TQ_BK (e.g. Qwen3.5 head_dim=256),
+             * process multiple TQ_BK-sized blocks per head. */
+            for (int blk = 0; blk < head_dim; blk += TQ_BK) {
+                int blen = head_dim - blk;
+                if (blen > TQ_BK) blen = TQ_BK;
+                traits->quantize(key_src + blk,
+                                 quant_dst + (blk / TQ_BK) * traits->type_size,
+                                 blen);
+            }
         }
     }
 }
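Note the destination arithmetic in the new loop: block i of a head lands at quant_dst + i * traits->type_size, which implies (inferring from the diff alone, not the full source) that traits->type_size is the quantized storage size of one TQ_BK block, so a 256-dim head now occupies two consecutive blocks in the key cache. The blen clamp also keeps the loop correct for head dims that are not exact multiples of TQ_BK.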
@@ -1508,7 +1516,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
         float* atth = s->att + (size_t)h * c->max_seq_len;
         int kv_h = h / kv_mul;
 
-        if (use_int_attn && seq_len > int_attn_threshold) {
+        if (use_int_attn && seq_len > int_attn_threshold && head_dim <= TQ_BK) {
             /* Integer Q4xQ8 attention path.
              * Gather quantized key blocks for this KV head across all positions
              * into a contiguous buffer, then call the traits attention function.
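With this guard, a head_dim=256 model never enters the integer Q4xQ8 scoring path: the condition fails for every position, and execution falls through to the existing dequantize → FP32 attention branch, the "slower but correct" fallback described in the commit message.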
@@ -1659,7 +1667,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
          * contiguous block, which is cache-efficient.
          */
         if (!needs_post_norm && !k_hr_active && traits->attention != NULL
-            && attn_start == 0) {
+            && attn_start == 0 && head_dim <= TQ_BK) {
             size_t head_block_bytes = s->quant_head_stride;
             size_t pos_stride_bytes = (size_t)cache_n_kv_heads * head_block_bytes;
             uint8_t* layer_base = (uint8_t*)s->quant_key_cache
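The same guard is applied to the fused fast path here, since its gather stride math (quant_head_stride, cache_n_kv_heads * head_block_bytes) appears to assume one quantized block per head per position. Teaching this path to gather multiple blocks per head is the "fused multi-block attention" future optimization the commit message mentions.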
