
Commit 199f066

unamedkr and claude committed
Fix delta KV NaN on Qwen: auto-disable for DeltaNet hybrid models
Delta KV compression requires a pure self-attention architecture. DeltaNet hybrid models (Qwen3.5) have non-contiguous attention layers that cause NaN in delta accumulation.

Fix: auto-detect DeltaNet (delta_n_heads > 0) and disable delta with a warning. Llama-family models (SmolLM2) continue to work correctly.

Qwen + delta: auto-disabled → PPL 153.6 (runs without delta, no NaN)
SmolLM2 + 3-bit delta: PPL 9.67 (+1.7%), confirmed working

33/33 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9c38016 commit 199f066
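
For readers skimming the diff: the fix itself is a small guard added in both tq_generate.c and tq_run.c. Below is a condensed sketch of that guard, using the tq_generate.c field names from the hunks that follow (surrounding setup code omitted):

    /* Delta KV only works for pure self-attention models; when the model
     * reports DeltaNet heads, clear the flag before generation starts. */
    state->delta_kv_enabled = config->delta_kv;
    if (state->delta_kv_enabled && model->config.delta_n_heads > 0) {
        fprintf(stderr, "Warning: delta KV disabled for hybrid DeltaNet model\n");
        state->delta_kv_enabled = 0;  /* fall back to the standard KV path */
    }

The tq_run.c hunk applies the same check with its local names (c->delta_n_heads) and does so before the "Delta KV compression: ENABLED" message, so that message is only printed when delta is actually in effect.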

3 files changed

Lines changed: 20 additions & 26 deletions


src/engine/tq_generate.c

Lines changed: 6 additions & 0 deletions
@@ -166,6 +166,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         return -1;
     }
     state->delta_kv_enabled = config->delta_kv;
+    /* Delta KV requires pure self-attention models. Hybrid models (DeltaNet)
+     * have non-contiguous attention layers that cause NaN in delta accumulation. */
+    if (state->delta_kv_enabled && model->config.delta_n_heads > 0) {
+        fprintf(stderr, "Warning: delta KV disabled for hybrid DeltaNet model\n");
+        state->delta_kv_enabled = 0;
+    }
 
     /* Allocate MoE state if model uses MoE */
     if (model->config.is_moe && model->moe_config) {

src/engine/tq_transformer.c

Lines changed: 8 additions & 25 deletions
@@ -1113,8 +1113,14 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
      * For hybrid attention full layers with different head_dim, skip quant cache
      * (quant_head_stride doesn't match). Fall back to FP32 cache for those layers. */
     int cache_n_kv_heads = c->n_kv_heads;
-    if (use_int_attn && head_dim != c->head_dim) {
-        /* Full layer: head_dim mismatch with quant cache allocation → use FP32 key cache */
+    if (head_dim != c->head_dim) {
+        /* Full layer: head_dim mismatch with quant cache allocation.
+         * Disable both quantized and integer attention → use FP32 path. */
+        use_quant_kv = 0;
+        use_int_attn = 0;
+        /* Ensure K is stored in FP32 cache (may have been skipped above) */
+        memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
+    } else if (use_int_attn && head_dim != c->head_dim) {
         use_int_attn = 0;
         memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
     }
@@ -1679,29 +1685,6 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     tq_matmul(s->xb2, s->xb, layer->wo, dim, n_heads * head_dim);
     TQ_PROF_STOP(_tp, matmul_ns);
 
-    /* Debug: check attention output magnitude for full layers */
-    if (getenv("TQ_DEBUG") && l >= 5) {
-        int is_full_l = (model->layer_is_sliding && !model->layer_is_sliding[l]);
-        if (is_full_l) {
-            float max_q = 0, max_k = 0, max_xb2 = 0;
-            for (int i = 0; i < n_heads * head_dim; i++)
-                if (fabsf(s->q[i]) > max_q) max_q = fabsf(s->q[i]);
-            for (int i = 0; i < kv_dim; i++)
-                if (fabsf(s->k[i]) > max_k) max_k = fabsf(s->k[i]);
-            for (int i = 0; i < dim; i++)
-                if (fabsf(s->xb2[i]) > max_xb2) max_xb2 = fabsf(s->xb2[i]);
-            float max_xb = 0, max_v = 0;
-            for (int i = 0; i < n_heads * head_dim; i++)
-                if (fabsf(s->xb[i]) > max_xb) max_xb = fabsf(s->xb[i]);
-            for (int i = 0; i < kv_dim; i++)
-                if (fabsf(s->v[i]) > max_v) max_v = fabsf(s->v[i]);
-            fprintf(stderr, "[DEBUG] L%d FULL: hd=%d nh=%d kvh=%d kv=%d |Q|=%.2f |K|=%.2f |V|=%.2f |attn|=%.2f |O|=%.2f\n",
-                    l, head_dim, n_heads, n_kv_heads, kv_dim, max_q, max_k, max_v, max_xb, max_xb2);
-        } else {
-            fprintf(stderr, "[DEBUG] L%d sliding: hd=%d nh=%d kvh=%d\n", l, head_dim, n_heads, n_kv_heads);
-        }
-    }
-
     /* Residual */
     tq_add(s->x, s->x, s->xb2, dim);
 }
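
The tq_transformer.c change above is independent of the DeltaNet guard: for hybrid-attention full layers whose head_dim differs from the cache's head_dim, the quantized cache layout (quant_head_stride) does not match, so the updated code disables both the quantized KV cache and integer attention and stores K in the FP32 cache for that layer; the second hunk simply removes the TQ_DEBUG attention-magnitude logging. A condensed sketch of the new branch, reusing the identifiers from the hunk above:

    /* Layers whose head_dim does not match the cache allocation cannot use
     * the quantized or integer attention paths; keep them on FP32. */
    if (head_dim != c->head_dim) {
        use_quant_kv = 0;
        use_int_attn = 0;
        /* K may not have been written earlier; store it in the FP32 cache */
        memcpy(key_cache_layer + (size_t)pos * cache_kv_dim,
               s->k, kv_dim * sizeof(float));
    }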

tools/tq_run.c

Lines changed: 6 additions & 1 deletion
@@ -351,7 +351,12 @@ int main(int argc, char** argv) {
         return 1;
     }
     state->delta_kv_enabled = delta_kv;
-    if (delta_kv) {
+    /* Disable delta for hybrid DeltaNet models (causes NaN) */
+    if (state->delta_kv_enabled && c->delta_n_heads > 0) {
+        fprintf(stderr, "Warning: delta KV disabled for hybrid DeltaNet model\n");
+        state->delta_kv_enabled = 0;
+    }
+    if (state->delta_kv_enabled) {
         fprintf(stderr, "Delta KV compression: ENABLED (storing key deltas)\n");
     }
