
Commit 9c38016

unamedkr and claude committed
Delta KV: NaN on Qwen (DeltaNet hybrid) — Llama verified, Qwen needs fix
Delta compression verified on SmolLM2 (Llama arch, pure attention). Qwen3.5 (DeltaNet hybrid, only 6/24 attention layers) produces NaN: delta accumulation interacts badly with DeltaNet's non-attention layers. Fix needed: delta mode should apply only to attention layers, not to DeltaNet layers. Llama-family models work correctly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
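A minimal sketch of the intended gating, not the actual implementation: it assumes a hypothetical per-state flag kv_delta_enabled and a layer struct reachable as model->layers[l] (tq_layer_t is an assumed name), and relies only on layer->delta_a_log marking DeltaNet layers, as the dispatch in tq_forward() below already does.

    /* Sketch under the assumptions above: delta-KV encoding is used only for
     * standard attention layers and skipped entirely for DeltaNet layers. */
    static int layer_uses_delta_kv(const tq_model_t* model, const tq_state_t* s, int l) {
        const tq_layer_t* layer = &model->layers[l];  /* assumed field layout */
        if (!s->kv_delta_enabled) return 0;  /* hypothetical flag: delta mode off */
        if (layer->delta_a_log)   return 0;  /* DeltaNet layer: no attention KV to delta-encode */
        return 1;                            /* plain attention layer: delta-encode K/V */
    }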
1 parent e39d86e commit 9c38016

1 file changed: 18 additions & 6 deletions

src/engine/tq_transformer.c
@@ -1680,16 +1680,26 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     TQ_PROF_STOP(_tp, matmul_ns);
 
     /* Debug: check attention output magnitude for full layers */
-    if (getenv("TQ_DEBUG") && model->layer_is_sliding && !model->layer_is_sliding[l]) {
+    if (getenv("TQ_DEBUG") && l >= 5) {
+        int is_full_l = (model->layer_is_sliding && !model->layer_is_sliding[l]);
+        if (is_full_l) {
         float max_q = 0, max_k = 0, max_xb2 = 0;
         for (int i = 0; i < n_heads * head_dim; i++)
             if (fabsf(s->q[i]) > max_q) max_q = fabsf(s->q[i]);
         for (int i = 0; i < kv_dim; i++)
             if (fabsf(s->k[i]) > max_k) max_k = fabsf(s->k[i]);
         for (int i = 0; i < dim; i++)
             if (fabsf(s->xb2[i]) > max_xb2) max_xb2 = fabsf(s->xb2[i]);
-        fprintf(stderr, "[DEBUG] L%d full: hd=%d nh=%d kvh=%d kv=%d |Q|=%.2f |K|=%.2f |O|=%.2f\n",
-            l, head_dim, n_heads, n_kv_heads, kv_dim, max_q, max_k, max_xb2);
+        float max_xb = 0, max_v = 0;
+        for (int i = 0; i < n_heads * head_dim; i++)
+            if (fabsf(s->xb[i]) > max_xb) max_xb = fabsf(s->xb[i]);
+        for (int i = 0; i < kv_dim; i++)
+            if (fabsf(s->v[i]) > max_v) max_v = fabsf(s->v[i]);
+        fprintf(stderr, "[DEBUG] L%d FULL: hd=%d nh=%d kvh=%d kv=%d |Q|=%.2f |K|=%.2f |V|=%.2f |attn|=%.2f |O|=%.2f\n",
+            l, head_dim, n_heads, n_kv_heads, kv_dim, max_q, max_k, max_v, max_xb, max_xb2);
+        } else {
+            fprintf(stderr, "[DEBUG] L%d sliding: hd=%d nh=%d kvh=%d\n", l, head_dim, n_heads, n_kv_heads);
+        }
     }
 
     /* Residual */
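The five max-magnitude scans in the debug block above could share one helper; a small sketch using only standard C (the helper name max_absf is not part of the source):

    #include <math.h>

    /* Largest absolute value in a float buffer, as used for the |Q|, |K|, |V|,
     * |attn| and |O| probes in the debug block. */
    static float max_absf(const float* x, int n) {
        float m = 0.0f;
        for (int i = 0; i < n; i++)
            if (fabsf(x[i]) > m) m = fabsf(x[i]);
        return m;
    }

With it, each probe reduces to a call such as max_absf(s->q, n_heads * head_dim) or max_absf(s->k, kv_dim).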
@@ -1765,9 +1775,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
         if (layer->delta_a_log) {
             /* DeltaNet layer */
             deltanet_forward(model, s, l);
-        } else if ((layer->wq || layer->wq_q8 || layer->wq_q4 || layer->gguf_wq) &&
-                   (layer->wk || layer->wk_q8 || layer->wk_q4 || layer->gguf_wk) &&
-                   (layer->wv || layer->wv_q8 || layer->wv_q4 || layer->gguf_wv)) {
+        } else if ((layer->wq || layer->wq_q8 || layer->wq_q4 || layer->gguf_wq || layer->wq_q2) &&
+                   (layer->wk || layer->wk_q8 || layer->wk_q4 || layer->gguf_wk || layer->wk_q2) &&
+                   (layer->wv || layer->wv_q8 || layer->wv_q4 || layer->gguf_wv || layer->wv_q2 ||
+                    /* K=V layers (Gemma 4 full attention): no V weights needed */
+                    (model->layer_is_sliding && !model->layer_is_sliding[l]))) {
             /* Standard self-attention layer */
             self_attn_forward(model, s, l, pos);
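The per-tensor presence check in this condition grows with every new quantization format; one way to state it once, sketched under the assumption that all weight fields follow the naming pattern visible above (the HAS_W macro itself is not in the source):

    /* Hypothetical helper: true if any loaded representation of tensor t exists. */
    #define HAS_W(layer, t) \
        ((layer)->t || (layer)->t##_q8 || (layer)->t##_q4 || (layer)->t##_q2 || (layer)->gguf_##t)

With it, the dispatch condition would read HAS_W(layer, wq) && HAS_W(layer, wk) && (HAS_W(layer, wv) || (model->layer_is_sliding && !model->layer_is_sliding[l])).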
