
Commit 199f066

unamedkr and claude committed
Fix delta KV NaN on Qwen: auto-disable for DeltaNet hybrid models
Delta KV compression requires a pure self-attention architecture. DeltaNet hybrid models (Qwen3.5) have non-contiguous attention layers that cause NaN in delta accumulation.

Fix: auto-detect DeltaNet (delta_n_heads > 0) and disable delta with a warning. Llama-family models (SmolLM2) continue to work correctly.

Qwen + delta: auto-disabled → PPL 153.6 (runs without delta, no NaN)
SmolLM2 + 3-bit delta: PPL 9.67 (+1.7%), confirmed working

33/33 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9c38016 commit 199f066
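
For readers skimming the diff: the fix itself is a small guard added in both tq_generate.c and tq_run.c. Below is a condensed sketch of that guard, using the tq_generate.c field names from the hunks that follow (surrounding setup code omitted):

    /* Delta KV only works for pure self-attention models; when the model
     * reports DeltaNet heads, clear the flag before generation starts. */
    state->delta_kv_enabled = config->delta_kv;
    if (state->delta_kv_enabled && model->config.delta_n_heads > 0) {
        fprintf(stderr, "Warning: delta KV disabled for hybrid DeltaNet model\n");
        state->delta_kv_enabled = 0;  /* fall back to the standard KV path */
    }

The tq_run.c hunk applies the same check with its local names (c->delta_n_heads) and does so before the "Delta KV compression: ENABLED" message, so that message is only printed when delta is actually in effect.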

3 files changed

Lines changed: 20 additions & 26 deletions


src/engine/tq_generate.c

Lines changed: 6 additions & 0 deletions
@@ -166,6 +166,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         return -1;
     }
     state->delta_kv_enabled = config->delta_kv;
+    /* Delta KV requires pure self-attention models. Hybrid models (DeltaNet)
+     * have non-contiguous attention layers that cause NaN in delta accumulation. */
+    if (state->delta_kv_enabled && model->config.delta_n_heads > 0) {
+        fprintf(stderr, "Warning: delta KV disabled for hybrid DeltaNet model\n");
+        state->delta_kv_enabled = 0;
+    }
 
     /* Allocate MoE state if model uses MoE */
     if (model->config.is_moe && model->moe_config) {

src/engine/tq_transformer.c

Lines changed: 8 additions & 25 deletions
@@ -1113,8 +1113,14 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
      * For hybrid attention full layers with different head_dim, skip quant cache
      * (quant_head_stride doesn't match). Fall back to FP32 cache for those layers. */
     int cache_n_kv_heads = c->n_kv_heads;
-    if (use_int_attn && head_dim != c->head_dim) {
-        /* Full layer: head_dim mismatch with quant cache allocation → use FP32 key cache */
+    if (head_dim != c->head_dim) {
+        /* Full layer: head_dim mismatch with quant cache allocation.
+         * Disable both quantized and integer attention → use FP32 path. */
+        use_quant_kv = 0;
+        use_int_attn = 0;
+        /* Ensure K is stored in FP32 cache (may have been skipped above) */
+        memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
+    } else if (use_int_attn && head_dim != c->head_dim) {
         use_int_attn = 0;
         memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
     }
@@ -1679,29 +1685,6 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     tq_matmul(s->xb2, s->xb, layer->wo, dim, n_heads * head_dim);
     TQ_PROF_STOP(_tp, matmul_ns);
 
-    /* Debug: check attention output magnitude for full layers */
-    if (getenv("TQ_DEBUG") && l >= 5) {
-        int is_full_l = (model->layer_is_sliding && !model->layer_is_sliding[l]);
-        if (is_full_l) {
-            float max_q = 0, max_k = 0, max_xb2 = 0;
-            for (int i = 0; i < n_heads * head_dim; i++)
-                if (fabsf(s->q[i]) > max_q) max_q = fabsf(s->q[i]);
-            for (int i = 0; i < kv_dim; i++)
-                if (fabsf(s->k[i]) > max_k) max_k = fabsf(s->k[i]);
-            for (int i = 0; i < dim; i++)
-                if (fabsf(s->xb2[i]) > max_xb2) max_xb2 = fabsf(s->xb2[i]);
-            float max_xb = 0, max_v = 0;
-            for (int i = 0; i < n_heads * head_dim; i++)
-                if (fabsf(s->xb[i]) > max_xb) max_xb = fabsf(s->xb[i]);
-            for (int i = 0; i < kv_dim; i++)
-                if (fabsf(s->v[i]) > max_v) max_v = fabsf(s->v[i]);
-            fprintf(stderr, "[DEBUG] L%d FULL: hd=%d nh=%d kvh=%d kv=%d |Q|=%.2f |K|=%.2f |V|=%.2f |attn|=%.2f |O|=%.2f\n",
-                    l, head_dim, n_heads, n_kv_heads, kv_dim, max_q, max_k, max_v, max_xb, max_xb2);
-        } else {
-            fprintf(stderr, "[DEBUG] L%d sliding: hd=%d nh=%d kvh=%d\n", l, head_dim, n_heads, n_kv_heads);
-        }
-    }
-
     /* Residual */
     tq_add(s->x, s->x, s->xb2, dim);
 }
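
The tq_transformer.c change above is independent of the DeltaNet guard: for hybrid-attention full layers whose head_dim differs from the cache's head_dim, the quantized cache layout (quant_head_stride) does not match, so the updated code disables both the quantized KV cache and integer attention and stores K in the FP32 cache for that layer; the second hunk simply removes the TQ_DEBUG attention-magnitude logging. A condensed sketch of the new branch, reusing the identifiers from the hunk above:

    /* Layers whose head_dim does not match the cache allocation cannot use
     * the quantized or integer attention paths; keep them on FP32. */
    if (head_dim != c->head_dim) {
        use_quant_kv = 0;
        use_int_attn = 0;
        /* K may not have been written earlier; store it in the FP32 cache */
        memcpy(key_cache_layer + (size_t)pos * cache_kv_dim,
               s->k, kv_dim * sizeof(float));
    }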

tools/tq_run.c

Lines changed: 6 additions & 1 deletion
@@ -351,7 +351,12 @@ int main(int argc, char** argv) {
         return 1;
     }
     state->delta_kv_enabled = delta_kv;
-    if (delta_kv) {
+    /* Disable delta for hybrid DeltaNet models (causes NaN) */
+    if (state->delta_kv_enabled && c->delta_n_heads > 0) {
+        fprintf(stderr, "Warning: delta KV disabled for hybrid DeltaNet model\n");
+        state->delta_kv_enabled = 0;
+    }
+    if (state->delta_kv_enabled) {
         fprintf(stderr, "Delta KV compression: ENABLED (storing key deltas)\n");
     }
