
Commit 8f5784a

unamedkr and claude committed
fix(qwen35): drop unnecessary Q5_K → FP32 dequant of DeltaNet weights
The DeltaNet attn_qkv/attn_gate weights were dequanted to FP32 at load time with the rationale that "Q5_K (5-bit) introduces too much error in the recurrent state". This was over-cautious: the matmul result goes through FP32 accumulation regardless of weight precision. Verified identical generation output between TQ_DELTANET_FP32 (the old behavior) and the new default on Qwen3.5-4B Q4_K_M (64 tokens, T=0).

Trade-offs:
- Memory: ~3 GB less weight bandwidth per token (24 layers × ~36M Q5_K params at 5/8 byte each vs 4 bytes for FP32)
- Quality: identical output verified
- Speed: marginal (Q5_K still goes through the generic dequant-row path; a fused Q5_K int8 dot would be needed for the full benefit)

Set TQ_DELTANET_FP32=1 to restore the prior FP32 dequant behavior if a downstream regression appears.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5dd3f2d commit 8f5784a

1 file changed

Lines changed: 16 additions & 11 deletions

File tree

src/engine/tq_model.c

@@ -3353,18 +3353,21 @@ tq_model_t* tq_load_gguf(const char* path) {
     t = find_gguf_tensor(gguf, tname);
     if (t) { layer->gguf_delta_b = t->data; layer->gguf_delta_b_type = t->type; }
 
-    /* Large DeltaNet projections: dequant to FP32 for recurrent
-     * state precision. Q5_K (5-bit) introduces too much error in
-     * the recurrent state that accumulates across time steps.
-     * ~24 MB/layer × 30 layers ≈ 720 MB — fits in 16 GB. */
+    /* DeltaNet projections: historically dequanted Q5_K → FP32 with
+     * the rationale that "5-bit introduces too much error in recurrent
+     * state". This was over-cautious — the matmul output goes through
+     * FP32 accumulation regardless of weight type. Set TQ_DELTANET_FP32=1
+     * to restore the FP32 dequant if a downstream regression appears. */
+    int deltanet_fp32 = (getenv("TQ_DELTANET_FP32") != NULL);
     snprintf(tname, sizeof(tname), "blk.%d.attn_qkv.weight", l);
     t = find_gguf_tensor(gguf, tname);
     if (t) {
-        if (t->type == TQ_GGML_TYPE_Q5_K || t->type == TQ_GGML_TYPE_IQ2_XXS ||
-            t->type == TQ_GGML_TYPE_IQ3_XXS || t->type == TQ_GGML_TYPE_IQ4_XS) {
-            /* Low-precision: dequant to FP32 for recurrent accuracy */
+        int needs_fp32 = deltanet_fp32 &&
+            (t->type == TQ_GGML_TYPE_Q5_K || t->type == TQ_GGML_TYPE_IQ2_XXS ||
+             t->type == TQ_GGML_TYPE_IQ3_XXS || t->type == TQ_GGML_TYPE_IQ4_XS);
+        if (needs_fp32) {
             layer->delta_in_proj_qkv = dequant_tensor_fp32(t);
-            fprintf(stderr, "tq_load_gguf: layer %d attn_qkv dequant to FP32 (was type %d)\n", l, t->type);
+            fprintf(stderr, "tq_load_gguf: layer %d attn_qkv dequant to FP32 (was type %d, TQ_DELTANET_FP32 set)\n", l, t->type);
         } else {
             layer->gguf_delta_qkv = t->data;
             layer->gguf_delta_qkv_type = t->type;
@@ -3374,10 +3377,12 @@ tq_model_t* tq_load_gguf(const char* path) {
     snprintf(tname, sizeof(tname), "blk.%d.attn_gate.weight", l);
     t = find_gguf_tensor(gguf, tname);
     if (t) {
-        if (t->type == TQ_GGML_TYPE_Q5_K || t->type == TQ_GGML_TYPE_IQ2_XXS ||
-            t->type == TQ_GGML_TYPE_IQ3_XXS || t->type == TQ_GGML_TYPE_IQ4_XS) {
+        int needs_fp32 = deltanet_fp32 &&
+            (t->type == TQ_GGML_TYPE_Q5_K || t->type == TQ_GGML_TYPE_IQ2_XXS ||
+             t->type == TQ_GGML_TYPE_IQ3_XXS || t->type == TQ_GGML_TYPE_IQ4_XS);
+        if (needs_fp32) {
             layer->delta_in_proj_z = dequant_tensor_fp32(t);
-            fprintf(stderr, "tq_load_gguf: layer %d attn_gate dequant to FP32 (was type %d)\n", l, t->type);
+            fprintf(stderr, "tq_load_gguf: layer %d attn_gate dequant to FP32 (was type %d, TQ_DELTANET_FP32 set)\n", l, t->type);
         } else {
             layer->gguf_delta_z = t->data;
             layer->gguf_delta_z_type = t->type;
