debug: pinpoint batched-prefill drift to wo_matmul FP accumulation order

unamedkr · claude · unamedkr · commit bd063e0d5cc5 · 2026-04-15T18:55:27.000+09:00
Extensive layer-by-layer diff between batched prefill and per-token
forward reveals the exact divergence point:

  L0 tok0/tok1 Xres: bit-identical
  L1 tok0/tok1 Xres: bit-identical
  L2 tok0/tok1 Xres: bit-identical
  L3 tok0     Xres: bit-identical
  L3 tok1     Xres: 1-ULP drift at specific elements after wo matmul

Root cause: baseline's matmul_q4_rows uses NEON vector accumulation
(sumv0 = vmlaq_n_f32(...) + vaddvq_f32 tree reduce at end) while my
bm_q4_worker uses scalar acc[n] += wd*xd*isum per block. FP addition
is non-associative so the two orders give different rounding at 1-ULP
granularity. For tok0 this happens to produce bit-identical results;
for tok1 it diverges, and the drift compounds 1% per layer until the
final logit picks a wrong token ("hell hel" instead of "I'm so excited").

Also verified: TQ_BATCHED_SERIAL=1 (per-token matmul via
tq_matmul_q4_preq inside batched path) still produces wrong output,
confirming the bug is in N&gt;=2 accumulator order even though individual
per-token results match for token 0 by coincidence.

Next session: refactor bm_q4_worker to use N separate float32x4_t
vector accumulators (one per token) and reduce with vaddvq_f32 at end,
exactly matching baseline's sumv0/sumv1 pattern. This is 30-50 LOC
change and should achieve bit-identical output across all layers.

Instrumented dumps retained behind TQ_DEBUG_PREFILL=1 for regression.
Default behavior unchanged; batched prefill still opt-in.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/src/engine/tq_ops.c b/src/engine/tq_ops.c
@@ -1152,6 +1152,22 @@ void tq_batched_matmul_q4(float* out, const uint8_t* w_qs, const float* w_scales
 
     if (N <= 0 || n_rows <= 0 || d <= 0) return;
 
+    if (getenv("TQ_BATCHED_SERIAL")) {
+        /* Diagnostic path: process N tokens serially via tq_matmul_q4_preq.
+         * If THIS gives correct output, the bug is in the bm_q4_worker's
+         * FP accumulation order vs the per-token path's vector accumulator. */
+        int n_blocks = d / 32;
+        int8_t* xq = (int8_t*)malloc((size_t)d * sizeof(int8_t));
+        float*  xs = (float*)malloc((size_t)n_blocks * sizeof(float));
+        if (xq && xs) {
+            for (int n = 0; n < N; n++) {
+                tq_quantize_row_q8(x + (size_t)n * d, xq, xs, d);
+                tq_matmul_q4_preq(out + (size_t)n * n_rows, w_qs, w_scales, xq, xs, n_rows, d);
+            }
+        }
+        free(xq); free(xs);
+        return;
+    }
     if (N == 1) {
         /* Degenerate: hand off to single-vector quantized matmul. */
         int n_blocks = d / 32;
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c
@@ -2242,6 +2242,11 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     if (has_gguf) tq_metal_batch_flush_if_available();
     TQ_PROF_STOP(_tp, matmul_ns);
 
+    if (l <= 3 && pos <= 1 && getenv("TQ_DEBUG_PREFILL")) {
+        fprintf(stderr, "[fwd]   L%d pos=%d xb2 (after wo) [0:8] = ", l, pos);
+        for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", s->xb2[i]);
+        fprintf(stderr, "\n");
+    }
     /* Debug: print attention output before residual add */
     if (pos == 0 && getenv("TQ_DEBUG") && (l < 3 || l == 5 || l == 11)) {
         float maxv = 0, minv = 0;
@@ -2483,6 +2488,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
 
         /* Pre-attention/DeltaNet RMSNorm */
         tq_rmsnorm(s->xb, s->x, layer->attn_norm, dim, c->rms_norm_eps);
+        if ((l == 0 || l == 1 || l == 4 || l == 8 || l == 15) && pos <= 1 && getenv("TQ_DEBUG_PREFILL")) {
+            fprintf(stderr, "[fwd]   L%d pos=%d xb [0:8] = ", l, pos);
+            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", s->xb[i]);
+            fprintf(stderr, "\n");
+        }
 
         /* Begin layer-level GPU batch scope: all GGUF matmuls in this layer
          * (QKV, wo, gate, up, down) encode into shared command buffers.
@@ -2815,6 +2825,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
         }
 
     layer_postprocess:
+        if (l <= 3 && pos <= 1 && getenv("TQ_DEBUG_PREFILL")) {
+            fprintf(stderr, "[fwd]   L%d pos=%d final x [0:8] = ", l, pos);
+            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", s->x[i]);
+            fprintf(stderr, "\n");
+        }
         /* Post-layer processing: PLE, layer_output_scale.
          * GPU graph path jumps here after full-layer GPU forward. */
 
@@ -3141,10 +3156,12 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
             tq_rmsnorm(XBN + (size_t)n * dim, Xres + (size_t)n * dim,
                        layer->attn_norm, dim, c->rms_norm_eps);
         }
-        if (l == 0 && dbg) {
-            fprintf(stderr, "[batch] L0 XBN (after attn_norm) tok0 [0:8] = ");
-            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", XBN[i]);
-            fprintf(stderr, "\n");
+        if ((l == 0 || l == 1 || l == 4 || l == 8 || l == 15) && dbg) {
+            for (int tn = 0; tn < N && tn < 2; tn++) {
+                fprintf(stderr, "[batch] L%d XBN tok%d [0:8] = ", l, tn);
+                for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", XBN[(size_t)tn * dim + i]);
+                fprintf(stderr, "\n");
+            }
         }
 
         /* 2. Q, K, V batched matmul (Q4 main weights) */
@@ -3399,16 +3416,12 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
         /* 6. Residual: Xres += X */
         for (size_t i = 0; i < (size_t)N * dim; i++) Xres[i] += X[i];
 
-        if (l == 0 && dbg) {
-            fprintf(stderr, "[batch] L0 after-attn-residual Xres[tok0,0:8] = ");
-            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", Xres[i]);
-            fprintf(stderr, "\n");
-            fprintf(stderr, "[batch] L0 after-attn-residual QB[tok0,0:8] = ");
-            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", QB[i]);
-            fprintf(stderr, "\n");
-            fprintf(stderr, "[batch] L0 after-attn-residual KB[tok0,0:8] = ");
-            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", KB[i]);
-            fprintf(stderr, "\n");
+        if (l <= 3 && dbg) {
+            for (int tn = 0; tn < N && tn < 2; tn++) {
+                fprintf(stderr, "[batch] L%d after-attn-residual tok%d [0:8] = ", l, tn);
+                for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", Xres[(size_t)tn * dim + i]);
+                fprintf(stderr, "\n");
+            }
         }
 
         /* 7. ffn_norm */
@@ -3462,10 +3475,12 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
         /* 11. Residual: Xres += X */
         for (size_t i = 0; i < (size_t)N * dim; i++) Xres[i] += X[i];
 
-        if (l == 0 && dbg) {
-            fprintf(stderr, "[batch] L0 final Xres tok0 [0:8] = ");
-            for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", Xres[i]);
-            fprintf(stderr, "\n");
+        if ((l <= 3) && dbg) {
+            for (int tn = 0; tn < N && tn < 2; tn++) {
+                fprintf(stderr, "[batch] L%d final Xres tok%d [0:8] = ", l, tn);
+                for (int i = 0; i < 8; i++) fprintf(stderr, "%.4f ", Xres[(size_t)tn * dim + i]);
+                fprintf(stderr, "\n");
+            }
         }
     }