feat(prefill): produce logits inside tq_forward_batch (no final tq_forward)

unamedkr · claude · unamedkr · commit bd347dbdf091 · 2026-04-16T03:27:45.000+09:00
Big architectural improvement to the batched prefill path: the output rmsnorm + lm_head matmul for the last batch position is now computed inside tq_forward_batch itself, and tq_generate no longer calls tq_forward after a successful batched prefill.

Benefits:
- One fewer full forward pass → small extra speedup for long prompts
- For DeltaNet models, avoids double-advancing the recurrent SSM state (the root cause of the empty-output bug in the DeltaNet hybrid path)

Verified on Llama-3.2-1B/3B: outputs bit-identical to the previous per-token-then-final-forward flow. 11/11 STRICT tests pass.

DeltaNet hybrid path (P1.6) still bails to per-token for Qwen3.5 due to a separate FFN-handling issue in the per-token-inside-batch DeltaNet loop that produces empty text. The batched DeltaNet path therefore remains gated behind TQ_DELTANET_BATCH=1. Investigation logged in commit c6c9fda.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
@@ -316,8 +316,9 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
             fprintf(stderr, "[batch_prefill] rc=%d expected=%d (N=%d)\n",
                     rc, prefill_start + n_prompt, n_prompt);
         if (rc == prefill_start + n_prompt) {
-            tq_forward(model, state, prompt_tokens[n_prompt - 1],
-                       prefill_start + n_prompt - 1);
+            /* tq_forward_batch now produces logits for the last position
+             * itself (so we don't double-advance DeltaNet SSM state). No
+             * final tq_forward needed. */
             batch_ok = 1;
         }
     }
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c
@@ -3070,11 +3070,9 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
     if (s->delta_kv_enabled)                   { if (dbg) fprintf(stderr, "[batch] bail: delta_kv\n"); return -1; }
     /* k_highres_window supported — circular FP32 buffer for recent keys. */
     if (s->value_quant_bits != 0)              { if (dbg) fprintf(stderr, "[batch] bail: quant_V\n"); return -1; }
-    /* DeltaNet hybrid support is in-progress (see P1.6). For safety the
-     * bail is kept — batched advances SSM state per token and the final
-     * tq_forward's re-run of the last position double-advances state,
-     * producing empty/garbage generation. Path preserved below under
-     * TQ_DELTANET_BATCH=1 for future development. */
+    /* DeltaNet: hybrid batched is WIP. Default bail to per-token; opt-in
+     * via TQ_DELTANET_BATCH=1 for development. Known issue: FFN handling
+     * for DeltaNet layers in Qwen3.5 still produces empty output. */
     if (!getenv("TQ_DELTANET_BATCH")) {
         for (int l = 0; l < c->n_layers; l++) {
             if (model->layers[l].delta_a_log) {
@@ -3153,11 +3151,11 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
          * per-token because deltanet_forward writes residual into s->x and
          * we continue from there. */
         if (layer->delta_a_log) {
-            /* DeltaNet: SSM recurrent state can't be batched. Process the
-             * first N-1 tokens here; leave the last token for the final
-             * tq_forward to avoid advancing state past what that call expects. */
+            /* DeltaNet: SSM recurrent state can't be batched. Process each
+             * token in order so state advances correctly; no final
+             * tq_forward runs after this function (logits computed below). */
             extern void deltanet_forward(tq_model_t* model, tq_state_t* s, int l);
-            for (int n = 0; n < N - 1; n++) {
+            for (int n = 0; n < N; n++) {
                 memcpy(s->x, Xres + (size_t)n * dim, (size_t)dim * sizeof(float));
                 tq_rmsnorm(s->xb, s->x, layer->attn_norm, dim, c->rms_norm_eps);
                 deltanet_forward(model, s, l);
@@ -3566,6 +3564,28 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
         }
     }
 
+    /* Compute logits for the LAST token in the batch so the caller can
+     * skip running tq_forward again. For non-DeltaNet models this is just
+     * a convenience; for DeltaNet it's required to avoid double-advancing
+     * the recurrent state. */
+    {
+        int last = N - 1;
+        float* x_last = Xres + (size_t)last * dim;
+        memcpy(s->x, x_last, (size_t)dim * sizeof(float));
+        tq_rmsnorm(s->x, s->x, model->output_norm, dim, c->rms_norm_eps);
+        if (model->output_gguf) {
+            tq_matmul_gguf(s->logits, s->x, model->output_gguf,
+                            model->output_gguf_type, c->vocab_size, dim);
+        } else if (model->output_qs) {
+            tq_matmul_q4(s->logits, s->x, model->output_qs, model->output_scales,
+                          c->vocab_size, dim);
+        } else if (model->output_weight_bf16) {
+            tq_matmul_bf16(s->logits, s->x, model->output_weight_bf16, c->vocab_size, dim);
+        } else if (model->output_weight) {
+            tq_matmul(s->logits, s->x, model->output_weight, c->vocab_size, dim);
+        }
+    }
+
     free(X); free(XBN); free(QB); free(KB); free(VB); free(OB); free(GB); free(UB);
     free(Xres);
     return pos_start + N;

Original file line number	Diff line number	Diff line change
`@@ -316,8 +316,9 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,`
`316`	`316`	`fprintf(stderr, "[batch_prefill] rc=%d expected=%d (N=%d)\n",`
`317`	`317`	`rc, prefill_start + n_prompt, n_prompt);`
`318`	`318`	`if (rc == prefill_start + n_prompt) {`
`319`		`- tq_forward(model, state, prompt_tokens[n_prompt - 1],`
`320`		`- prefill_start + n_prompt - 1);`
	`319`	`+ /* tq_forward_batch now produces logits for the last position`
	`320`	`+ * itself (so we don't double-advance DeltaNet SSM state). No`
	`321`	`+ * final tq_forward needed. */`
`321`	`322`	`batch_ok = 1;`
`322`	`323`	`}`
`323`	`324`	`}`