6767static id <MTLComputePipelineState > tq_pipe_softmax = nil ;
6868static id <MTLComputePipelineState > tq_pipe_attn_qk = nil ;
6969static id <MTLComputePipelineState > tq_pipe_attn_v = nil ;
70+ static id <MTLComputePipelineState > tq_pipe_kv_cache_write = nil ;
7071
7172/* Cached pipelines — fused MoE kernels */
7273static id <MTLComputePipelineState > tq_pipe_moe_gate_up = nil ;
@@ -441,6 +442,7 @@ int tq_init_metal_backend(void) {
441442 tq_pipe_gelu_tanh = makePipe (@" gelu_tanh" );
442443 tq_pipe_softmax = makePipe (@" softmax_inplace" );
443444 tq_pipe_attn_qk = makePipe (@" attention_qk" );
445+ tq_pipe_kv_cache_write = makePipe (@" kv_cache_write" );
444446 tq_pipe_attn_v = makePipe (@" attention_v" );
445447
446448 /* Create IQ2_S codebook buffer (shared by matmul and MoE kernels) */
@@ -2060,22 +2062,31 @@ int tq_metal_forward_layer(
20602062 /* Upload x to GPU (unified memory — just memcpy to shared buffer) */
20612063 memcpy ([g_gpu_x contents ], x, (size_t )dim * sizeof (float ));
20622064
2063- /* Upload KV cache for positions [0..pos] to GPU.
2064- * On Apple Silicon, key_cache is in unified memory so this is fast.
2065- * We upload the full cache slice — GPU attention needs all positions. */
2066- size_t cache_bytes = (size_t )(pos + 1 ) * kv_dim * sizeof (float );
2067- memcpy ([g_gpu_key_cache contents ], key_cache, cache_bytes);
2068- memcpy ([g_gpu_val_cache contents ], value_cache, cache_bytes);
2065+ /* Zero-copy KV cache: wrap the CPU cache pointers as Metal buffers.
2066+ * Apple Silicon unified memory means no data copy is needed —
2067+ * the GPU reads/writes the same physical memory as the CPU.
2067+ * NOTE(review): newBufferWithBytesNoCopy: requires the pointer AND the
2067+ * length to be page-aligned; with a plain malloc'd cache this returns
2067+ * nil (caught below). Confirm key_cache/value_cache are allocated via
2067+ * posix_memalign/vm_allocate, or this path fails every call. */
2068+ size_t cache_total = (size_t )seq_len * kv_dim * sizeof (float );
2069+ if (cache_total == 0 ) cache_total = (size_t )kv_dim * sizeof (float );
2070+ id <MTLBuffer > kc_buf = [tq_mtl_device newBufferWithBytesNoCopy: key_cache
2071+ length: cache_total
2072+ options: MTLResourceStorageModeShared
2073+ deallocator: nil ];
2074+ id <MTLBuffer > vc_buf = [tq_mtl_device newBufferWithBytesNoCopy: value_cache
2075+ length: cache_total
2076+ options: MTLResourceStorageModeShared
2077+ deallocator: nil ];
2078+ if (!kc_buf || !vc_buf) return -1 ;
20692079
20702080 /* Weight norm buffers (zero-copy) */
20712081 id <MTLBuffer > attn_norm_buf = tq_get_weight_buffer (w_attn_norm, (size_t )dim * sizeof (float ));
20722082 id <MTLBuffer > ffn_norm_buf = tq_get_weight_buffer (w_ffn_norm, (size_t )dim * sizeof (float ));
20732083 if (!attn_norm_buf || !ffn_norm_buf) return -1 ;
20742084
2075- /* ---- Create ONE command buffer + ONE encoder ---- */
2085+ /* ===== ONE command buffer, ONE encoder, ONE commit =====
2086+ * All operations encoded sequentially with memory barriers.
2087+ * GPU executes the entire layer pipeline without CPU sync. */
20762088 id <MTLCommandBuffer > cmdBuf = [tq_mtl_queue commandBuffer ];
20772089 if (!cmdBuf) return -1 ;
2078-
20792090 id <MTLComputeCommandEncoder > enc = [cmdBuf computeCommandEncoder ];
20802091 if (!enc) return -1 ;
20812092
@@ -2090,104 +2101,75 @@ int tq_metal_forward_layer(
20902101 /* ---- Step 3: RoPE on Q and K ---- */
20912102 encode_rope (enc, g_gpu_q, g_gpu_k, pos, head_dim, n_heads, n_kv_heads, rope_base);
20922103
2093- /* ---- Step 4: Store K,V to cache position, then attention ----
2094- * We need to copy Q's K and V into the cache at position pos.
2095- * Since the encoder is running on GPU, we use a blit-like approach:
2096- * write K and V at offset pos*kv_dim in the cache buffers.
2097- * We can do this with add_vectors(cache_pos = 0 + k) trick,
2098- * but simpler: endEncoding, blit, re-encode. Even better: the cache
2099- * was already uploaded, we just need to update position pos. */
2100-
2101- /* End encoder to do the cache write via CPU (unified memory means
2102- * the GPU buffer contents pointer is CPU-accessible after GPU completes).
2103- * But we want zero sync! Alternative: use a tiny copy kernel.
2104- * For now: use memcpy into the shared buffer directly before commit.
2105- * The encoder hasn't committed yet, so GPU hasn't started.
2106- * Writes to shared memory before commit are visible to GPU. */
2107- [enc endEncoding ];
2108-
2109- /* Write K,V at position pos in cache buffers (CPU write to shared memory
2110- * is visible to GPU because command buffer hasn't been committed yet) */
2104+ /* ---- Step 4: Write K,V to cache ON GPU (no CPU sync!) ---- */
21112105 {
2112- /* We need the Q,K,V results from GPU first. But GPU hasn't run yet!
2113- * Solution: commit this batch, wait, then do attention in a second batch.
2114- * This is still 2 commits per layer instead of N, a big improvement.
2115- *
2116- * Alternative: pre-upload K,V into cache before attention.
2117- * The K,V from the projection are only available after GPU runs.
2118- * So we must split into Phase A (projection + RoPE) and Phase B (attention + FFN). */
2119-
2120- [cmdBuf commit ];
2121- [cmdBuf waitUntilCompleted ];
2122- if (cmdBuf.status == MTLCommandBufferStatusError ) {
2123- NSLog (@" TurboQuant: GPU graph Phase A error: %@ " , cmdBuf.error );
2124- return -1 ;
2125- }
2106+ id <MTLBuffer > pos_buf = tq_get_dim_buffer ((uint32_t )pos);
2107+ id <MTLBuffer > kvd_buf = tq_get_dim_buffer ((uint32_t )kv_dim);
2108+
2109+ /* Write K to cache */
2110+ [enc setComputePipelineState: tq_pipe_kv_cache_write];
2111+ [enc setBuffer: kc_buf offset: 0 atIndex: 0 ];
2112+ [enc setBuffer: g_gpu_k offset: 0 atIndex: 1 ];
2113+ [enc setBuffer: pos_buf offset: 0 atIndex: 2 ];
2114+ [enc setBuffer: kvd_buf offset: 0 atIndex: 3 ];
2115+ [enc dispatchThreads: MTLSizeMake (kv_dim, 1 , 1 )
2116+ threadsPerThreadgroup: MTLSizeMake (MIN (kv_dim, 256 ), 1 , 1 )];
2117+ [enc memoryBarrierWithScope: MTLBarrierScopeBuffers ];
21262118
2127- /* Copy K,V results into cache (GPU buffer → cache GPU buffer) */
2128- float * gpu_k_ptr = (float *)[g_gpu_k contents ];
2129- float * gpu_v_ptr = (float *)[g_gpu_v contents ];
2130- float * kc_ptr = (float *)[g_gpu_key_cache contents ];
2131- float * vc_ptr = (float *)[g_gpu_val_cache contents ];
2132- memcpy (kc_ptr + pos * kv_dim, gpu_k_ptr, (size_t )kv_dim * sizeof (float ));
2133- memcpy (vc_ptr + pos * kv_dim, gpu_v_ptr, (size_t )kv_dim * sizeof (float ));
2134-
2135- /* Also write back to CPU KV cache for future layers / positions */
2136- memcpy (key_cache + pos * kv_dim, gpu_k_ptr, (size_t )kv_dim * sizeof (float ));
2137- memcpy (value_cache + pos * kv_dim, gpu_v_ptr, (size_t )kv_dim * sizeof (float ));
2119+ /* Write V to cache */
2120+ [enc setBuffer: vc_buf offset: 0 atIndex: 0 ];
2121+ [enc setBuffer: g_gpu_v offset: 0 atIndex: 1 ];
2122+ [enc dispatchThreads: MTLSizeMake (kv_dim, 1 , 1 )
2123+ threadsPerThreadgroup: MTLSizeMake (MIN (kv_dim, 256 ), 1 , 1 )];
2124+ [enc memoryBarrierWithScope: MTLBarrierScopeBuffers ];
21382125 }
21392126
2140- /* ---- Phase B: Attention + O-proj + FFN (single commit) ---- */
2141- id <MTLCommandBuffer > cmdBuf2 = [tq_mtl_queue commandBuffer ];
2142- if (!cmdBuf2) return -1 ;
2143- id <MTLComputeCommandEncoder > enc2 = [cmdBuf2 computeCommandEncoder ];
2144- if (!enc2) return -1 ;
2145-
2146- /* Attention scores: Q * K^T for all positions */
2127+ /* ---- Step 4b: Attention (reads from the GPU KV cache directly) ---- */
21472128 int attn_seq_len = pos + 1 ;
2148- encode_attn_qk (enc2, g_gpu_q, g_gpu_key_cache, g_gpu_att,
2129+ /* Attention uses same encoder — single command buffer! */
2130+ encode_attn_qk (enc, g_gpu_q, kc_buf, g_gpu_att,
21492131 head_dim, attn_seq_len, n_heads, n_kv_heads, kv_dim);
21502132
21512133 /* Softmax over attention scores per head */
2152- encode_softmax (enc2 , g_gpu_att, n_heads, attn_seq_len);
2134+ encode_softmax (enc , g_gpu_att, n_heads, attn_seq_len);
21532135
21542136 /* Weighted sum of values → xb (reuse xb for attention output) */
2155- encode_attn_v (enc2 , g_gpu_att, g_gpu_val_cache , g_gpu_xb,
2137+ encode_attn_v (enc , g_gpu_att, vc_buf , g_gpu_xb,
21562138 head_dim, attn_seq_len, n_heads, n_kv_heads, kv_dim);
21572139
21582140 /* ---- Step 5: Output projection (xb → xb2) ---- */
2159- encode_q4_matmul (enc2 , g_gpu_xb, g_gpu_xb2, wo_qs, wo_sc, dim, q_dim);
2141+ encode_q4_matmul (enc , g_gpu_xb, g_gpu_xb2, wo_qs, wo_sc, dim, q_dim);
21602142
21612143 /* ---- Step 6: Residual add (x += xb2) ---- */
2162- encode_add_inplace (enc2 , g_gpu_x, g_gpu_xb2, dim);
2144+ encode_add_inplace (enc , g_gpu_x, g_gpu_xb2, dim);
21632145
21642146 /* ---- Step 7: Pre-FFN RMSNorm(x → xb) ---- */
2165- encode_rmsnorm (enc2 , g_gpu_x, ffn_norm_buf, g_gpu_xb, dim, rms_eps);
2147+ encode_rmsnorm (enc , g_gpu_x, ffn_norm_buf, g_gpu_xb, dim, rms_eps);
21662148
21672149 /* ---- Step 8: FFN gate + up projections ---- */
2168- encode_q4_matmul (enc2 , g_gpu_xb, g_gpu_hb, wg_qs, wg_sc, inter_dim, dim);
2169- encode_q4_matmul (enc2 , g_gpu_xb, g_gpu_hb2, wu_qs, wu_sc, inter_dim, dim);
2150+ encode_q4_matmul (enc , g_gpu_xb, g_gpu_hb, wg_qs, wg_sc, inter_dim, dim);
2151+ encode_q4_matmul (enc , g_gpu_xb, g_gpu_hb2, wu_qs, wu_sc, inter_dim, dim);
21702152
21712153 /* ---- Step 9: Activation + gate multiply ---- */
21722154 if (use_gelu) {
2173- encode_gelu_tanh (enc2 , g_gpu_hb, inter_dim);
2155+ encode_gelu_tanh (enc , g_gpu_hb, inter_dim);
21742156 } else {
2175- encode_silu (enc2 , g_gpu_hb, g_gpu_hb, inter_dim);
2157+ encode_silu (enc , g_gpu_hb, g_gpu_hb, inter_dim);
21762158 }
2177- encode_mul (enc2 , g_gpu_hb, g_gpu_hb2, g_gpu_hb, inter_dim);
2159+ encode_mul (enc , g_gpu_hb, g_gpu_hb2, g_gpu_hb, inter_dim);
21782160
21792161 /* ---- Step 10: Down projection (hb → xb2) ---- */
2180- encode_q4_matmul (enc2 , g_gpu_hb, g_gpu_xb2, wd_qs, wd_sc, dim, inter_dim);
2162+ encode_q4_matmul (enc , g_gpu_hb, g_gpu_xb2, wd_qs, wd_sc, dim, inter_dim);
21812163
21822164 /* ---- Step 11: Residual add (x += xb2) ---- */
2183- encode_add_inplace (enc2 , g_gpu_x, g_gpu_xb2, dim);
2165+ encode_add_inplace (enc , g_gpu_x, g_gpu_xb2, dim);
21842166
2185- [enc2 endEncoding ];
2186- [cmdBuf2 commit ];
2187- [cmdBuf2 waitUntilCompleted ];
2167+ [enc endEncoding ];
2168+ [cmdBuf commit ];
2169+ [cmdBuf waitUntilCompleted ];
21882170
2189- if (cmdBuf2 .status == MTLCommandBufferStatusError ) {
2190- NSLog (@" TurboQuant: GPU graph Phase B error: %@ " , cmdBuf2 .error );
2171+ if (cmdBuf .status == MTLCommandBufferStatusError ) {
2172+ NSLog (@" TurboQuant: GPU graph Phase B error: %@ " , cmdBuf .error );
21912173 return -1 ;
21922174 }
21932175
0 commit comments