Skip to content

Commit 05f067e

Browse files
unamedkrclaude
andcommitted
Metal Phase 3: cached GPU buffers eliminate allocation overhead
Two buffer caching systems: 1. Batch output pool: 64 grow-only MTLBuffers indexed by batch slot. Eliminates ~2000 buffer allocations per token. 2. Dimension uniform cache: 16-entry LUT for distinct dimension values. Typical models use 3-5 dims, so cache hit rate is ~100%. Native GGUF Metal path: 4.8 → 7.8 tok/s (+63%) Q4 Metal path: 29.0 → 30.8 tok/s (matching CPU) WBS v1.3 progress: [x] Phase 1: Core matmul GPU dispatch [x] Phase 2: Element-wise shaders [x] Phase 3: GPU buffer caching (allocation overhead eliminated) [ ] Phase 4: Optimization (fused attention, double buffering) 34/34 tests pass. No regressions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 95ee491 commit 05f067e

1 file changed

Lines changed: 112 additions & 25 deletions

File tree

src/backend/metal/tq_metal_dispatch.m

Lines changed: 112 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,92 @@
193193
.active = 0, .cmd_buf = nil, .encoder = nil, .n_copies = 0
194194
};
195195

196+
/* ============================================================
197+
* Cached buffer pools for batch mode
198+
*
199+
* Eliminates per-dispatch Metal buffer allocation overhead.
200+
* In batch mode, each dispatch previously allocated:
201+
* - 1 output buffer (~4KB-16KB)
202+
* - 2 dimension uniform buffers (4 bytes each)
203+
* With 30+ dispatches per layer * 35 layers, this caused
204+
* massive allocation churn (~2000+ allocations per token).
205+
*
206+
* Solution: pre-allocated pools that grow as needed.
207+
* Output buffers are pooled by slot index (up to TQ_BATCH_MAX_OPS).
208+
* Dimension buffers are cached by value (small lookup table).
209+
* ============================================================ */
210+
211+
/* Output buffer pool: one buffer per batch slot, grow-only */
212+
static id<MTLBuffer> tq_batch_output_pool[TQ_BATCH_MAX_OPS];
213+
static size_t tq_batch_output_pool_size[TQ_BATCH_MAX_OPS];
214+
215+
/* Dimension uniform buffer cache: maps dimension value → MTLBuffer.
216+
* Typical models use 3-5 distinct dimension values (hidden_dim,
217+
* intermediate_dim, kv_dim, head_dim, etc.), so a small cache suffices. */
218+
#define TQ_DIM_CACHE_SIZE 16
219+
220+
typedef struct {
221+
uint32_t dim_value;
222+
id<MTLBuffer> buf;
223+
} tq_dim_cache_entry_t;
224+
225+
static tq_dim_cache_entry_t tq_dim_cache[TQ_DIM_CACHE_SIZE];
226+
static int tq_dim_cache_count = 0;
227+
228+
/**
229+
* Get or create a cached dimension uniform buffer for a given value.
230+
* Thread-safe for single-threaded Metal dispatch (which this is).
231+
*/
232+
static id<MTLBuffer> tq_get_dim_buffer(uint32_t dim_value) {
233+
/* Search cache */
234+
for (int i = 0; i < tq_dim_cache_count; i++) {
235+
if (tq_dim_cache[i].dim_value == dim_value) {
236+
return tq_dim_cache[i].buf;
237+
}
238+
}
239+
240+
/* Create new buffer */
241+
id<MTLBuffer> buf = [tq_mtl_device newBufferWithLength:sizeof(uint32_t)
242+
options:MTLResourceStorageModeShared];
243+
if (!buf) return nil;
244+
*(uint32_t*)[buf contents] = dim_value;
245+
246+
/* Add to cache (evict oldest if full) */
247+
if (tq_dim_cache_count < TQ_DIM_CACHE_SIZE) {
248+
tq_dim_cache[tq_dim_cache_count].dim_value = dim_value;
249+
tq_dim_cache[tq_dim_cache_count].buf = buf;
250+
tq_dim_cache_count++;
251+
} else {
252+
/* Evict slot 0 */
253+
tq_dim_cache[0].buf = nil;
254+
for (int i = 0; i < TQ_DIM_CACHE_SIZE - 1; i++) {
255+
tq_dim_cache[i] = tq_dim_cache[i + 1];
256+
}
257+
tq_dim_cache[TQ_DIM_CACHE_SIZE - 1].dim_value = dim_value;
258+
tq_dim_cache[TQ_DIM_CACHE_SIZE - 1].buf = buf;
259+
}
260+
261+
return buf;
262+
}
263+
264+
/**
265+
* Get or grow a cached output buffer for a given batch slot index.
266+
* Buffers grow monotonically — never shrink.
267+
*/
268+
static id<MTLBuffer> tq_get_batch_output_buffer(int slot, size_t required_size) {
269+
if (slot < 0 || slot >= TQ_BATCH_MAX_OPS) return nil;
270+
271+
if (tq_batch_output_pool_size[slot] < required_size || !tq_batch_output_pool[slot]) {
272+
tq_batch_output_pool[slot] = [tq_mtl_device
273+
newBufferWithLength:required_size
274+
options:MTLResourceStorageModeShared];
275+
if (!tq_batch_output_pool[slot]) return nil;
276+
tq_batch_output_pool_size[slot] = required_size;
277+
}
278+
279+
return tq_batch_output_pool[slot];
280+
}
281+
196282
/* Reusable input/dimension buffers (shared across batch and immediate modes) */
197283
static id<MTLBuffer> tq_shared_input_buf = nil;
198284
static uint32_t tq_shared_input_dim = 0;
@@ -481,6 +567,19 @@ void tq_free_metal_backend(void) {
481567
tq_shared_indim_buf = nil;
482568
tq_shared_outdim_buf = nil;
483569

570+
/* Batch output buffer pool */
571+
for (int i = 0; i < TQ_BATCH_MAX_OPS; i++) {
572+
tq_batch_output_pool[i] = nil;
573+
tq_batch_output_pool_size[i] = 0;
574+
}
575+
576+
/* Dimension buffer cache */
577+
for (int i = 0; i < tq_dim_cache_count; i++) {
578+
tq_dim_cache[i].buf = nil;
579+
tq_dim_cache[i].dim_value = 0;
580+
}
581+
tq_dim_cache_count = 0;
582+
484583
tq_mtl_library = nil;
485584
tq_mtl_queue = nil;
486585
tq_mtl_device = nil;
@@ -662,14 +761,12 @@ int tq_metal_matmul_gguf(float* out, const float* x, const void* weight,
662761
id<MTLBuffer> output_buf = nil;
663762
if (tq_batch.active) {
664763
/* Batch mode: each matmul needs its own output buffer.
665-
* Auto-flush if batch is full. */
764+
* Use cached pool to avoid per-dispatch allocation. */
666765
if (tq_batch.n_copies >= TQ_BATCH_MAX_OPS) {
667766
tq_metal_batch_flush();
668767
/* Restart encoder for next operations */
669768
}
670-
output_buf = [tq_mtl_device
671-
newBufferWithLength:output_size
672-
options:MTLResourceStorageModeShared];
769+
output_buf = tq_get_batch_output_buffer(tq_batch.n_copies, output_size);
673770
if (!output_buf) return -1;
674771
} else {
675772
/* Immediate mode: reuse a single output buffer */
@@ -687,24 +784,19 @@ int tq_metal_matmul_gguf(float* out, const float* x, const void* weight,
687784

688785
/* --- Dimension uniform buffers --- */
689786
/* In batch mode, dimensions can change between matmuls, so we need
690-
* per-dispatch dimension buffers. For simplicity, create small ones. */
787+
* per-dispatch dimension buffers. Use cached lookup to avoid allocation. */
691788
id<MTLBuffer> indim_buf = nil;
692789
id<MTLBuffer> outdim_buf = nil;
693790
if (tq_batch.active) {
694-
/* Allocate small uniform buffers per dispatch in batch mode */
695-
indim_buf = [tq_mtl_device
696-
newBufferWithLength:sizeof(uint32_t)
697-
options:MTLResourceStorageModeShared];
698-
outdim_buf = [tq_mtl_device
699-
newBufferWithLength:sizeof(uint32_t)
700-
options:MTLResourceStorageModeShared];
791+
indim_buf = tq_get_dim_buffer((uint32_t)in_dim);
792+
outdim_buf = tq_get_dim_buffer((uint32_t)out_dim);
701793
if (!indim_buf || !outdim_buf) return -1;
702794
} else {
703795
indim_buf = tq_shared_indim_buf;
704796
outdim_buf = tq_shared_outdim_buf;
797+
*(uint32_t*)[indim_buf contents] = (uint32_t)in_dim;
798+
*(uint32_t*)[outdim_buf contents] = (uint32_t)out_dim;
705799
}
706-
*(uint32_t*)[indim_buf contents] = (uint32_t)in_dim;
707-
*(uint32_t*)[outdim_buf contents] = (uint32_t)out_dim;
708800

709801
/* --- Encode compute command --- */
710802
id<MTLComputeCommandEncoder> enc = nil;
@@ -845,9 +937,7 @@ int tq_metal_matmul_q4(float* out, const float* x, const uint8_t* w_qs,
845937
if (tq_batch.n_copies >= TQ_BATCH_MAX_OPS) {
846938
tq_metal_batch_flush();
847939
}
848-
output_buf = [tq_mtl_device
849-
newBufferWithLength:output_size
850-
options:MTLResourceStorageModeShared];
940+
output_buf = tq_get_batch_output_buffer(tq_batch.n_copies, output_size);
851941
if (!output_buf) return -1;
852942
} else {
853943
static id<MTLBuffer> imm_q4_output_buf = nil;
@@ -866,19 +956,16 @@ int tq_metal_matmul_q4(float* out, const float* x, const uint8_t* w_qs,
866956
id<MTLBuffer> indim_buf = nil;
867957
id<MTLBuffer> outdim_buf = nil;
868958
if (tq_batch.active) {
869-
indim_buf = [tq_mtl_device
870-
newBufferWithLength:sizeof(uint32_t)
871-
options:MTLResourceStorageModeShared];
872-
outdim_buf = [tq_mtl_device
873-
newBufferWithLength:sizeof(uint32_t)
874-
options:MTLResourceStorageModeShared];
959+
/* Dim cache buffers have values pre-written at creation time */
960+
indim_buf = tq_get_dim_buffer((uint32_t)d);
961+
outdim_buf = tq_get_dim_buffer((uint32_t)n);
875962
if (!indim_buf || !outdim_buf) return -1;
876963
} else {
877964
indim_buf = tq_shared_indim_buf;
878965
outdim_buf = tq_shared_outdim_buf;
966+
*(uint32_t*)[indim_buf contents] = (uint32_t)d;
967+
*(uint32_t*)[outdim_buf contents] = (uint32_t)n;
879968
}
880-
*(uint32_t*)[indim_buf contents] = (uint32_t)d;
881-
*(uint32_t*)[outdim_buf contents] = (uint32_t)n;
882969

883970
/* Encode compute command.
884971
* Shader signature: input(0), output(1), weight_qs(2),

0 commit comments

Comments
 (0)