193193 .active = 0 , .cmd_buf = nil , .encoder = nil , .n_copies = 0
194194};
195195
196+ /* ============================================================
197+ * Cached buffer pools for batch mode
198+ *
199+ * Eliminates per-dispatch Metal buffer allocation overhead.
200+ * In batch mode, each dispatch previously allocated:
201+ * - 1 output buffer (~4KB-16KB)
202+ * - 2 dimension uniform buffers (4 bytes each)
203+ * With 30+ dispatches per layer * 35 layers, this caused
204+ * massive allocation churn (~2000+ allocations per token).
205+ *
206+ * Solution: pre-allocated pools that grow as needed.
207+ * Output buffers are pooled by slot index (up to TQ_BATCH_MAX_OPS).
208+ * Dimension buffers are cached by value (small lookup table).
209+ * ============================================================ */
210+
/* Output buffer pool: one buffer per batch slot, grow-only.
 * Slot i holds the output buffer for the i-th dispatch of the current
 * batch; tq_batch_output_pool_size[i] tracks its allocated byte length. */
static id<MTLBuffer> tq_batch_output_pool[TQ_BATCH_MAX_OPS];
static size_t tq_batch_output_pool_size[TQ_BATCH_MAX_OPS];

/* Dimension uniform buffer cache: maps dimension value → MTLBuffer.
 * Typical models use 3-5 distinct dimension values (hidden_dim,
 * intermediate_dim, kv_dim, head_dim, etc.), so a small cache suffices. */
#define TQ_DIM_CACHE_SIZE 16

/* One cache entry: a dimension value plus the 4-byte shared-storage
 * uniform buffer holding it (the value is written once at creation). */
typedef struct {
    uint32_t dim_value;
    id<MTLBuffer> buf;
} tq_dim_cache_entry_t;

static tq_dim_cache_entry_t tq_dim_cache[TQ_DIM_CACHE_SIZE];
/* Number of valid entries in tq_dim_cache (filled front-to-back). */
static int tq_dim_cache_count = 0;
227+
228+ /* *
229+ * Get or create a cached dimension uniform buffer for a given value.
230+ * Thread-safe for single-threaded Metal dispatch (which this is).
231+ */
232+ static id <MTLBuffer > tq_get_dim_buffer (uint32_t dim_value) {
233+ /* Search cache */
234+ for (int i = 0 ; i < tq_dim_cache_count; i++) {
235+ if (tq_dim_cache[i].dim_value == dim_value) {
236+ return tq_dim_cache[i].buf ;
237+ }
238+ }
239+
240+ /* Create new buffer */
241+ id <MTLBuffer > buf = [tq_mtl_device newBufferWithLength: sizeof (uint32_t )
242+ options: MTLResourceStorageModeShared ];
243+ if (!buf) return nil ;
244+ *(uint32_t *)[buf contents ] = dim_value;
245+
246+ /* Add to cache (evict oldest if full) */
247+ if (tq_dim_cache_count < TQ_DIM_CACHE_SIZE) {
248+ tq_dim_cache[tq_dim_cache_count].dim_value = dim_value;
249+ tq_dim_cache[tq_dim_cache_count].buf = buf;
250+ tq_dim_cache_count++;
251+ } else {
252+ /* Evict slot 0 */
253+ tq_dim_cache[0 ].buf = nil ;
254+ for (int i = 0 ; i < TQ_DIM_CACHE_SIZE - 1 ; i++) {
255+ tq_dim_cache[i] = tq_dim_cache[i + 1 ];
256+ }
257+ tq_dim_cache[TQ_DIM_CACHE_SIZE - 1 ].dim_value = dim_value;
258+ tq_dim_cache[TQ_DIM_CACHE_SIZE - 1 ].buf = buf;
259+ }
260+
261+ return buf;
262+ }
263+
264+ /* *
265+ * Get or grow a cached output buffer for a given batch slot index.
266+ * Buffers grow monotonically — never shrink.
267+ */
268+ static id <MTLBuffer > tq_get_batch_output_buffer (int slot, size_t required_size) {
269+ if (slot < 0 || slot >= TQ_BATCH_MAX_OPS) return nil ;
270+
271+ if (tq_batch_output_pool_size[slot] < required_size || !tq_batch_output_pool[slot]) {
272+ tq_batch_output_pool[slot] = [tq_mtl_device
273+ newBufferWithLength: required_size
274+ options: MTLResourceStorageModeShared ];
275+ if (!tq_batch_output_pool[slot]) return nil ;
276+ tq_batch_output_pool_size[slot] = required_size;
277+ }
278+
279+ return tq_batch_output_pool[slot];
280+ }
281+
196282/* Reusable input/dimension buffers (shared across batch and immediate modes) */
197283static id <MTLBuffer > tq_shared_input_buf = nil ;
198284static uint32_t tq_shared_input_dim = 0 ;
@@ -481,6 +567,19 @@ void tq_free_metal_backend(void) {
481567 tq_shared_indim_buf = nil ;
482568 tq_shared_outdim_buf = nil ;
483569
570+ /* Batch output buffer pool */
571+ for (int i = 0 ; i < TQ_BATCH_MAX_OPS; i++) {
572+ tq_batch_output_pool[i] = nil ;
573+ tq_batch_output_pool_size[i] = 0 ;
574+ }
575+
576+ /* Dimension buffer cache */
577+ for (int i = 0 ; i < tq_dim_cache_count; i++) {
578+ tq_dim_cache[i].buf = nil ;
579+ tq_dim_cache[i].dim_value = 0 ;
580+ }
581+ tq_dim_cache_count = 0 ;
582+
484583 tq_mtl_library = nil ;
485584 tq_mtl_queue = nil ;
486585 tq_mtl_device = nil ;
@@ -662,14 +761,12 @@ int tq_metal_matmul_gguf(float* out, const float* x, const void* weight,
662761 id <MTLBuffer > output_buf = nil ;
663762 if (tq_batch.active ) {
664763 /* Batch mode: each matmul needs its own output buffer.
665- * Auto-flush if batch is full . */
764+ * Use cached pool to avoid per-dispatch allocation . */
666765 if (tq_batch.n_copies >= TQ_BATCH_MAX_OPS) {
667766 tq_metal_batch_flush ();
668767 /* Restart encoder for next operations */
669768 }
670- output_buf = [tq_mtl_device
671- newBufferWithLength: output_size
672- options: MTLResourceStorageModeShared ];
769+ output_buf = tq_get_batch_output_buffer (tq_batch.n_copies , output_size);
673770 if (!output_buf) return -1 ;
674771 } else {
675772 /* Immediate mode: reuse a single output buffer */
@@ -687,24 +784,19 @@ int tq_metal_matmul_gguf(float* out, const float* x, const void* weight,
687784
688785 /* --- Dimension uniform buffers --- */
689786 /* In batch mode, dimensions can change between matmuls, so we need
690- * per-dispatch dimension buffers. For simplicity, create small ones . */
787+ * per-dispatch dimension buffers. Use cached lookup to avoid allocation . */
691788 id <MTLBuffer > indim_buf = nil ;
692789 id <MTLBuffer > outdim_buf = nil ;
693790 if (tq_batch.active ) {
694- /* Allocate small uniform buffers per dispatch in batch mode */
695- indim_buf = [tq_mtl_device
696- newBufferWithLength: sizeof (uint32_t )
697- options: MTLResourceStorageModeShared ];
698- outdim_buf = [tq_mtl_device
699- newBufferWithLength: sizeof (uint32_t )
700- options: MTLResourceStorageModeShared ];
791+ indim_buf = tq_get_dim_buffer ((uint32_t )in_dim);
792+ outdim_buf = tq_get_dim_buffer ((uint32_t )out_dim);
701793 if (!indim_buf || !outdim_buf) return -1 ;
702794 } else {
703795 indim_buf = tq_shared_indim_buf;
704796 outdim_buf = tq_shared_outdim_buf;
797+ *(uint32_t *)[indim_buf contents ] = (uint32_t )in_dim;
798+ *(uint32_t *)[outdim_buf contents ] = (uint32_t )out_dim;
705799 }
706- *(uint32_t *)[indim_buf contents ] = (uint32_t )in_dim;
707- *(uint32_t *)[outdim_buf contents ] = (uint32_t )out_dim;
708800
709801 /* --- Encode compute command --- */
710802 id <MTLComputeCommandEncoder > enc = nil ;
@@ -845,9 +937,7 @@ int tq_metal_matmul_q4(float* out, const float* x, const uint8_t* w_qs,
845937 if (tq_batch.n_copies >= TQ_BATCH_MAX_OPS) {
846938 tq_metal_batch_flush ();
847939 }
848- output_buf = [tq_mtl_device
849- newBufferWithLength: output_size
850- options: MTLResourceStorageModeShared ];
940+ output_buf = tq_get_batch_output_buffer (tq_batch.n_copies , output_size);
851941 if (!output_buf) return -1 ;
852942 } else {
853943 static id <MTLBuffer > imm_q4_output_buf = nil ;
@@ -866,19 +956,16 @@ int tq_metal_matmul_q4(float* out, const float* x, const uint8_t* w_qs,
866956 id <MTLBuffer > indim_buf = nil ;
867957 id <MTLBuffer > outdim_buf = nil ;
868958 if (tq_batch.active ) {
869- indim_buf = [tq_mtl_device
870- newBufferWithLength: sizeof (uint32_t )
871- options: MTLResourceStorageModeShared ];
872- outdim_buf = [tq_mtl_device
873- newBufferWithLength: sizeof (uint32_t )
874- options: MTLResourceStorageModeShared ];
959+ /* Dim cache buffers have values pre-written at creation time */
960+ indim_buf = tq_get_dim_buffer ((uint32_t )d);
961+ outdim_buf = tq_get_dim_buffer ((uint32_t )n);
875962 if (!indim_buf || !outdim_buf) return -1 ;
876963 } else {
877964 indim_buf = tq_shared_indim_buf;
878965 outdim_buf = tq_shared_outdim_buf;
966+ *(uint32_t *)[indim_buf contents ] = (uint32_t )d;
967+ *(uint32_t *)[outdim_buf contents ] = (uint32_t )n;
879968 }
880- *(uint32_t *)[indim_buf contents ] = (uint32_t )d;
881- *(uint32_t *)[outdim_buf contents ] = (uint32_t )n;
882969
883970 /* Encode compute command.
884971 * Shader signature: input(0), output(1), weight_qs(2),
0 commit comments