Commit 26bbd49

fix(compute): reuse dst GPU memory instead of allocating per call (#84)
GPU ops (gpuBinaryOp, gpuUnaryOp, gpuScalarOp, Transpose, MatMul, Sum) were allocating fresh device memory via pool.Alloc on every call, even when a pre-sized dst tensor was provided, then swapping dst's storage to the new allocation. The old GPUStorage was orphaned and depended on Go's GC finalizer to call pool.Free. At large training shapes, with hundreds of batches and ~20 ops per batch, orphaned allocations piled up faster than the GC could reclaim them, causing unbounded GPU memory growth and OOM.

Fix: add a tryReuseDstPtr helper that checks whether dst[0] already has a GPUStorage with sufficient capacity. If so, the kernel writes directly into the existing device pointer: no pool.Alloc, no orphaned storage, no GC pressure. When dst is nil or undersized, the existing alloc path is preserved unchanged.

Applied to the six hot-path op families that cover PatchTST GPU training:

- gpuBinaryOp (Add, Sub, Mul same-shape)
- gpuUnaryOp (Exp, Log, Sin, Cos, Tanh, Sqrt)
- gpuScalarOp (MulScalar, AddScalar, DivScalar)
- Transpose (gpu_engine_memory.go)
- MatMul standard float32 path (gpu_engine.go)
- Sum/ReduceSum (gpu_kernels.go)

Other ops (broadcast, Q4/Q8/BF16 matmul, fused kernels) continue to use the existing alloc path and can be converted incrementally.

Full ztensor test suite passes on the CPU host.

Closes #84
Refs zerfoo/zerfoo#373
1 parent 18a53fe commit 26bbd49

3 files changed: 124 additions & 31 deletions
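The leak mechanics are easy to model off-GPU. The sketch below is illustrative only: toyPool, storage, and the sizes are stand-ins invented for this note, not code from the repo, and the finalizer's counter update is unsynchronized. What it reproduces is the key property the commit message relies on: orphaned device memory is invisible to Go's GC, so finalizer-driven reclamation has no reason to keep pace with allocation.

```go
package main

import (
    "fmt"
    "runtime"
)

// toyPool stands in for the GPU memory pool. Device memory is modeled
// as a plain byte counter: it is invisible to Go's GC, which is exactly
// why finalizer-based reclamation lags behind.
type toyPool struct{ live int }

// storage mimics the pre-#84 GPUStorage: a tiny Go object owning a
// large "device" allocation, returned to the pool only by a finalizer.
type storage struct {
    pool *toyPool
    size int
}

func newStorage(p *toyPool, n int) *storage {
    s := &storage{pool: p, size: n}
    p.live += n
    runtime.SetFinalizer(s, func(s *storage) { s.pool.live -= s.size })
    return s
}

func main() {
    const ops, bytesPerOp = 1000, 1 << 20 // ~20 ops/batch over many batches

    // Old path: every op allocates a fresh storage and orphans the last.
    // Each orphan is a few bytes of Go heap, so the GC sees no pressure,
    // almost no finalizers run, and "device" memory balloons.
    oldPool := &toyPool{}
    var dst *storage
    for i := 0; i < ops; i++ {
        dst = newStorage(oldPool, bytesPerOp)
    }
    _ = dst
    fmt.Printf("alloc per call: ~%d MiB live\n", oldPool.live>>20)

    // New path: one allocation, every op writes into it in place.
    newPool := &toyPool{}
    reused := newStorage(newPool, bytesPerOp)
    _ = reused
    fmt.Printf("reuse dst:       %d MiB live\n", newPool.live>>20)
}
```

A typical run prints roughly 1000 MiB live for the alloc-per-call loop against 1 MiB for the reuse path; exact numbers depend on GC timing.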

compute/gpu_engine.go (21 additions & 8 deletions)
```diff
@@ -978,13 +978,16 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T]
         return nil, err
     }
 
-    // Allocate device output.
-    devCTotal, err := e.pool.Alloc(e.deviceID, outputBytes)
-    if err != nil {
-        e.oomFallbackCount.Add(1)
-        e.logger.Warn("MatMul: GPU output alloc failed, falling back to CPU", "error", err.Error())
+    // Reuse dst's existing GPU memory when possible (#84).
+    devCTotal, reusedC := tryReuseDstPtr[T](batchSize*cMatSize, dst)
+    if !reusedC {
+        devCTotal, err = e.pool.Alloc(e.deviceID, outputBytes)
+        if err != nil {
+            e.oomFallbackCount.Add(1)
+            e.logger.Warn("MatMul: GPU output alloc failed, falling back to CPU", "error", err.Error())
 
-        return e.cpu.MatMul(ctx, a, b, dst...)
+            return e.cpu.MatMul(ctx, a, b, dst...)
+        }
     }
 
     // Use strided batched GEMM when available for float32 with batch > 1.
@@ -1013,9 +1016,14 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T]
         if err := batched.SgemmStridedBatched(m, n, k, 1.0,
             devA, strideA, devB, strideBVal, 0.0,
             devCTotal, strideC, batchSize); err != nil {
-            e.pool.Free(e.deviceID, devCTotal, outputBytes)
+            if !reusedC {
+                e.pool.Free(e.deviceID, devCTotal, outputBytes)
+            }
             return nil, fmt.Errorf("MatMul: batched GEMM: %w", err)
         }
+        if reusedC {
+            return finishReusedDst[T](dst[0], outShape), nil
+        }
         return makeGPUResult[T](e, outShape, devCTotal, batchSize*cMatSize, dst...)
     }
 }
@@ -1052,12 +1060,17 @@ func (e *GPUEngine[T]) MatMul(ctx context.Context, a, b *tensor.TensorNumeric[T]
         }
 
         if blasErr != nil {
-            e.pool.Free(e.deviceID, devCTotal, outputBytes)
+            if !reusedC {
+                e.pool.Free(e.deviceID, devCTotal, outputBytes)
+            }
 
             return nil, fmt.Errorf("MatMul: BLAS batch %d: %w", batch, blasErr)
         }
     }
 
+    if reusedC {
+        return finishReusedDst[T](dst[0], outShape), nil
+    }
     return makeGPUResult[T](e, outShape, devCTotal, batchSize*cMatSize, dst...)
 }
 
```

compute/gpu_engine_memory.go (20 additions & 5 deletions)
```diff
@@ -132,9 +132,14 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T]
     }
 
     byteSize := total * f32Size
-    devOut, err := e.pool.Alloc(e.deviceID, byteSize)
-    if err != nil {
-        return e.cpu.Transpose(ctx, a, axes, dst...)
+
+    // Reuse dst's existing GPU memory when possible (#84).
+    devOut, reused := tryReuseDstPtr[T](total, dst)
+    if !reused {
+        devOut, err = e.pool.Alloc(e.deviceID, byteSize)
+        if err != nil {
+            return e.cpu.Transpose(ctx, a, axes, dst...)
+        }
     }
 
     // Fast path: 2D transpose.
@@ -145,9 +150,14 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T]
                 "cols", fmt.Sprintf("%d", shape[1]))
         }
         if err := e.kernels.Transpose2D(devIn, devOut, shape[0], shape[1], e.stream); err != nil {
-            e.pool.Free(e.deviceID, devOut, byteSize)
+            if !reused {
+                e.pool.Free(e.deviceID, devOut, byteSize)
+            }
             return nil, err
         }
+        if reused {
+            return finishReusedDst[T](dst[0], outShape), nil
+        }
         return makeGPUResult[T](e, outShape, devOut, total, dst...)
     }
 
@@ -175,10 +185,15 @@ func (e *GPUEngine[T]) Transpose(ctx context.Context, a *tensor.TensorNumeric[T]
     }
 
     if err := e.kernels.TransposeND(devIn, devOut, inStrides32, outStrides32, perm32, rank, total, e.stream); err != nil {
-        e.pool.Free(e.deviceID, devOut, byteSize)
+        if !reused {
+            e.pool.Free(e.deviceID, devOut, byteSize)
+        }
         return nil, err
     }
 
+    if reused {
+        return finishReusedDst[T](dst[0], outShape), nil
+    }
     return makeGPUResult[T](e, outShape, devOut, total, dst...)
 }
 
```

compute/gpu_kernels.go (83 additions & 18 deletions)
```diff
@@ -115,6 +115,35 @@ func getDevicePtr[T tensor.Numeric](e *GPUEngine[T], t *tensor.TensorNumeric[T])
     return devPtr, cleanup, nil
 }
 
+// tryReuseDstPtr checks whether dst[0] already has a GPUStorage with at least
+// neededElems capacity. If so, it returns the existing device pointer so the
+// caller can write kernel output directly into it, avoiding a pool.Alloc and
+// the resulting GC-pressure from orphaned GPUStorage objects. See ztensor#84.
+func tryReuseDstPtr[T tensor.Numeric](neededElems int, dst []*tensor.TensorNumeric[T]) (unsafe.Pointer, bool) {
+    if len(dst) == 0 || dst[0] == nil {
+        return nil, false
+    }
+    gs, ok := dst[0].GetStorage().(*tensor.GPUStorage[T])
+    if !ok || gs.Len() < neededElems {
+        return nil, false
+    }
+    return gs.Ptr(), true
+}
+
+// finishReusedDst updates dst's shape and strides in place after a kernel has
+// written into dst's existing device memory. No new GPUStorage is created.
+func finishReusedDst[T tensor.Numeric](dst *tensor.TensorNumeric[T], shape []int) *tensor.TensorNumeric[T] {
+    strides := make([]int, len(shape))
+    stride := 1
+    for i := len(shape) - 1; i >= 0; i-- {
+        strides[i] = stride
+        stride *= shape[i]
+    }
+    dst.SetShape(shape)
+    dst.SetStrides(strides)
+    return dst
+}
+
 // makeGPUResult creates a tensor with pool-backed GPUStorage wrapping the given
 // device pointer. When the tensor is freed, the pointer is returned to the pool
 // for reuse instead of calling cudaFree.
```
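The stride loop in finishReusedDst is the standard row-major (C-order) computation: the last axis gets stride 1, and each earlier axis strides over the product of all later dimensions. A standalone re-derivation of that loop as a sanity check (the helper name here is hypothetical, the body is the same):

```go
package main

import "fmt"

// rowMajorStrides mirrors the stride loop in finishReusedDst.
func rowMajorStrides(shape []int) []int {
    strides := make([]int, len(shape))
    stride := 1
    for i := len(shape) - 1; i >= 0; i-- {
        strides[i] = stride
        stride *= shape[i]
    }
    return strides
}

func main() {
    fmt.Println(rowMajorStrides([]int{2, 3, 4})) // [12 4 1]
}
```

Note that finishReusedDst always resets dst to a contiguous row-major layout, whatever strides dst carried before the kernel ran.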
```diff
@@ -522,17 +551,26 @@ func gpuBinaryOp[T tensor.Numeric](
 
     byteSize := n * f32Size
 
-    devC, err := e.pool.Alloc(e.deviceID, byteSize)
-    if err != nil {
-        return nil, err
+    // Reuse dst's existing GPU memory when possible (#84).
+    devC, reused := tryReuseDstPtr[T](n, dst)
+    if !reused {
+        devC, err = e.pool.Alloc(e.deviceID, byteSize)
+        if err != nil {
+            return nil, err
+        }
     }
 
     if err := kernelFn(devA, devB, devC, n, e.stream); err != nil {
-        e.pool.Free(e.deviceID, devC, byteSize)
+        if !reused {
+            e.pool.Free(e.deviceID, devC, byteSize)
+        }
 
         return nil, err
     }
 
+    if reused {
+        return finishReusedDst[T](dst[0], a.Shape()), nil
+    }
     return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
 }
 
```
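From the caller's side the intended steady state is: the first call passes a nil or storage-less dst and takes the pool.Alloc path, which attaches a pool-backed GPUStorage to the result; feeding that result back in as dst makes every later call hit tryReuseDstPtr and write in place. A hypothetical sketch of that loop; the op type is an abstraction invented here, since the engine's public method signatures are not part of this diff:

```go
package caller

import "context"

// binaryOp abstracts an engine method backed by gpuBinaryOp (Add, Sub,
// Mul); the concrete signature is an assumption, not the repo's API.
type binaryOp[T any] func(ctx context.Context, a, b, dst *T) (*T, error)

// runSteps shows the reuse pattern: dst is nil on the first call (alloc
// path), then aliases the same device memory on every later call.
func runSteps[T any](ctx context.Context, op binaryOp[T], a, b *T, steps int) error {
    var dst *T
    for i := 0; i < steps; i++ {
        out, err := op(ctx, a, b, dst)
        if err != nil {
            return err
        }
        dst = out // subsequent calls write in place via tryReuseDstPtr
    }
    return nil
}
```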

```diff
@@ -559,17 +597,26 @@ func gpuUnaryOp[T tensor.Numeric](
 
     byteSize := n * f32Size
 
-    devC, err := e.pool.Alloc(e.deviceID, byteSize)
-    if err != nil {
-        return nil, err
+    // Reuse dst's existing GPU memory when possible (#84).
+    devC, reused := tryReuseDstPtr[T](n, dst)
+    if !reused {
+        devC, err = e.pool.Alloc(e.deviceID, byteSize)
+        if err != nil {
+            return nil, err
+        }
     }
 
     if err := kernelFn(devA, devC, n, e.stream); err != nil {
-        e.pool.Free(e.deviceID, devC, byteSize)
+        if !reused {
+            e.pool.Free(e.deviceID, devC, byteSize)
+        }
 
         return nil, err
     }
 
+    if reused {
+        return finishReusedDst[T](dst[0], a.Shape()), nil
+    }
     return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
 }
 
```

```diff
@@ -597,17 +644,26 @@ func gpuScalarOp[T tensor.Numeric](
 
     byteSize := n * f32Size
 
-    devC, err := e.pool.Alloc(e.deviceID, byteSize)
-    if err != nil {
-        return nil, err
+    // Reuse dst's existing GPU memory when possible (#84).
+    devC, reused := tryReuseDstPtr[T](n, dst)
+    if !reused {
+        devC, err = e.pool.Alloc(e.deviceID, byteSize)
+        if err != nil {
+            return nil, err
+        }
     }
 
     if err := kernelFn(devA, scalar, devC, n, e.stream); err != nil {
-        e.pool.Free(e.deviceID, devC, byteSize)
+        if !reused {
+            e.pool.Free(e.deviceID, devC, byteSize)
+        }
 
         return nil, err
     }
 
+    if reused {
+        return finishReusedDst[T](dst[0], a.Shape()), nil
+    }
     return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
 }
 
```

```diff
@@ -957,20 +1013,29 @@ func (e *GPUEngine[T]) gpuSum(ctx context.Context, a *tensor.TensorNumeric[T], a
 
     outByteSize := numStripes * f32Size
 
-    devOut, err := e.pool.Alloc(e.deviceID, outByteSize)
-    if err != nil {
-        e.oomFallbackCount.Add(1)
-        e.logger.Warn("Sum: GPU output alloc failed, falling back to CPU", "error", err.Error())
+    // Reuse dst's existing GPU memory when possible (#84).
+    devOut, reused := tryReuseDstPtr[T](numStripes, dst)
+    if !reused {
+        devOut, err = e.pool.Alloc(e.deviceID, outByteSize)
+        if err != nil {
+            e.oomFallbackCount.Add(1)
+            e.logger.Warn("Sum: GPU output alloc failed, falling back to CPU", "error", err.Error())
 
-        return e.cpu.Sum(ctx, a, axis, keepDims, dst...)
+            return e.cpu.Sum(ctx, a, axis, keepDims, dst...)
+        }
     }
 
     if err := e.kernels.SumAxis(devIn, devOut, outer, inner, axisSize, e.stream); err != nil {
-        e.pool.Free(e.deviceID, devOut, outByteSize)
+        if !reused {
+            e.pool.Free(e.deviceID, devOut, outByteSize)
+        }
 
         return nil, err
     }
 
+    if reused {
+        return finishReusedDst[T](dst[0], newShape), nil
+    }
     return makeGPUResult[T](e, newShape, devOut, numStripes, dst...)
 }
 
```
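One subtlety worth noting: tryReuseDstPtr gates on capacity (gs.Len() >= neededElems), not exact size, so an oversized dst also qualifies. That matters for reductions, whose outputs are far smaller than their inputs; finishReusedDst then shrinks the logical shape while the larger storage stays attached. A toy check of the arithmetic, assuming numStripes is outer*inner as the SumAxis arguments suggest:

```go
package main

import "fmt"

func main() {
    // Sum over axis 1 of a [32, 128] tensor: outer=32, axisSize=128, inner=1.
    outer, axisSize, inner := 32, 128, 1
    numStripes := outer * inner        // output elements written by SumAxis
    dstCap := outer * axisSize * inner // dst previously sized for the input

    // The reuse gate in tryReuseDstPtr: capacity, not exact size.
    fmt.Printf("need %d elems, dst holds %d: reuse=%v\n",
        numStripes, dstCap, dstCap >= numStripes)
}
```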
