fix(compute): use dequant+cuBLAS for Q4_K when K%256!=0

dndungu · dndungu · commit 5f21cbbbb91d · 2026-04-01T11:44:09.000-07:00
Q4_K GEMV requires K to be a multiple of 256 (super-block size). For
models where hidden_size is not 256-aligned (e.g., Gemma3-1B with
hidden_size=1152, 1152%256=128), all Q4_K matmuls fell back to CPU.

Remove the unconditional CPU fallback for k%256 != 0. Instead, take the
GEMV fast path only when k%256 == 0, and fall through to the
dequant+cuBLAS path (DequantQ4KF32 + SgemmNT) when K is unaligned. The
dequant kernel handles ceil(K/256) super-blocks, and cuBLAS handles
arbitrary dimensions.
diff --git a/compute/gpu_engine.go b/compute/gpu_engine.go
@@ -1439,11 +1439,6 @@ func (e *GPUEngine[T]) matMulQ4K(ctx context.Context, qs *tensor.Q4KStorage, a,
 	k := aShape[1]
 	n := bShape[1]
 
-	// K must be a multiple of 256 for Q4_K super-blocks.
-	if k%256 != 0 {
-		return e.cpu.MatMul(ctx, a, b, dst...)
-	}
-
 	e.setDevice()
 
 	// Get Q4_K device pointer (pre-uploaded or upload now).
@@ -1467,8 +1462,8 @@ func (e *GPUEngine[T]) matMulQ4K(ctx context.Context, qs *tensor.Q4KStorage, a,
 	}
 	defer freeW()
 
-	// Fused GEMV path: y = dequant(W_q4k) * x, when n==1.
-	if n == 1 {
+	// Fused GEMV path: y = dequant(W_q4k) * x, when n==1 and K is 256-aligned.
+	if n == 1 && k%256 == 0 {
 		devX, cleanupX, err := getDevicePtr(e, b)
 		if err != nil {
 			return e.cpu.MatMul(ctx, a, b, dst...)
@@ -1554,11 +1549,6 @@ func (e *GPUEngine[T]) matMulQ4KBWeight(ctx context.Context, a *tensor.TensorNum
 	}
 	n := bShape[1] // columns of B (after virtual transpose)
 
-	// K must be a multiple of 256 for Q4_K super-blocks.
-	if k%256 != 0 {
-		return e.cpu.MatMul(ctx, a, b, dst...)
-	}
-
 	// Build output shape: [batch..., m_last, n].
 	outShape := make([]int, len(aShape))
 	copy(outShape, aShape[:len(aShape)-1])
@@ -1589,7 +1579,9 @@ func (e *GPUEngine[T]) matMulQ4KBWeight(ctx context.Context, a *tensor.TensorNum
 	defer freeQ4K()
 
 	// Fused GEMV path: y[n] = sum_k dequant(B_q4k[n, k]) * x[k], when m==1.
-	if m == 1 {
+	// Requires K % 256 == 0 for Q4_K super-block alignment.
+	// When K is not aligned, falls through to the general dequant+cuBLAS path.
+	if m == 1 && k%256 == 0 {
 		devX, cleanupX, err := getDevicePtr(e, a)
 		if err != nil {
 			return e.cpu.MatMul(ctx, a, b, dst...)