fix(cuda): remove float4 alignment requirement from gemv_q8_kernel

dndungu · dndungu · commit 34aba3b9c3b3 · 2026-03-30T22:17:14.000-07:00
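The gemv_q8_kernel cast the activation pointer x (float*) to float4*
for 16-byte vectorized loads into shared memory. When x is not 16-byte
aligned (common on ARM64/Grace Hopper with pool allocations), the
float4 load faults inside the kernel with a "misaligned address" error.
Because kernel faults are reported asynchronously, the error surfaces
at the next synchronizing call, which here was a cudaMemcpy.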

Replace float4 global loads with per-element __ldg loads. Shared
memory float4 accesses are unaffected (shared memory is always
16-byte aligned). Performance impact: minimal -- the global-to-shared
load is a one-time cost per block, not in the inner loop.

Fixes: Gemma3 inference "misaligned address" on DGX Spark GB10.
Root cause confirmed via compute-sanitizer --tool memcheck.
diff --git a/internal/cuda/kernels/gemm_q8.cu b/internal/cuda/kernels/gemm_q8.cu
@@ -30,16 +30,14 @@ __global__ void gemv_q8_kernel(
 {
     extern __shared__ float sx[];
 
-    /* Cooperatively load x[0..K-1] into shared memory using float4 loads. */
+    /* Cooperatively load x[0..K-1] into shared memory.
+     * Use per-element loads instead of float4 to avoid misaligned access
+     * when the activation pointer x is not 16-byte aligned (common on
+     * ARM64/Grace Hopper when x comes from pool allocations with
+     * non-aligned offsets). Shared memory loads later in the kernel are
+     * always aligned since shared memory base is 16-byte aligned. */
     int threads_per_block = blockDim.x;
-    int k4 = K / 4;
-    const float4* x4 = (const float4*)x;
-    float4* sx4 = (float4*)sx;
-    for (int i = threadIdx.x; i < k4; i += threads_per_block) {
-        sx4[i] = __ldg(&x4[i]);
-    }
-    /* Handle remainder if K is not a multiple of 4. */
-    for (int i = k4 * 4 + threadIdx.x; i < K; i += threads_per_block) {
+    for (int i = threadIdx.x; i < K; i += threads_per_block) {
         sx[i] = __ldg(&x[i]);
     }
     __syncthreads();