Skip to content

Commit e21cd43

Browse files
unamedkr authored and claude committed
Fix #9: Vulkan clang build break + missing CUDA init in CLI
- tq_vulkan_init.c: include turboquant/tq_types.h and forward-declare tq_vulkan_override_traits() so clang accepts the file (GCC tolerated the implicit declaration; clang with -Werror=implicit-function-declaration did not).
- tq_cuda_dispatch.cu: add tq_cuda_override_traits(), symmetric to the Vulkan path, installing CUDA quantize/attention fns into TQ_TRAITS.
- tools/quant.c: add the missing #ifdef TQ_BUILD_CUDA init block so -DTQ_BUILD_CUDA=ON actually dispatches KV ops to GPU instead of silently running CPU-only.
- Both backends now print an explicit "GPU acceleration covers KV cache only; weight matmul runs on CPU" notice to match project non-goals and avoid the "GPU idle" surprise reported in #9.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e161d2e commit e21cd43

3 files changed

Lines changed: 43 additions & 0 deletions

File tree

src/backend/cuda/tq_cuda_dispatch.cu

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,4 +358,29 @@ extern "C" void* tq_cuda_get_attention_fn(int type_id) {
358358
return (void*)g_cuda_dispatch[type_id].attention;
359359
}
360360

361+
/* Override TQ_TRAITS with CUDA-accelerated KV quantize/attention functions.
 * Mirror of tq_vulkan_override_traits() — only KV cache ops are dispatched
 * to GPU; weight matmul still runs on CPU. */
#include "turboquant/tq_types.h"
extern "C" void tq_cuda_override_traits(void) {
    /* Only the first 7 type IDs have CUDA kernels; clamp to the trait
     * table size so we never index past TQ_TRAITS. */
    const int limit = (TQ_TYPE_COUNT < 7) ? TQ_TYPE_COUNT : 7;
    for (int type = 0; type < limit; type++) {
        void* quant_sym = tq_cuda_get_quantize_fn(type);
        void* attn_sym  = tq_cuda_get_attention_fn(type);
        if (quant_sym) {
            /* memcpy rather than a direct cast: portable object-pointer to
             * function-pointer conversion without aliasing warnings. */
            void (*quant_fn)(const float*, void*, int);
            memcpy(&quant_fn, &quant_sym, sizeof(quant_fn));
            TQ_TRAITS[type].quantize = quant_fn;
            fprintf(stderr, " CUDA: GPU-accelerated quantize for %s\n", TQ_TRAITS[type].name);
        }
        if (attn_sym) {
            void (*attn_fn)(const float*, const void*, float*, int, int);
            memcpy(&attn_fn, &attn_sym, sizeof(attn_fn));
            TQ_TRAITS[type].attention = attn_fn;
            fprintf(stderr, " CUDA: GPU-accelerated attention for %s\n", TQ_TRAITS[type].name);
        }
    }
    /* State the project non-goal up front so nobody expects matmul on GPU. */
    fprintf(stderr, "quant.cpp CUDA: GPU acceleration covers KV cache "
                    "quantize/attention only; weight matmul runs on CPU.\n");
}
385+
361386
#endif /* TQ_BUILD_CUDA */

src/backend/vulkan/tq_vulkan_init.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,14 @@
1010
#ifdef TQ_BUILD_VULKAN
1111

1212
#include "tq_vulkan.h"
13+
#include "turboquant/tq_types.h"
1314
#include <stdio.h>
1415
#include <stdlib.h>
1516
#include <string.h>
1617

18+
/* Forward decl: defined below, called from tq_init_vulkan_backend(). */
19+
void tq_vulkan_override_traits(void);
20+
1721
/* ============================================================
1822
* Global state
1923
* ============================================================ */
@@ -465,6 +469,8 @@ int tq_init_vulkan_backend(void) {
465469

466470
fprintf(stderr, "quant.cpp Vulkan: Initialized on %s (subgroup size %u)\n",
467471
g_vk_state.device_name, g_vk_state.subgroup_size);
472+
fprintf(stderr, "quant.cpp Vulkan: GPU acceleration covers KV cache "
473+
"quantize/attention only; weight matmul runs on CPU.\n");
468474

469475
g_vk_state.initialized = 1;
470476

tools/quant.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,18 @@ int main(int argc, char** argv) {
324324
}
325325
}
326326
#endif
327+
#ifdef TQ_BUILD_CUDA
328+
{
329+
extern int tq_init_cuda_backend(void);
330+
extern void tq_cuda_override_traits(void);
331+
if (tq_init_cuda_backend() == 0) {
332+
tq_cuda_override_traits();
333+
fprintf(stderr, "CUDA backend: ready (KV cache quantization on GPU)\n");
334+
} else {
335+
fprintf(stderr, "CUDA backend: init failed, falling back to CPU\n");
336+
}
337+
}
338+
#endif
327339

328340
if (info_only) {
329341
tq_free_model(model);

0 commit comments

Comments (0)