Skip to content

Commit e21cd43

Browse files
unamedkr authored and claude committed
Fix #9: Vulkan clang build break + missing CUDA init in CLI
- tq_vulkan_init.c: include turboquant/tq_types.h and forward-declare tq_vulkan_override_traits() so clang accepts the file (GCC tolerated the implicit declaration; clang with -Werror=implicit-function-declaration did not).
- tq_cuda_dispatch.cu: add tq_cuda_override_traits(), symmetric to the Vulkan path, installing CUDA quantize/attention fns into TQ_TRAITS.
- tools/quant.c: add the missing #ifdef TQ_BUILD_CUDA init block so -DTQ_BUILD_CUDA=ON actually dispatches KV ops to GPU instead of silently running CPU-only.
- Both backends now print an explicit "GPU acceleration covers KV cache only; weight matmul runs on CPU" notice to match project non-goals and avoid the "GPU idle" surprise reported in #9.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e161d2e commit e21cd43

3 files changed

Lines changed: 43 additions & 0 deletions

File tree

src/backend/cuda/tq_cuda_dispatch.cu

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,4 +358,29 @@ extern "C" void* tq_cuda_get_attention_fn(int type_id) {
358358
return (void*)g_cuda_dispatch[type_id].attention;
359359
}
360360

361+
/* Override TQ_TRAITS with CUDA-accelerated KV quantize/attention functions.
 * Mirror of tq_vulkan_override_traits() — only KV cache ops are dispatched
 * to GPU; weight matmul still runs on CPU. */
#include "turboquant/tq_types.h"
extern "C" void tq_cuda_override_traits(void) {
    /* Only the first 7 type IDs have CUDA kernels; clamp to the trait
     * table size so we never index past TQ_TRAITS. */
    const int limit = (TQ_TYPE_COUNT < 7) ? TQ_TYPE_COUNT : 7;
    for (int type = 0; type < limit; type++) {
        void* quant_sym = tq_cuda_get_quantize_fn(type);
        void* attn_sym  = tq_cuda_get_attention_fn(type);
        if (quant_sym) {
            /* memcpy rather than a direct cast: portable object-pointer to
             * function-pointer conversion without aliasing warnings. */
            void (*quant_fn)(const float*, void*, int);
            memcpy(&quant_fn, &quant_sym, sizeof(quant_fn));
            TQ_TRAITS[type].quantize = quant_fn;
            fprintf(stderr, " CUDA: GPU-accelerated quantize for %s\n", TQ_TRAITS[type].name);
        }
        if (attn_sym) {
            void (*attn_fn)(const float*, const void*, float*, int, int);
            memcpy(&attn_fn, &attn_sym, sizeof(attn_fn));
            TQ_TRAITS[type].attention = attn_fn;
            fprintf(stderr, " CUDA: GPU-accelerated attention for %s\n", TQ_TRAITS[type].name);
        }
    }
    /* State the project non-goal up front so nobody expects matmul on GPU. */
    fprintf(stderr, "quant.cpp CUDA: GPU acceleration covers KV cache "
                    "quantize/attention only; weight matmul runs on CPU.\n");
}
385+
361386
#endif /* TQ_BUILD_CUDA */

src/backend/vulkan/tq_vulkan_init.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,14 @@
1010
#ifdef TQ_BUILD_VULKAN
1111

1212
#include "tq_vulkan.h"
13+
#include "turboquant/tq_types.h"
1314
#include <stdio.h>
1415
#include <stdlib.h>
1516
#include <string.h>
1617

18+
/* Forward decl: defined below, called from tq_init_vulkan_backend(). */
19+
void tq_vulkan_override_traits(void);
20+
1721
/* ============================================================
1822
* Global state
1923
* ============================================================ */
@@ -465,6 +469,8 @@ int tq_init_vulkan_backend(void) {
465469

466470
fprintf(stderr, "quant.cpp Vulkan: Initialized on %s (subgroup size %u)\n",
467471
g_vk_state.device_name, g_vk_state.subgroup_size);
472+
fprintf(stderr, "quant.cpp Vulkan: GPU acceleration covers KV cache "
473+
"quantize/attention only; weight matmul runs on CPU.\n");
468474

469475
g_vk_state.initialized = 1;
470476

tools/quant.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,18 @@ int main(int argc, char** argv) {
324324
}
325325
}
326326
#endif
327+
#ifdef TQ_BUILD_CUDA
328+
{
329+
extern int tq_init_cuda_backend(void);
330+
extern void tq_cuda_override_traits(void);
331+
if (tq_init_cuda_backend() == 0) {
332+
tq_cuda_override_traits();
333+
fprintf(stderr, "CUDA backend: ready (KV cache quantization on GPU)\n");
334+
} else {
335+
fprintf(stderr, "CUDA backend: init failed, falling back to CPU\n");
336+
}
337+
}
338+
#endif
327339

328340
if (info_only) {
329341
tq_free_model(model);

0 commit comments

Comments (0)