Skip to content

Commit 789a6cb

Browse files
author
da.huo
committed
fix: resolve undefined symbol and MoE dispatch crash
CMakeLists.txt: Move tma.cu out of the gemm2 target and into the GEMM2_KERNELS_SM90 source list, so that make_2d_tma_desc resides in the same archive (libgemm2_sm90.a) as its SM90 CUTLASS callers. This fixes the undefined-symbol error caused by single-pass static-link ordering between libgemm2.a and libgemm2_sm90.a.

LlamaLinear.cu: Guard invokeMoeDispatchScales (and the subsequent `U = U_e` assignment) with `if (U)`. The is_cublas_grouped path (SM100 bf16 MoE) enters the dispatch block without quantization, leaving the scales tensor U empty. Calling invokeMoeDispatchScales on an empty tensor crashes with std::out_of_range on B200.
1 parent b101a61 commit 789a6cb

2 files changed

Lines changed: 9 additions & 10 deletions

File tree

src/turbomind/kernels/gemm/CMakeLists.txt

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ set(GEMM2_KERNELS_SM80
1616
kernel/sm80_16816_16.cu
1717
)
1818
set(GEMM2_KERNELS_SM90
19+
tma.cu
1920
kernel/sm90_16816_4.cu
2021
kernel/sm90_16816_8.cu
2122
kernel/sm90_16816_16.cu
@@ -49,7 +50,6 @@ add_library(gemm2
4950
cast.cu
5051
unpack.cu
5152
context.cu
52-
tma.cu
5353
tuner/cache_utils.cu
5454
tuner/measurer.cu
5555
tuner/sampler.cu
@@ -85,11 +85,8 @@ set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON)
8585
set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
8686

8787
if(GEMM2_ARCH_90_ENABLED)
88-
# SM90 kernels compile only for sm_90 (CUTLASS wgmma instructions).
89-
# tma.cu is duplicated here (also in gemm2) so make_2d_tma_desc lives in the
90-
# same archive as its only callers (kernel_impl_sm90.h), avoiding the undefined
91-
# symbol from single-pass static-link ordering between two archives.
92-
add_library(gemm2_sm90 STATIC ${GEMM2_KERNELS_SM90} tma.cu)
88+
# SM90 kernels only compile for 90/90a; avoid building them for sm_100.
89+
add_library(gemm2_sm90 STATIC ${GEMM2_KERNELS_SM90})
9390
set_target_properties(gemm2_sm90 PROPERTIES
9491
CUDA_ARCHITECTURES "${_sm90_archs}"
9592
POSITION_INDEPENDENT_CODE ON

src/turbomind/models/llama/LlamaLinear.cu

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,13 @@ struct LlamaLinear::Impl {
8989
Tensor A_e = {{m, k}, A.dtype(), kDEVICE};
9090
invokeMoeDispatch(A_e, A, indices.data(), e, st);
9191
sync_check_cuda_error();
92-
Tensor U_e;
93-
invokeMoeDispatchScales(U_e, U, indices.data(), e, st);
94-
sync_check_cuda_error();
92+
if (U) {
93+
Tensor U_e;
94+
invokeMoeDispatchScales(U_e, U, indices.data(), e, st);
95+
sync_check_cuda_error();
96+
U = U_e;
97+
}
9598
A = A_e;
96-
U = U_e;
9799
indices = {}; // indices already applied
98100
}
99101

0 commit comments

Comments
 (0)