fix: resolve undefined make_2d_tma_desc symbol and MoE dispatch crash

da.huo · da.huo · commit 789a6cb018f9 · 2026-04-15T20:30:03.000+08:00
CMakeLists.txt:
  Move tma.cu from gemm2 into GEMM2_KERNELS_SM90 (and drop the separate
  tma.cu argument on add_library(gemm2_sm90 ...)), so make_2d_tma_desc
  is compiled exactly once and resides in the same archive
  (libgemm2_sm90.a) as its SM90 CUTLASS callers. This fixes the
  undefined symbol error caused by single-pass static-link ordering
  between libgemm2.a and libgemm2_sm90.a.

LlamaLinear.cu:
  Guard invokeMoeDispatchScales with `if (U)`, and move the `U = U_e`
  assignment inside that guard. The is_cublas_grouped path (SM100 bf16
  MoE) enters the dispatch block without quantization, leaving the
  scales tensor U empty. Calling invokeMoeDispatchScales on an empty
  tensor crashes with std::out_of_range on B200; with the guard, an
  empty U is left untouched and only A is dispatched.
diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt
@@ -16,6 +16,7 @@ set(GEMM2_KERNELS_SM80
     kernel/sm80_16816_16.cu
 )
 set(GEMM2_KERNELS_SM90
+    tma.cu
     kernel/sm90_16816_4.cu
     kernel/sm90_16816_8.cu
     kernel/sm90_16816_16.cu
@@ -49,7 +50,6 @@ add_library(gemm2
         cast.cu
         unpack.cu
         context.cu
-        tma.cu
         tuner/cache_utils.cu
         tuner/measurer.cu
         tuner/sampler.cu
@@ -85,11 +85,8 @@ set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 if(GEMM2_ARCH_90_ENABLED)
-    # SM90 kernels compile only for sm_90 (CUTLASS wgmma instructions).
-    # tma.cu is duplicated here (also in gemm2) so make_2d_tma_desc lives in the
-    # same archive as its only callers (kernel_impl_sm90.h), avoiding the undefined
-    # symbol from single-pass static-link ordering between two archives.
-    add_library(gemm2_sm90 STATIC ${GEMM2_KERNELS_SM90} tma.cu)
+    # SM90 kernels (and tma.cu, called only from kernel_impl_sm90.h) build only for 90/90a, not sm_100.
+    add_library(gemm2_sm90 STATIC ${GEMM2_KERNELS_SM90})
     set_target_properties(gemm2_sm90 PROPERTIES
         CUDA_ARCHITECTURES "${_sm90_archs}"
         POSITION_INDEPENDENT_CODE ON
diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu
@@ -89,11 +89,13 @@ struct LlamaLinear::Impl {
             Tensor    A_e       = {{m, k}, A.dtype(), kDEVICE};
             invokeMoeDispatch(A_e, A, indices.data(), e, st);
             sync_check_cuda_error();
-            Tensor U_e;
-            invokeMoeDispatchScales(U_e, U, indices.data(), e, st);
-            sync_check_cuda_error();
+            if (U) {
+                Tensor U_e;
+                invokeMoeDispatchScales(U_e, U, indices.data(), e, st);
+                sync_check_cuda_error();
+                U = U_e;
+            }
             A       = A_e;
-            U       = U_e;
             indices = {};  // indices already applied
         }