Fix gemm CMakeLists (move tma.cu into GEMM2_KERNELS_SM90) and LlamaLinear (call invokeMoeDispatchScales only when U is set)

da.huo · da.huo · commit a3f28c32e3e4 · 2026-04-15T20:09:54.000+08:00
diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt
@@ -16,6 +16,7 @@ set(GEMM2_KERNELS_SM80
     kernel/sm80_16816_16.cu
 )
 set(GEMM2_KERNELS_SM90
+    tma.cu
     kernel/sm90_16816_4.cu
     kernel/sm90_16816_8.cu
     kernel/sm90_16816_16.cu
@@ -49,7 +50,6 @@ add_library(gemm2
         cast.cu
         unpack.cu
         context.cu
-        tma.cu
         tuner/cache_utils.cu
         tuner/measurer.cu
         tuner/sampler.cu
diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu
@@ -89,11 +89,13 @@ struct LlamaLinear::Impl {
             Tensor    A_e       = {{m, k}, A.dtype(), kDEVICE};
             invokeMoeDispatch(A_e, A, indices.data(), e, st);
             sync_check_cuda_error();
-            Tensor U_e;
-            invokeMoeDispatchScales(U_e, U, indices.data(), e, st);
-            sync_check_cuda_error();
+            if (U) {
+                Tensor U_e;
+                invokeMoeDispatchScales(U_e, U, indices.data(), e, st);
+                sync_check_cuda_error();
+                U = U_e;
+            }
             A       = A_e;
-            U       = U_e;
             indices = {};  // indices already applied
         }