Skip to content

Commit 14676a3

Browse files
authored
[Cherry-Pick][OP] cherry-pick #7073 support deepgemm for sm103 (#7081)
1 parent bd48640 commit 14676a3

2 files changed

Lines changed: 3 additions & 3 deletions

File tree

fastdeploy/model_executor/layers/quantization/block_wise_fp8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __init__(self, weight_block_size: list = [-1, -1], is_checkpoint_bf16: bool
6767
self.quant_round_type = 1
6868
self.use_deep_gemm = bool(envs.FD_USE_DEEP_GEMM)
6969
self.is_checkpoint_bf16 = is_checkpoint_bf16
70-
self.deepgemm_scale_ue8m0 = True if get_sm_version() == 100 else False
70+
self.deepgemm_scale_ue8m0 = True if get_sm_version() >= 100 else False
7171

7272
def name(self) -> str:
7373
return "block_wise_fp8"

fastdeploy/model_executor/layers/quantization/fp8_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def load_deep_gemm():
6060
"""
6161

6262
if current_platform.is_cuda():
63-
if get_sm_version() == 100:
63+
if get_sm_version() >= 100:
6464
# SM100 and newer should use PFCC DeepGemm
6565
paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
6666
try:
@@ -167,7 +167,7 @@ def fused_stack_transpose_quant(expert_weight_list, use_ue8m0=False):
167167
# Blackwell (SM100) and newer GPUs require pow2_scale quantization.
168168
# Guard with is_cuda() so non-CUDA environments do not call into
169169
# paddle.device.cuda.* and cause a crash.
170-
use_pow2_scale = current_platform.is_cuda() and get_sm_version() == 100
170+
use_pow2_scale = current_platform.is_cuda() and get_sm_version() >= 100
171171

172172
w, scale = paddlefleet_ops.fuse_stack_transpose_fp8_quant(
173173
expert_weight_list,

0 commit comments

Comments
 (0)