Skip to content

Commit f38b174

Browse files
Fix noaux_tc cuda Error 700 in CUDAGraph and Add wfp8apf8 moe quant method (#4115)
* Improve per_token_quant_fp8 performance
* Support MoE wfp8apf8
* Check GLM test
* Fix noaux_tc op in CUDAGraph; support noaux_tc return the correct result
* Check
* Check inf and overwrite score in noaux_tc

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
1 parent 6b47773 commit f38b174

17 files changed

Lines changed: 921 additions & 122 deletions

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,7 @@ std::vector<paddle::Tensor> NoauxTc(
564564
int n_group,
565565
int topk_group,
566566
int topk,
567+
bool renormalize,
567568
float routed_scaling_factor);
568569

569570
#ifdef ENABLE_FP8

custom_ops/gpu_ops/helper.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,34 @@ inline int GetGPUComputeCapability(int id) {
151151

152152
#endif
153153

154+
#ifndef FP8_E4M3_MAX
// Largest finite value representable in FP8 E4M3 (= 448). Float literal —
// a bare 448.0 is a double and silently promotes surrounding float math
// in device code.
#define FP8_E4M3_MAX 448.0f
#endif

#ifndef DISPATCH_FLOAT_FP6_DTYPE
// Dispatch `__VA_ARGS__` over the supported floating-point Paddle dtypes,
// binding the matching C++ type to the identifier named by `c_type`:
//   FLOAT32  -> float
//   BFLOAT16 -> phi::dtype::bfloat16
//   FLOAT16  -> phi::dtype::float16
// Any other dtype raises via PD_THROW.
#define DISPATCH_FLOAT_FP6_DTYPE(pd_dtype, c_type, ...)                    \
  switch (pd_dtype) {                                                      \
    case phi::DataType::FLOAT32: {                                         \
      using c_type = float;                                                \
      __VA_ARGS__                                                          \
      break;                                                               \
    }                                                                      \
    case phi::DataType::BFLOAT16: {                                        \
      using c_type = phi::dtype::bfloat16;                                 \
      __VA_ARGS__                                                          \
      break;                                                               \
    }                                                                      \
    case phi::DataType::FLOAT16: {                                         \
      using c_type = phi::dtype::float16;                                  \
      __VA_ARGS__                                                          \
      break;                                                               \
    }                                                                      \
    default: {                                                             \
      PD_THROW("Only supported attr of input type in [fp32, fp16, bf16]."); \
    }                                                                      \
  }
#endif
181+
154182
inline constexpr uint32_t next_pow_2(uint32_t const num) {
155183
if (num <= 1)
156184
return num;
@@ -563,3 +591,28 @@ inline int GetSMVersion() {
563591
return sm_version;
564592

565593
}
594+
595+
// Warp-wide max via a butterfly (XOR) shuffle ladder: after log2(32) = 5
// steps every lane holds the maximum of all 32 lane values. Assumes the
// full warp participates (0xffffffff mask).
__device__ __forceinline__ float warpReduceMax(float value) {
#pragma unroll
  for (int offset = 16; offset > 0; offset >>= 1) {
    value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, offset));
  }
  return value;
}
603+
604+
// Block-wide max reduction built from per-warp shuffle reductions.
// Must be reached by all threads of the block (contains __syncthreads()).
// The returned value is guaranteed to be the block maximum only on warp 0
// (in particular on threadIdx.x == 0); other warps may return a partial
// maximum.
// Fixes vs. previous version:
//  * ceil-div warp count so a trailing partial warp's maximum is not
//    dropped when blockDim.x is not a multiple of WARP_SIZE;
//  * filler lanes read warpLevelMaxs[0] instead of 0, so the result is
//    also correct when every input is negative (0 is not the identity
//    element of fmaxf).
// NOTE(review): the shared buffer is not re-synchronized after the final
// warp reduce, so two back-to-back calls in one kernel would race on
// warpLevelMaxs — confirm callers invoke this at most once per barrier.
__device__ __forceinline__ float blockReduceMax(float value) {
  static __shared__ float warpLevelMaxs[WARP_SIZE];
  const int laneId = threadIdx.x % WARP_SIZE;
  const int warpId = threadIdx.x / WARP_SIZE;
  // Ceil-div: count the trailing partial warp as a full contributor.
  const int numWarps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

  value = warpReduceMax(value);

  if (laneId == 0) warpLevelMaxs[warpId] = value;
  __syncthreads();

  // Lanes without a warp result reuse warpLevelMaxs[0] — a value <= the
  // true maximum — so it cannot distort the reduction.
  value = (laneId < numWarps) ? warpLevelMaxs[laneId] : warpLevelMaxs[0];
  if (warpId == 0) value = warpReduceMax(value);

  return value;
}

custom_ops/gpu_ops/noaux_tc.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
2626
int n_group,
2727
int topk_group,
2828
int topk,
29+
bool renormalize,
2930
float routed_scaling_factor) {
3031
auto input_shape = scores_with_bias.shape();
3132
PD_CHECK(input_shape.size() == 2);
@@ -48,6 +49,7 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
4849
n_group,
4950
topk_group,
5051
topk,
52+
renormalize,
5153
routed_scaling_factor,
5254
stream);
5355

@@ -76,6 +78,7 @@ PD_BUILD_STATIC_OP(noaux_tc)
7678
.Attrs({"n_group: int",
7779
"topk_group: int",
7880
"topk:int",
81+
"renormalize: bool",
7982
"routed_scaling_factor: float"})
8083
.SetKernelFn(PD_KERNEL(NoauxTc))
8184
.SetInferShapeFn(PD_INFER_SHAPE(NoauxTcInferShape))

0 commit comments

Comments
 (0)