
Commit 6be2be5

Completed AMD HIP Support for UVW Tensor Products (#94)
* Many configurations are working, but we are failing some simple tests.
* Updated benchmark.py script to remove keyword parameters.
1 parent: 6f288d6

17 files changed, with 83 additions and 64 deletions.

README.md

Lines changed: 4 additions & 4 deletions
```diff
@@ -23,8 +23,8 @@ which has a closed-source kernel package. We also offer fused
 equivariant graph convolutions that can reduce
 computation and memory consumption significantly.
 
-We currently support NVIDIA GPUs and have just added beta support on AMD GPUs for
-UVU tensor products! See [the coverage table](#tensor-products-we-accelerate) for more
+We currently support NVIDIA GPUs and just added beta support on AMD GPUs for
+all tensor products! See [the coverage table](#tensor-products-we-accelerate) for more
 details.
 
 **Warning**: This is an early release, bug reports are welcome.
@@ -242,9 +242,9 @@ python tests/mace_driver.py carbon.xyz -o outputs/mace_tests -i e3nn cue oeq
 | Operation                | CUDA      | HIP  |
 |--------------------------|-----------|------|
 | UVU Batch                | ✅        | ✅   |
-| UVW Batch                | ✅        | 🚧🔨 |
+| UVW Batch                | ✅        | ✅   |
 | UVU Convolution          | ✅        | ✅   |
-| UVW Convolution          | ✅        | 🚧🔨 |
+| UVW Convolution          | ✅        | ✅   |
 | Symmetric Tensor Product | ✅ (beta) | 🚧🔨 |
 
 e3nn supports a variety of connection modes for CG tensor products. We support
```

openequivariance/extension/convolution.hpp

Lines changed: 16 additions & 5 deletions
```diff
@@ -96,18 +96,28 @@ class __attribute__ ((visibility ("default"))) JITConvImpl : public ConvolutionI
 public:
     JIT_IMPL jit;
     KernelLaunchConfig forward_config;
-    KernelLaunchConfig backward_config;
+    KernelLaunchConfig backward_config;
+    bool is_uvw;
 
     JITConvImpl(
         std::string jit_kernel,
         KernelLaunchConfig forward_config_i,
-        KernelLaunchConfig backward_config_i) :
+        KernelLaunchConfig backward_config_i,
+        bool is_uvw_i) :
             jit(jit_kernel),
             forward_config(forward_config_i),
-            backward_config(backward_config_i) {
+            backward_config(backward_config_i),
+            is_uvw(is_uvw_i) {
 
         vector<string> kernels = {"forward", "backward", "fixup_forward", "fixup_backward"};
-        jit.compile(kernels, {{}, {}, {}, {}});
+
+        int opt_level = 3;
+        #ifdef HIP_BACKEND
+        if(is_uvw) {
+            opt_level = 1;
+        }
+        #endif
+        jit.compile(kernels, {{}, {}, {}, {}}, opt_level);
 
         if(forward_config.smem > 0) {
             jit.set_max_smem(0, forward_config.smem);
@@ -134,7 +144,8 @@ class __attribute__ ((visibility ("default"))) JITConvImpl : public ConvolutionI
                 bwd_dict["num_blocks"],
                 bwd_dict["num_threads"],
                 bwd_dict["smem"]
-            )) { }
+            ),
+            kernel_dims["is_uvw"] == 1) { }
 
     void exec_conv(
         void* L1_in,
```
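Both JIT wrapper classes (`JITConvImpl` here and `JITTPImpl` below) now inline the same optimization-level policy in their constructors. Factored out as a minimal sketch, assuming `HIP_BACKEND` is defined by the build when targeting AMD:

```cpp
// Sketch of the policy this commit threads through the JIT wrappers:
// UVW kernels compiled through hipRTC get -O1, while every other kernel,
// and everything on the CUDA path, keeps the default -O3.
inline int jit_opt_level(bool is_uvw) {
#ifdef HIP_BACKEND
    if(is_uvw) {
        return 1;
    }
#endif
    return 3;
}
```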

openequivariance/extension/tensorproducts.hpp

Lines changed: 15 additions & 4 deletions
```diff
@@ -54,18 +54,28 @@ class __attribute__ ((visibility ("default"))) JITTPImpl : public GenericTensorP
 public:
     JIT_IMPL jit;
     KernelLaunchConfig forward_config, backward_config, double_backward_config;
+    bool is_uvw;
 
     JITTPImpl(
         std::string jit_kernel,
         KernelLaunchConfig forward_config_i,
         KernelLaunchConfig backward_config_i,
-        KernelLaunchConfig double_backward_config_i) :
+        KernelLaunchConfig double_backward_config_i,
+        bool is_uvw_i) :
             jit(jit_kernel),
             forward_config(forward_config_i),
             backward_config(backward_config_i),
-            double_backward_config(double_backward_config_i) {
+            double_backward_config(double_backward_config_i),
+            is_uvw(is_uvw_i) {
         vector<string> kernels = {"forward", "backward", "double_backward_A", "double_backward_B"};
-        jit.compile(kernels, {{}, {}, {}, {}});
+
+        int opt_level = 3;
+        #ifdef HIP_BACKEND
+        if(is_uvw) {
+            opt_level = 1;
+        }
+        #endif
+        jit.compile(kernels, {{}, {}, {}, {}}, opt_level);
 
         if(forward_config.smem > 0) {
             jit.set_max_smem(0, forward_config.smem);
@@ -103,7 +113,8 @@ class __attribute__ ((visibility ("default"))) JITTPImpl : public GenericTensorP
                 dbl_bwd_dict["num_blocks"],
                 dbl_bwd_dict["num_threads"],
                 dbl_bwd_dict["smem"]
-            )
+            ),
+            kernel_dims["is_uvw"] == 1
         ) { }
 
     void exec_tensor_product(
```
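The second hunk above is the tail of a map-based delegating constructor: the Torch layer now forwards plain dictionaries and the wrapper unpacks the launch parameters itself. A self-contained sketch of that unpacking, with key names taken from the diff (the three-field `KernelLaunchConfig` below is an assumed stand-in for the project's real type):

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>

// Assumed minimal stand-in for the project's KernelLaunchConfig.
struct KernelLaunchConfig {
    int64_t num_blocks;
    int64_t num_threads;
    int64_t smem;
};

// Unpack one launch-parameter dictionary; at() throws if a key is missing.
KernelLaunchConfig config_from(const std::unordered_map<std::string, int64_t> &d) {
    return { d.at("num_blocks"), d.at("num_threads"), d.at("smem") };
}
```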

openequivariance/extension/torch_tp_jit.cpp

Lines changed: 16 additions & 24 deletions
```diff
@@ -37,6 +37,14 @@ namespace py=pybind11;
 
 using Map_t=torch::Dict<string, int64_t>;
 
+std::unordered_map<string, int64_t> to_map(const Map_t &map) {
+    std::unordered_map<string, int64_t> result;
+    for(auto it = map.begin(); it != map.end(); ++it) {
+        result[it->key()] = it->value();
+    }
+    return result;
+}
+
 inline void* data_ptr(const torch::Tensor &tensor) {
     if(tensor.dtype() == torch::kFloat)
         return reinterpret_cast<void*>(tensor.data_ptr<float>());
@@ -62,21 +70,11 @@ class __attribute__ ((visibility ("default"))) TorchJITProduct : public torch::C
             dbl_bwd_dict(dbl_bwd_dict_i.copy()),
             kernel_dims(kernel_dims_i.copy()),
             internal(kernel_plaintext,
-                KernelLaunchConfig(
-                    fwd_dict.at("num_blocks"),
-                    fwd_dict.at("num_threads"),
-                    fwd_dict.at("smem")
+                to_map(fwd_dict_i),
+                to_map(bwd_dict_i),
+                to_map(dbl_bwd_dict_i),
+                to_map(kernel_dims_i)
             ),
-                KernelLaunchConfig(
-                    bwd_dict.at("num_blocks"),
-                    bwd_dict.at("num_threads"),
-                    bwd_dict.at("smem")
-                ),
-                KernelLaunchConfig(
-                    dbl_bwd_dict.at("num_blocks"),
-                    dbl_bwd_dict.at("num_threads"),
-                    dbl_bwd_dict.at("smem")
-                )),
             L3_dim(kernel_dims.at("L3_dim")),
             shared_weights(kernel_dims.at("shared_weights")) { }
 
@@ -225,17 +223,11 @@ class TorchJITConv : public torch::CustomClassHolder {
         fwd_dict(fwd_dict_i.copy()),
         bwd_dict(bwd_dict_i.copy()),
         kernel_dims(kernel_dims_i.copy()),
-        internal(kernel_plaintext,
-            KernelLaunchConfig(
-                fwd_dict.at("num_blocks"),
-                fwd_dict.at("num_threads"),
-                fwd_dict.at("smem")
+        internal(kernel_plaintext,
+            to_map(fwd_dict_i),
+            to_map(bwd_dict_i),
+            to_map(kernel_dims_i)
             ),
-            KernelLaunchConfig(
-                bwd_dict.at("num_blocks"),
-                bwd_dict.at("num_threads"),
-                bwd_dict.at("smem")
-            )),
         L3_dim(kernel_dims.at("L3_dim")) { }
 
     tuple<tuple<string, string>, tuple<string, Map_t>, tuple<string, Map_t>, tuple<string, Map_t>> __obj_flatten__() {
```
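A usage sketch for the new `to_map` helper. `torch::Dict` is the TorchScript-compatible dictionary type, and flattening it to a plain STL map keeps the backend classes free of Torch types. The helper body is repeated so the snippet compiles on its own; the include path and the dictionary values are assumptions for illustration:

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <torch/custom_class.h>  // assumed include for torch::Dict

// Same body as the helper added in this commit.
std::unordered_map<std::string, int64_t> to_map(
        const torch::Dict<std::string, int64_t> &map) {
    std::unordered_map<std::string, int64_t> result;
    for(auto it = map.begin(); it != map.end(); ++it) {
        result[it->key()] = it->value();
    }
    return result;
}

int main() {
    torch::Dict<std::string, int64_t> fwd_dict;
    fwd_dict.insert("num_blocks", 160);    // hypothetical launch parameters
    fwd_dict.insert("num_threads", 256);
    fwd_dict.insert("smem", 48 * 1024);
    auto plain = to_map(fwd_dict);         // plain.at("num_threads") == 256
    return 0;
}
```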

openequivariance/extension/util/backend_cuda.hpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -178,13 +178,13 @@ class __attribute__((visibility("default"))) CUJITKernel {
             NULL)); // includeNames
     }
 
-    void compile(string kernel_name, const vector<int> template_params) {
+    void compile(string kernel_name, const vector<int> template_params, int opt_level=3) {
         vector<string> kernel_names = {kernel_name};
         vector<vector<int>> template_param_list = {template_params};
         compile(kernel_names, template_param_list);
     }
 
-    void compile(vector<string> kernel_names_i, vector<vector<int>> template_param_list) {
+    void compile(vector<string> kernel_names_i, vector<vector<int>> template_param_list, int opt_level=3) {
         if(compiled) {
             throw std::logic_error("JIT object has already been compiled!");
         }
```

Note: on the CUDA path, `opt_level` is accepted only to keep the two backends' `compile` signatures identical; the single-kernel overload still delegates without forwarding it, and NVRTC compilation is unchanged.

openequivariance/extension/util/backend_hip.hpp

Lines changed: 5 additions & 3 deletions
```diff
@@ -173,13 +173,13 @@ class __attribute__((visibility("default"))) HIPJITKernel {
             NULL)); // includeNames
     }
 
-    void compile(string kernel_name, const vector<int> template_params) {
+    void compile(string kernel_name, const vector<int> template_params, int opt_level=3) {
         vector<string> kernel_names = {kernel_name};
         vector<vector<int>> template_param_list = {template_params};
-        compile(kernel_names, template_param_list);
+        compile(kernel_names, template_param_list, opt_level);
     }
 
-    void compile(vector<string> kernel_names_i, vector<vector<int>> template_param_list) {
+    void compile(vector<string> kernel_names_i, vector<vector<int>> template_param_list, int opt_level=3) {
         if(compiled) {
             throw std::logic_error("JIT object has already been compiled!");
         }
@@ -214,9 +214,11 @@ class __attribute__((visibility("default"))) HIPJITKernel {
         int device = 0;
         HIP_ERRCHK(hipGetDeviceProperties(&props, device));
         std::string sarg = std::string("--gpu-architecture=") + props.gcnArchName;
+        std::string opt_arg = "-O" + std::to_string(opt_level);
 
         std::vector<const char*> opts = {
             "--std=c++17",
+            opt_arg.c_str(),
             sarg.c_str()
         };
```
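One subtlety in the hunk above: only raw `c_str()` pointers are stored in `opts`, so `opt_arg` and `sarg` must outlive the options vector — they do, since all three share a scope that extends through the compile call. A minimal sketch of the whole option-assembly step, with `prog` and `gcn_arch` standing in for members of `HIPJITKernel` and error handling elided:

```cpp
#include <string>
#include <vector>
#include <hip/hiprtc.h>

// Assemble hipRTC options, including the new -O flag, and compile.
hiprtcResult compile_with_opt_level(hiprtcProgram prog,
                                    const std::string &gcn_arch,
                                    int opt_level) {
    std::string sarg = "--gpu-architecture=" + gcn_arch;
    std::string opt_arg = "-O" + std::to_string(opt_level);  // e.g. "-O1"

    // Both strings outlive opts, so storing c_str() pointers is safe here.
    std::vector<const char*> opts = {
        "--std=c++17",
        opt_arg.c_str(),
        sarg.c_str()
    };
    return hiprtcCompileProgram(prog, static_cast<int>(opts.size()), opts.data());
}
```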

openequivariance/implementations/ComputationSchedule.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -5,8 +5,6 @@
 from openequivariance.implementations.TensorProductBase import *
 logger = getLogger()
 
-# This class assumes a warp size of 32
-
 class IrrepMapping:
     '''
     Maps irreps from a source to a destination set.
@@ -265,9 +263,13 @@ def __init__(self,
         # Stream weights on the fly before pre-loading
         self.stream_weights = stream_weights
 
-        # Step 1: Break the irreps and the instructions into chunks of at most 32 x 32 x 32.
+        # Step 1: Break the irreps and the instructions into chunks
+
+        chunk_size = warp_size
+        if include_scratch: # There is at least one UVW computation if this flag is set. Cap the chunk size to 32.
+            chunk_size = 32
 
-        self.problem_splitter = ProblemSplitter(config, warp_size)
+        self.problem_splitter = ProblemSplitter(config, chunk_size)
         self.updated_config = self.problem_splitter.output
         self.L1, self.L2, self.L3 = self.updated_config.irreps_in1, self.updated_config.irreps_in2, self.updated_config.irreps_out
         self.new_instructions = self.updated_config.instructions
```
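The dropped "assumes a warp size of 32" comment and the new `chunk_size` logic go together: chunks normally follow the hardware warp/wavefront width (32 on NVIDIA, typically 64 on AMD), but when `include_scratch` is set — i.e., at least one UVW instruction is present — the chunk size is capped at 32. A one-line C++ rendering of that policy, as a sketch:

```cpp
// Chunk irreps at the warp/wavefront width, except for schedules that need
// scratch space (UVW instructions), which are capped at 32.
int select_chunk_size(int warp_size, bool include_scratch) {
    return include_scratch ? 32 : warp_size;
}
```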

openequivariance/implementations/LoopUnrollTP.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -67,7 +67,7 @@ def generate_double_backward_schedule(warps_per_block):
                 generate_schedule(warp_count)
                 break
             except Exception as e:
-                warp_count //= 2
+                warp_count -= 2
                 if warp_count == 0:
                     raise RuntimeError("Tensor product schedule generation failed, shared memory inadequate!")
 
@@ -76,8 +76,8 @@ def generate_double_backward_schedule(warps_per_block):
             backward_schedule=self.backward_schedule,
             double_backward_schedule=self.double_backward_schedule))
 
-        with open("scratch.txt", "w") as f:
-            f.write(self.jit_kernel)
+        #with open("scratch.txt", "w") as f:
+        #    f.write(self.jit_kernel)
 
         internal_cls = None
         if self.torch_op and extlib.TORCH_COMPILE:
@@ -94,7 +94,8 @@ def generate_double_backward_schedule(warps_per_block):
                 vars(self.backward_schedule.launch_config),
                 vars(self.double_backward_schedule.launch_config),
                 {"L3_dim": self.L3.dim,
-                 "shared_weights": int(self.config.shared_weights)})
+                 "shared_weights": int(self.config.shared_weights),
+                 "is_uvw": int(self.is_uvw)})
             logger.info("Kernel compiled!")
 
             logger.info(f"Kernel File Size: {len(self.jit_kernel) // 1024} KB")
```

openequivariance/implementations/convolution/LoopUnrollConv.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -115,7 +115,8 @@ def generate_backward_schedule(warps_per_block):
         self.internal = internal_cls(self.jit_kernel,
             vars(self.forward_schedule.launch_config),
             vars(self.backward_schedule.launch_config),
-            {"L3_dim": self.L3.dim})
+            {"L3_dim": self.L3.dim,
+             "is_uvw": int(self.is_uvw)})
         logger.info("Kernel compiled!")
 
         self.reorder_weights_e3nn_to_oeq = lambda input, output, has_batch_dim: \
```

openequivariance/templates/loop_unroll_batch.cuh

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ using IRREP_T = {{ forward_schedule.irrep_dtype_cstr }};
 using WEIGHT_T = {{ forward_schedule.weight_dtype_cstr }};
 
 {%- for i, segment in enumerate(forward_schedule.segments) %}
-{{ generate_segment_kernel_forward(i, segment) }}
+{{ generate_segment_kernel_forward(i, segment, forward_schedule.launch_config.warp_size) }}
 {%- endfor %}
 
 __global__ void forward(
```
