Skip to content

Commit 73aa9b6

Browse files
Changes for Camera-Ready Version of Paper (#97)
* UVU and UVW plots updated. * Benchmarked against fused cuE kernel for convolution test. * Added double backward benchmark and plotting. * Modified the double backward plots. * Minor changes to double backward x-labels. * Updated the README and citation. * Proceeding with final MACE benchmarking. * Ready to wrap up camera-ready. * Updated double backward plot. * Forced unsafe atomic add on AMD HIP to boost performance. * Updated message about camera-ready copy.
1 parent 496f81d commit 73aa9b6

14 files changed

Lines changed: 281 additions & 36 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ scratch.txt
3333
triton_autotuning
3434
paper_benchmarks
3535
paper_benchmarks_v2
36+
paper_benchmarks_v3
3637
openequivariance/extlib/*.so
3738

3839
get_node.sh

README.md

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ We currently support NVIDIA GPUs and just added beta support on AMD GPUs for
2929
all tensor products! See [the coverage table](#tensor-products-we-accelerate) for more
3030
details.
3131

32-
**Warning**: This is an early release, bug reports are welcome.
32+
📣 📣 OpenEquivariance was accepted to the 2025 SIAM Conference on Applied and
33+
Computational Discrete Algorithms (Proceedings Track)! Catch the talk in
34+
Montréal and check out the [camera-ready copy on Arxiv](https://arxiv.org/abs/2501.13986) (available May 12, 2025).
3335

3436
## Show me some examples
3537
Here's a CG tensor product implemented by e3nn:
@@ -279,14 +281,12 @@ If you have a use case for any of the unsupported features above, let us know.
279281
If you find this code useful, please cite our paper:
280282

281283
```bibtex
282-
@misc{openequivariance,
283-
title={An Efficient Sparse Kernel Generator for O(3)-Equivariant Deep Networks},
284-
author={Vivek Bharadwaj and Austin Glover and Aydin Buluc and James Demmel},
285-
year={2025},
286-
eprint={2501.13986},
287-
archivePrefix={arXiv},
288-
primaryClass={cs.LG},
289-
url={https://arxiv.org/abs/2501.13986},
284+
@inbook{openequivariance,
285+
author={Vivek Bharadwaj and Austin Glover and Aydin Buluc and James Demmel},
286+
title={An Efficient Sparse Kernel Generator for O(3)-Equivariant Deep Networks},
287+
booktitle = {SIAM Conference on Applied and Computational Discrete Algorithms (ACDA25)},
288+
chapter = {},
289+
url={https://arxiv.org/abs/2501.13986}
290290
}
291291
```
292292

openequivariance/benchmark/benchmark_routines/paper_benchmark_uvw.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,18 @@
22
import numpy as np
33

44
from openequivariance.benchmark.logging_utils import getLogger
5-
from openequivariance.implementations.E3NNTensorProduct import E3NNTensorProductCompiledCUDAGraphs
5+
from openequivariance.implementations.E3NNTensorProduct import E3NNTensorProduct, E3NNTensorProductCompiledCUDAGraphs
66
from openequivariance.implementations.CUETensorProduct import CUETensorProduct
77
from openequivariance.implementations.TensorProduct import TensorProduct
88
from openequivariance.benchmark.TestBenchmarkSuite import TestBenchmarkSuite, TestDefinition, Direction
99
from openequivariance.benchmark.tpp_creation_utils import FullyConnectedTPProblem
1010
from openequivariance.benchmark.benchmark_configs import e3nn_torch_tetris_polynomial, diffdock_configs
1111

1212
logger = getLogger()
13+
import torch
14+
from torch._functorch import config
1315

16+
@config.patch("donated_buffer", False)
1417
def run_paper_uvw_benchmark(params) -> pathlib.Path:
1518
FCTPP = FullyConnectedTPProblem
1619

@@ -27,16 +30,15 @@ def run_paper_uvw_benchmark(params) -> pathlib.Path:
2730
problems += float64_problems
2831

2932
implementations = [
30-
#E3NNTensorProductCompiledCUDAGraphs,
31-
#CUETensorProduct,
33+
E3NNTensorProductCompiledCUDAGraphs,
34+
CUETensorProduct,
3235
TensorProduct]
3336

34-
tests = [TestDefinition(implementation, problem, direction, correctness=True, benchmark=True)
37+
tests = [TestDefinition(implementation, problem, direction, correctness=False, benchmark=True)
3538
for problem, direction, implementation
3639
in itertools.product(problems, params.directions, implementations)]
3740

3841
bench_suite = TestBenchmarkSuite(
39-
correctness_threshold = 5e-5,
4042
num_warmup=100,
4143
num_iter=100,
4244
bench_batch_size=params.batch_size,

openequivariance/benchmark/plotting/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
from openequivariance.benchmark.plotting.plot_uvu import plot_uvu
33
from openequivariance.benchmark.plotting.plot_uvw import plot_uvw
44
from openequivariance.benchmark.plotting.plot_roofline import plot_roofline
5-
from openequivariance.benchmark.plotting.plot_convolution import plot_convolution
5+
from openequivariance.benchmark.plotting.plot_convolution import plot_convolution
6+
from openequivariance.benchmark.plotting.plot_double_backward import plot_double_backward

openequivariance/benchmark/plotting/plot_convolution.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ def plot_convolution(data_folder):
88
benchmarks, metadata = load_benchmarks(data_folder)
99

1010
implementations = ["CUEConvolution",
11+
"CUEConvolutionFused",
1112
"LoopUnrollConvScatterSum",
1213
"LoopUnrollConvAtomic",
1314
"LoopUnrollConvDeterministic"
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import numpy as np
2+
import matplotlib.pyplot as plt
3+
import os, json, pathlib, sys
4+
from openequivariance.benchmark.plotting import *
5+
6+
def plot_double_backward(data_folder):
7+
data_folder = pathlib.Path(data_folder)
8+
benchmarks, metadata = load_benchmarks(data_folder)
9+
10+
configs = metadata["config_labels"]
11+
implementations = ["E3NNTensorProduct", "CUETensorProduct", "LoopUnrollTP"]
12+
13+
def calculate_tp_per_sec(exp):
14+
return exp["benchmark results"]["batch_size"] / (np.mean(exp["benchmark results"]["time_millis"]) * 0.001)
15+
16+
dataf32 = {"double_backward": {}}
17+
for i, desc in enumerate(configs):
18+
for direction in ["double_backward"]:
19+
dataf32[direction][desc] = {}
20+
for impl in implementations:
21+
f32_benches = [b for b in benchmarks if b["benchmark results"]["rep_dtype"] == "<class 'numpy.float32'>"]
22+
exp = filter(f32_benches, {"config_label": desc,
23+
"direction": direction,
24+
"implementation_name": impl
25+
}, match_one=True)
26+
dataf32[direction][desc][labelmap[impl]] = calculate_tp_per_sec(exp)
27+
28+
dataf64 = {"double_backward": {}}
29+
for i, desc in enumerate(configs):
30+
for direction in ["double_backward"]:
31+
dataf64[direction][desc] = {}
32+
for impl in implementations:
33+
f64_benches = [b for b in benchmarks if 'float64' in b["benchmark results"]["rep_dtype"]]
34+
35+
exp = filter(f64_benches, {"config_label": desc,
36+
"direction": direction,
37+
"implementation_name": impl
38+
}, match_one=True)
39+
40+
if exp is None:
41+
print(desc)
42+
print(direction)
43+
print(impl)
44+
45+
dataf64[direction][desc][labelmap[impl]] = calculate_tp_per_sec(exp)
46+
47+
fig = plt.figure(figsize=(7, 3))
48+
gs = fig.add_gridspec(1, 2, hspace=0, wspace=0.1)
49+
axs = gs.subplots(sharex='col', sharey='row')
50+
51+
grouped_barchart(dataf32["double_backward"], axs[0], bar_height_fontsize=0, colormap=colormap, group_spacing=6.0)
52+
grouped_barchart(dataf64["double_backward"], axs[1], bar_height_fontsize=0, colormap=colormap, group_spacing=6.0)
53+
54+
for i in range(2):
55+
set_grid(axs[i])
56+
set_grid(axs[i])
57+
58+
axs[0].set_xlabel("float32")
59+
axs[1].set_xlabel("float64")
60+
61+
handles, labels = axs[0].get_legend_handles_labels()
62+
unique = [(h, l) for i, (h, l) in enumerate(zip(handles, labels)) if l not in labels[:i]]
63+
axs[0].legend(*zip(*unique))
64+
65+
for ax in fig.get_axes():
66+
ax.label_outer()
67+
68+
fig.supylabel("2nd Deriv. Throughput\n(# tensor products / s)", y=0.5)
69+
70+
speedup_table = []
71+
for direction in ['double_backward']:
72+
for impl in ['e3nn', 'cuE']:
73+
for dtype_label, dtype_set in [('f32', dataf32), ('f64', dataf64)]:
74+
speedups = [measurement['ours'] / measurement[impl] for _, measurement in dtype_set[direction].items() if impl in measurement]
75+
stats = np.min(speedups), np.mean(speedups), np.median(speedups), np.max(speedups)
76+
stats = [f"{stat:.2f}" for stat in stats]
77+
78+
dir_print = direction
79+
result = [dir_print, impl, dtype_label] + stats
80+
speedup_table.append(result)
81+
82+
print('\t\t'.join(['Direction', 'Base', 'dtype', 'min', 'mean', 'med', 'max']))
83+
for row in speedup_table:
84+
print('\t\t'.join(row))
85+
86+
fig.show()
87+
fig.tight_layout()
88+
fig.savefig(str(data_folder / "double_backward_throughput.pdf"), bbox_inches='tight')

openequivariance/benchmark/plotting/plotting_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,9 @@ def set_size(w, h, ax=None):
296296

297297
labelmap = {"E3NNTensorProduct": "e3nn", "CUETensorProduct": "cuE", "LoopUnrollTP": "ours",
298298
"E3NNTensorProductCompiledCUDAGraphs": "e3nn",
299-
"LoopUnrollConvScatterSum": "fast-scattersum", "CUEConvolution": "cuE-scattersum",
299+
"LoopUnrollConvScatterSum": "fast-scattersum",
300+
"CUEConvolution": "cuE-scattersum",
301+
"CUEConvolutionFused": "cuE-fused",
300302
"LoopUnrollConvDeterministic": "fast-fused-det", "LoopUnrollConvAtomic": "fast-fused-atomic"
301303
}
302304
colormap = {"e3nn": "lightblue", "cuE": "orange", "ours": "g"}
@@ -305,7 +307,8 @@ def set_size(w, h, ax=None):
305307
colormap[key] = colormap["ours"]
306308

307309
colormap["cuE-scattersum"] = colormap["cuE"]
308-
hatchmap = {"fast-fused-det": "oo", "fast-fused-atomic": "//"}
310+
colormap["cuE-fused"] = colormap["cuE"]
311+
hatchmap = {"fast-fused-det": "oo", "fast-fused-atomic": "//", "cuE-fused": "//"}
309312

310313
directions = ["forward", "backward"]
311314
dtypes = ["<class 'numpy.float32'>", "<class 'numpy.float64'>"]

openequivariance/extlib/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
def postprocess(kernel):
4343
kernel = kernel.replace("__syncwarp();", "__threadfence_block();")
4444
kernel = kernel.replace("__shfl_down_sync(FULL_MASK,", "__shfl_down(")
45+
kernel = kernel.replace("atomicAdd", "unsafeAtomicAdd")
4546
return kernel
4647
postprocess_kernel = postprocess
4748

openequivariance/implementations/CUETensorProduct.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,9 @@ def analyze_trace(self, trace_file):
171171
event_time_ms = event["dur"] / 1000
172172
total += event_time_ms
173173

174-
if "TensorProductUniform1dKernel" in event["name"]:
174+
if "TensorProductUniform1dKernel" in event["name"] \
175+
or "channelwise_kernel_fwd" in event["name"] \
176+
or "channelwise_kernel_bwd" in event["name"]:
175177
tp_time += event_time_ms
176178

177179
return tp_time
@@ -210,7 +212,7 @@ def benchmark_forward(
210212
with record_function("cue_forward"):
211213
torch_L3_out = self.forward(torch_L1_in, torch_L2_in, torch_weights)
212214

213-
prof.export_chrome_trace(trace_file)
215+
prof.export_chrome_trace(trace_file)
214216
time_millis[i] = self.analyze_trace(trace_file)
215217

216218
return time_millis

openequivariance/implementations/convolution/CUEConv.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22
import numpy.linalg as la
3+
import itertools
34

45
from openequivariance.implementations.CUETensorProduct import CUETensorProduct
56
from openequivariance.implementations.convolution.ConvolutionBase import *
@@ -25,3 +26,65 @@ def forward(self, L1_in, L2_in, weights, rows, cols):
2526
@staticmethod
2627
def name():
2728
return "CUEConvolution"
29+
30+
class CUEConvFused(ConvolutionBase):
31+
def __init__(self, config, idx_dtype=np.int64, torch_op=True):
32+
super().__init__(config, idx_dtype, torch_op)
33+
34+
global torch
35+
import torch
36+
import e3nn.o3 as o3
37+
38+
np_to_torch_dtype = {
39+
np.float32: torch.float32,
40+
np.float64: torch.float64
41+
}
42+
43+
import cuequivariance as cue
44+
import cuequivariance_torch as cuet
45+
from cuequivariance_torch.primitives.tensor_product import TensorProductUniform4x1dIndexed
46+
47+
class O3_e3nn(cue.O3):
48+
def __mul__( # pylint: disable=no-self-argument
49+
rep1: "O3_e3nn", rep2: "O3_e3nn"
50+
) -> Iterator["O3_e3nn"]:
51+
return [O3_e3nn(l=ir.l, p=ir.p) for ir in cue.O3.__mul__(rep1, rep2)]
52+
53+
@classmethod
54+
def clebsch_gordan(
55+
cls, rep1: "O3_e3nn", rep2: "O3_e3nn", rep3: "O3_e3nn"
56+
) -> np.ndarray:
57+
rep1, rep2, rep3 = cls._from(rep1), cls._from(rep2), cls._from(rep3)
58+
59+
if rep1.p * rep2.p == rep3.p:
60+
return o3.wigner_3j(rep1.l, rep2.l, rep3.l).numpy()[None] * np.sqrt(
61+
rep3.dim
62+
)
63+
return np.zeros((0, rep1.dim, rep2.dim, rep3.dim))
64+
65+
def __lt__( # pylint: disable=no-self-argument
66+
rep1: "O3_e3nn", rep2: "O3_e3nn"
67+
) -> bool:
68+
rep2 = rep1._from(rep2)
69+
return (rep1.l, rep1.p) < (rep2.l, rep2.p)
70+
71+
@classmethod
72+
def iterator(cls) -> Iterator["O3_e3nn"]:
73+
for l in itertools.count(0):
74+
yield O3_e3nn(l=l, p=1 * (-1) ** l)
75+
yield O3_e3nn(l=l, p=-1 * (-1) ** l)
76+
77+
descriptor = (cue.descriptors.channelwise_tensor_product(
78+
cue.Irreps(O3_e3nn, str(config.irreps_in1)),
79+
cue.Irreps(O3_e3nn, str(config.irreps_in2)),
80+
cue.Irreps(O3_e3nn, str(config.irreps_out))
81+
).squeeze_modes().flatten_coefficient_modes())
82+
83+
self.tp = TensorProductUniform4x1dIndexed(descriptor.polynomial.operations[0][1], 'cuda', math_dtype=np_to_torch_dtype[config.irrep_dtype])
84+
85+
def forward(self, L1_in, L2_in, weights, rows, cols):
86+
return self.tp(weights, L1_in, L2_in, None, rows, None, cols, L1_in.shape[0])
87+
88+
@staticmethod
89+
def name():
90+
return "CUEConvolutionFused"

0 commit comments

Comments
 (0)