@@ -1,5 +1,4 @@
 import numpy as np
-import tempfile
 import json
 import os
 import itertools
@@ -13,7 +12,6 @@
     FullyConnectedTPProblem,
     SingleInstruction,
 )
-from openequivariance.extlib import GPUTimer
 from openequivariance.implementations.utils import count_cg_non_zero
 
 os.environ["CUEQUIVARIANCE_OPS_USE_JIT"] = "1"
@@ -123,6 +121,12 @@ def iterator(cls) -> Iterator["O3_e3nn"]:
         self.tp_correctness.to("cuda")
         self.forward_correctness = lambda x, y, W: self.tp_correctness(W, x, y)
 
+        self.kernel_names = [
+            "TensorProductUniform1dKernel",
+            "channelwise_kernel_fwd",
+            "channelwise_kernel_bwd",
+        ]
+
     def forward_cpu(
         self,
         L1_in: np.ndarray,
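The `kernel_names` list added above is presumably what the base class matches against profiler events when `with_torch_overhead=False`, so that only device time spent in the cuEquivariance kernels is counted (the removed docstring below confirms the profiler-based device-time approach). A minimal sketch of that filtering, assuming substring matching over CUDA events; the helper name `device_time_ms` and the exact event attributes are assumptions, not part of this diff:

```python
import numpy as np
from torch.profiler import profile, ProfilerActivity

def device_time_ms(fn, kernel_names, num_iter):
    """Per-iteration device time (ms) summed over events matching kernel_names."""
    times = np.zeros(num_iter, dtype=np.float32)
    for i in range(num_iter):
        with profile(activities=[ProfilerActivity.CUDA]) as prof:
            fn()
        # key_averages() reports totals in microseconds; keep only events
        # whose name contains one of the target kernel-name substrings.
        matched_us = sum(
            evt.device_time_total
            for evt in prof.key_averages()
            if any(name in evt.key for name in kernel_names)
        )
        times[i] = matched_us / 1.0e3
    return times
```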
@@ -197,42 +201,18 @@ def benchmark_forward(
         L2_in: np.ndarray,
         L3_buffer: np.ndarray,
         weights: np.ndarray,
+        with_torch_overhead: bool = True,
     ) -> np.ndarray:
-        """
-        When we don't want to include torch overhead, we use the Pytorch profiler
-        to extract the device time that the kernel takes.
-        """
-        if self.torch_op:
-            return super().benchmark_forward(
-                num_warmup, num_iter, L1_in, L2_in, L3_buffer, weights
-            )
-        else:
-            from torch.profiler import profile, record_function, ProfilerActivity
-
-            time_millis = np.zeros(num_iter, dtype=np.float32)
-            torch_L1_in = torch.tensor(L1_in).to(device="cuda").detach()
-            torch_L2_in = torch.tensor(L2_in).to(device="cuda").detach()
-            torch_weights = torch.tensor(weights).to(device="cuda").detach()
-
-            timer = GPUTimer()
-
-            for i in range(num_warmup):
-                self.forward(torch_L1_in, torch_L2_in, torch_weights)
-
-            trace_file = tempfile.NamedTemporaryFile().name
-
-            for i in range(num_iter):
-                timer.clear_L2_cache()
-                with profile(
-                    activities=[ProfilerActivity.CUDA], record_shapes=True
-                ) as prof:
-                    with record_function("cue_forward"):
-                        self.forward(torch_L1_in, torch_L2_in, torch_weights)
-
-                prof.export_chrome_trace(trace_file)
-                time_millis[i] = self.analyze_trace(trace_file)
-
-            return time_millis
+        return super().benchmark_forward(
+            num_warmup,
+            num_iter,
+            L1_in,
+            L2_in,
+            L3_buffer,
+            weights,
+            with_torch_overhead,
+            kernel_names=self.kernel_names,
+        )
 
     def benchmark_backward(
         self,
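With the profiler plumbing lifted into the base class, a call site toggles overhead measurement with a single flag. A hypothetical comparison of end-to-end versus kernel-only time; `impl` and the input arrays `L1, L2, L3, W` stand for an instance of this wrapper and a problem setup constructed elsewhere, and are illustrative, not from this diff:

```python
import numpy as np

# Illustrative only: `impl` is an instance of this wrapper class, and
# L1, L2, L3, W are inputs shaped by the tensor-product problem.
num_warmup, num_iter = 10, 30

end_to_end = impl.benchmark_forward(
    num_warmup, num_iter, L1, L2, L3, W, with_torch_overhead=True
)
kernel_only = impl.benchmark_forward(
    num_warmup, num_iter, L1, L2, L3, W, with_torch_overhead=False
)
print(f"framework overhead ~ {np.mean(end_to_end) - np.mean(kernel_only):.3f} ms")
```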
@@ -242,60 +222,18 @@ def benchmark_backward(
         L2_in: np.ndarray,
         L3_buffer: np.ndarray,
         weights: np.ndarray,
-        L1_grad: np.ndarray,
-        L2_grad: np.ndarray,
-        weights_grad: np.ndarray,
+        with_torch_overhead: bool = True,
     ) -> np.ndarray:
-        if self.torch_op:
-            return super().benchmark_backward(
-                num_warmup,
-                num_iter,
-                L1_in,
-                L2_in,
-                L3_buffer,
-                weights,
-                L1_grad,
-                L2_grad,
-                weights_grad,
-            )
-        else:
-            from torch.profiler import profile, record_function, ProfilerActivity
-
-            time_millis = np.zeros(num_iter, dtype=np.float32)
-
-            torch_L1_in = torch.tensor(L1_in, requires_grad=True, device="cuda")
-            torch_L2_in = torch.tensor(L2_in, requires_grad=True, device="cuda")
-            torch_weights = torch.tensor(weights, requires_grad=True, device="cuda")
-            torch_out = self.forward(torch_L1_in, torch_L2_in, torch_weights)
-            torch_L3_grad_in = torch.tensor(L3_buffer, device="cuda")
-
-            timer = GPUTimer()
-
-            for i in range(num_warmup):
-                torch_out.backward(
-                    gradient=torch_L3_grad_in,
-                    retain_graph=True,
-                    inputs=[torch_L1_in, torch_L2_in, torch_weights],
-                )
-
-            trace_file = tempfile.NamedTemporaryFile().name
-
-            for i in range(num_iter):
-                timer.clear_L2_cache()
-                with profile(
-                    activities=[ProfilerActivity.CUDA], record_shapes=True
-                ) as prof:
-                    with record_function("cue_backward"):
-                        torch_out.backward(
-                            gradient=torch_L3_grad_in,
-                            retain_graph=True,
-                            inputs=[torch_L1_in, torch_L2_in, torch_weights],
-                        )
-
-                prof.export_chrome_trace(trace_file)
-                time_millis[i] = self.analyze_trace(trace_file)
-
-            return time_millis
+        return super().benchmark_backward(
+            num_warmup,
+            num_iter,
+            L1_in,
+            L2_in,
+            L3_buffer,
+            weights,
+            with_torch_overhead,
+            kernel_names=self.kernel_names,
+        )
 
     # Copied over from loop unroller to match arithmetic intensity on roofline plots
     def calculate_flops_forward(self, batch_size: int) -> dict:
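The comment above ties the flop counts to roofline plots, where attainable throughput is the minimum of peak compute and arithmetic intensity times peak bandwidth. A self-contained sketch of that bound; all device numbers are placeholders, and the byte count would come from a matching data-movement estimate not shown in this diff:

```python
def attainable_flops_per_s(flops, bytes_moved, peak_flops, peak_bw):
    """Roofline model: min(peak compute, arithmetic intensity * bandwidth)."""
    intensity = flops / bytes_moved  # flop per byte
    return min(peak_flops, intensity * peak_bw)

# Placeholder numbers, not measurements:
fwd_flops = 4.0e9   # e.g. a total from calculate_flops_forward(batch_size)
fwd_bytes = 1.0e9   # corresponding data-movement estimate
print(attainable_flops_per_s(fwd_flops, fwd_bytes,
                             peak_flops=6.0e13, peak_bw=2.0e12))
```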