
Commit 371e2d3

vbharadwaj-bk and Austin Glover authored
AMP Autocast Registration (#150)
* python autocast opt out
* add irrep_dtype
* loop tp changes
* LoopUnrollChanges
* test commit
* remove test comment
* put autocast registration behind guard
* Preparing Austin's commit for autocast merge.
* Linted.

---------

Co-authored-by: Austin Glover <austin_glover@berkeley.com>
1 parent f76b21e commit 371e2d3

3 files changed: 71 additions & 5 deletions
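What the registration does: once an op is registered with torch.library.register_autocast(op, "cuda", torch.float32), any floating-point tensor arguments it receives inside a torch.autocast("cuda", ...) region are cast back to float32 before the kernel runs, so kernels compiled for float32 irreps never see half-precision inputs. Below is a minimal, self-contained sketch of the same mechanism on a toy op; demo::affine is hypothetical and not part of this commit, and it assumes a CUDA device plus a PyTorch version that provides torch.library.register_autocast.

import torch

# Toy float32-only custom op standing in for the JIT tensor-product kernels.
@torch.library.custom_op("demo::affine", mutates_args=())
def affine(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    assert x.dtype == torch.float32  # the "kernel" only supports float32
    return x * w

# Pin the op to float32 under autocast, mirroring what this commit does for
# jit_tp_forward / jit_tp_backward / jit_tp_double_backward.
torch.library.register_autocast("demo::affine", "cuda", torch.float32)

x = torch.randn(4, device="cuda")
w = torch.randn(4, device="cuda")
with torch.autocast("cuda", dtype=torch.float16):
    h = torch.nn.functional.linear(x, torch.randn(4, 4, device="cuda"))  # runs in float16
    y = affine(h, w)  # h is recast to float32 before the op body executes
print(y.dtype)  # torch.float32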


openequivariance/extension/libtorch_tp_jit.cpp

Lines changed: 4 additions & 2 deletions
@@ -342,7 +342,7 @@ class TorchJITConv : public torch::CustomClassHolder {
     Map_t fwd_dict, bwd_dict, dbl_bwd_dict, kernel_dims;
     JITConvImpl<JITKernel> internal;
     KernelProp kernelProp;
-    int64_t L3_dim;
+    int64_t L3_dim, irrep_dtype;

     TorchJITConv(string kernel_plaintext, Map_t fwd_dict_i, Map_t bwd_dict_i, Map_t dbl_bwd_dict_i, Map_t kernel_dims_i) :
         fwd_dict(fwd_dict_i.copy()),
@@ -356,7 +356,8 @@ class TorchJITConv : public torch::CustomClassHolder {
             to_map(kernel_dims_i)
         ),
         kernelProp(kernel_dims, true),
-        L3_dim(kernelProp.L3_dim)
+        L3_dim(kernelProp.L3_dim),
+        irrep_dtype(kernel_dims_i.at("irrep_dtype"))
     { }

     tuple<tuple<string, string>,
@@ -676,6 +677,7 @@ TORCH_LIBRARY_FRAGMENT(libtorch_tp_jit, m) {
             return 0;
         })
         .def_readonly("L3_dim", &TorchJITConv::L3_dim)
+        .def_readonly("irrep_dtype", &TorchJITConv::irrep_dtype)
         .def("__eq__", [](const c10::IValue & self, const c10::IValue& other) -> bool {
             return self.is(other);
         })
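Exposing irrep_dtype alongside L3_dim through def_readonly lets the Python side query the kernel's irrep dtype directly from the bound custom class (it is read as jit.irrep_dtype in the LoopUnrollConv.py changes below), rather than inferring the output dtype from whichever tensor happens to arrive at the op.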

openequivariance/implementations/LoopUnrollTP.py

Lines changed: 16 additions & 0 deletions
@@ -232,6 +232,21 @@ def double_backward(ctx, E, F, G):
             setup_context=setup_context_double_backward,
         )

+    @classmethod
+    def register_autocast(cls):
+        global torch
+        import torch
+
+        torch.library.register_autocast(
+            "libtorch_tp_jit::jit_tp_forward", "cuda", torch.float32
+        )
+        torch.library.register_autocast(
+            "libtorch_tp_jit::jit_tp_backward", "cuda", torch.float32
+        )
+        torch.library.register_autocast(
+            "libtorch_tp_jit::jit_tp_double_backward", "cuda", torch.float32
+        )
+
     @staticmethod
     def name():
         return "LoopUnrollTP"
@@ -290,3 +305,4 @@ def calculate_flops_backward(self, batch_size: int) -> dict:
 if extlib.TORCH_COMPILE:
     LoopUnrollTP.register_torch_fakes()
     LoopUnrollTP.register_autograd()
+    LoopUnrollTP.register_autocast()
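For context, this is the shape of the mixed-precision loop the registration targets. The sketch below is generic AMP usage, not OpenEquivariance-specific: torch.nn.Linear stands in for a model that would contain the tensor-product layers, and a CUDA device is assumed.

import torch

model = torch.nn.Linear(64, 64, device="cuda")   # stand-in for a model containing TP layers
opt = torch.optim.SGD(model.parameters(), lr=1e-2)
scaler = torch.amp.GradScaler("cuda")

for _ in range(3):
    x = torch.randn(32, 64, device="cuda")
    with torch.autocast("cuda", dtype=torch.float16):
        # Ordinary layers run in float16 here; ops registered for autocast at
        # float32 (like the jit_tp_* ops above) would receive float32 inputs.
        loss = model(x).square().mean()
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()
    opt.zero_grad()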

openequivariance/implementations/convolution/LoopUnrollConv.py

Lines changed: 51 additions & 3 deletions
@@ -6,7 +6,10 @@
     SMEMCapacityException,
 )

-from openequivariance.implementations.dtype_enum import dtype_to_enum
+from openequivariance.implementations.dtype_enum import (
+    dtype_to_enum,
+    enum_to_torch_dtype,
+)
 from openequivariance.templates.jinja_utils import get_jinja_environment
 from openequivariance import extlib
 from openequivariance.extlib import JITConvImpl, postprocess_kernel, DeviceProp
@@ -297,20 +300,49 @@ def double_backward_rawptrs(*args, **kwargs):
         def fake_forward(
             jit, L1_in, L2_in, W, rows, cols, workspace_buffer, sender_perm
         ):
-            L3_dim = None
+            L3_dim, irrep_dtype = None, None
             if hasattr(jit, "wrapped_obj"):
                 L3_dim = jit.wrapped_obj.kernel_dims["L3_dim"]
+                irrep_dtype = jit.wrapped_obj.kernel_dims["irrep_dtype"]
             else:
                 L3_dim = jit.L3_dim
+                irrep_dtype = jit.irrep_dtype

-            return L1_in.new_empty(L1_in.shape[0], L3_dim)
+            return torch.empty(
+                L1_in.shape[0],
+                L3_dim,
+                device="cuda",
+                dtype=enum_to_torch_dtype[irrep_dtype],
+            )

         @torch.library.register_fake("libtorch_tp_jit::jit_conv_backward")
         def fake_backward(
             jit, L1_in, L2_in, W, L3_grad, rows, cols, workspace_buffer, sender_perm
         ):
             return torch.empty_like(L1_in), torch.empty_like(L2_in), torch.empty_like(W)

+        @torch.library.register_fake("libtorch_tp_jit::jit_conv_double_backward")
+        def fake_double_backward(
+            jit,
+            L1_in,
+            L2_in,
+            W,
+            L3_grad,
+            L1_dgrad,
+            L2_dgrad,
+            w_dgrad,
+            rows,
+            cols,
+            workspace_buffer,
+            transpose_perm=None,
+        ):
+            return [
+                L1_in.new_empty(*L1_in.shape),
+                L2_in.new_empty(*L2_in.shape),
+                W.new_empty(*W.shape),
+                L3_grad.new_empty(*L3_grad.shape),
+            ]
+
     @classmethod
     def register_autograd(cls):
         backward_op = torch.ops.libtorch_tp_jit.jit_conv_backward
@@ -393,7 +425,23 @@ def double_backward(ctx, E, F, G):
             setup_context=setup_context_double_backward,
         )

+    @classmethod
+    def register_autocast(cls):
+        global torch
+        import torch
+
+        torch.library.register_autocast(
+            "libtorch_tp_jit::jit_conv_forward", "cuda", torch.float32
+        )
+        torch.library.register_autocast(
+            "libtorch_tp_jit::jit_conv_backward", "cuda", torch.float32
+        )
+        torch.library.register_autocast(
+            "libtorch_tp_jit::jit_conv_double_backward", "cuda", torch.float32
+        )
+

 if extlib.TORCH_COMPILE:
     LoopUnrollConv.register_torch_fakes()
     LoopUnrollConv.register_autograd()
+    LoopUnrollConv.register_autocast()
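One detail of the fake_forward change worth noting: Tensor.new_empty inherits the dtype of its source tensor, so the old L1_in.new_empty(...) call would report whatever dtype L1_in happens to carry, while the real kernel's output dtype is fixed by irrep_dtype. A small illustration (plain PyTorch, runs on CPU):

import torch

L1_in = torch.randn(8, 16, dtype=torch.float16)       # e.g. a half-precision input under AMP
print(L1_in.new_empty(8, 4).dtype)                     # torch.float16 -- inherited from L1_in
print(torch.empty(8, 4, dtype=torch.float32).dtype)    # torch.float32 -- fixed explicitly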
