#include <stdexcept>

#include "nanobind/nanobind.h"
#include "xla/ffi/api/ffi.h"
1313
14+ namespace nb = nanobind;
15+ namespace ffi = xla::ffi;
16+
1417#define CUDA_BACKEND // Stick to CUDA for now
1518
1619#ifdef CUDA_BACKEND
2023 using GPU_Allocator = CUDA_Allocator;
2124
2225 template <typename T>
23- using GroupMM = GroupMMCUDA<T>;
26+ using GroupMM = GroupMMCUDA<T>;
2427#endif
2528
2629#include " tensorproducts.hpp"
2730
28- namespace nb = nanobind;
29- namespace ffi = xla::ffi;
30-
3131xla::ffi::DataType enum_to_xla_dtype (int64_t i){
3232 switch (i) {
3333 case 1 :
@@ -45,22 +45,48 @@ xla::ffi::DataType enum_to_xla_dtype(int64_t i){
4545}
4646
4747inline void * data_ptr (ffi::AnyBuffer &buffer) {
48- if (buffer.element_type () == xla::ffi::DataType::F32)
49- return reinterpret_cast <void *>(buffer.typed_data <float >());
50- else if (buffer.element_type () == xla::ffi::DataType::F64)
51- return reinterpret_cast <void *>(buffer.typed_data <double >());
52- else if (buffer.element_type () == xla::ffi::DataType::S64)
53- return reinterpret_cast <void *>(buffer.typed_data <int64_t >());
54- else if (buffer.element_type () == xla::ffi::DataType::U8)
55- return reinterpret_cast <void *>(buffer.typed_data <uint8_t >());
56- else
57- throw logic_error (" Unsupported tensor datatype!" );
48+ switch (buffer.element_type ()) {
49+ case xla::ffi::DataType::F32:
50+ return reinterpret_cast <void *>(buffer.typed_data <float >());
51+ case xla::ffi::DataType::F64:
52+ return reinterpret_cast <void *>(buffer.typed_data <double >());
53+ case xla::ffi::DataType::S64:
54+ return reinterpret_cast <void *>(buffer.typed_data <int64_t >());
55+ case xla::ffi::DataType::U8:
56+ return reinterpret_cast <void *>(buffer.typed_data <uint8_t >());
57+ default :
58+ throw logic_error (" Unsupported tensor datatype!" );
59+ }
60+ }
61+
62+ inline int byte_count (ffi::AnyBuffer &buffer) {
63+ switch (buffer.element_type ()) {
64+ case xla::ffi::DataType::F32:
65+ return 4 ;
66+ case xla::ffi::DataType::F64:
67+ return 8 ;
68+ case xla::ffi::DataType::S64:
69+ return 8 ;
70+ case xla::ffi::DataType::U8:
71+ return 1 ;
72+ default :
73+ throw logic_error (" Unsupported tensor datatype!" );
74+ }
5875}
5976
6077inline void * data_ptr (ffi::Result<ffi::AnyBuffer> &buffer) {
6178 return data_ptr (*buffer);
6279}
6380
#ifdef CUDA_BACKEND
// Fills the whole device buffer with zero bytes.
//
// The original call discarded cudaMemset's cudaError_t: a failed memset
// (e.g. invalid pointer or device error) would silently leave the buffer
// full of garbage — particularly bad for the shared-weights gradient
// accumulation path that depends on starting from zero. Surface the
// error instead.
void zero_buffer(ffi::AnyBuffer &buffer) {
  const cudaError_t status = cudaMemset(
      data_ptr(buffer),
      0,
      buffer.element_count() * byte_count(buffer));
  if (status != cudaSuccess)
    throw std::logic_error(cudaGetErrorString(status));
}
#endif
89+
6490struct KernelProp {
6591 int64_t L1_dim, L2_dim, L3_dim, weight_numel;
6692 bool shared_weights;
@@ -244,7 +270,7 @@ ffi::Error tp_backward_impl(
244270 }
245271
246272 if (k.shared_weights ) {
247- // Need to zero out W_grad
273+ zero_buffer (* W_grad);
248274 }
249275
250276 jit_kernel->backward (
0 commit comments