We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
2 parents be0971a + 353fd1f, commit 8b0f487 (Copy full SHA for 8b0f487)
2 files changed
experimental/kernels/kernels.h
@@ -677,7 +677,7 @@ fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
677
for (var i : u32 = 0u; i < V; i++) {
678
let p : {{precision}} = probs[probs_bt + i];
679
let indicator : {{precision}} = select(0.0, 1.0, i == ix);
680
- atomicAdd(&dlogits[dlogits_bt + i], (p - indicator) * dloss);
+ dlogits[dlogits_bt + i] += (p - indicator) * dloss;
681
}
682
683
experimental/kernels/unittest_llmc/unittest_kernels.h
@@ -20,7 +20,7 @@ extern "C" {
20
#define USE_GPU_FOR_RESIDUAL_BACKWARD 1
21
#define USE_GPU_FOR_SOFTMAX_FORWARD 1
22
#define USE_GPU_FOR_CROSSENTROPY_FORWARD 1
23
-// #define USE_GPU_FOR_CROSSENTROPY_SOFTMAX_BACKWARD 1
+#define USE_GPU_FOR_CROSSENTROPY_SOFTMAX_BACKWARD 1
24
25
26
#ifdef USE_GPU_FOR_ENCODER_FORWARD
0 commit comments