#ifndef KERNELS_H
#define KERNELS_H

#include "gpu.h"

namespace gpu {


static const char *kShaderGelu = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x;
  if (i < arrayLength(&inp)) {
    let x: f32 = inp[i];
    // select is more stable for larger values of x
    out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
             * (x + .044715 * x * x * x))), x, x > 10.0);
  }
}
)";
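
/* Reference: the kernel above implements the tanh approximation of GELU
 * (Hendrycks & Gimpel, 2016):
 *
 *   GELU(x) ~= 0.5 * x * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) ))
 *
 * For x > 10 the tanh term saturates to 1 and GELU(x) ~= x, which is why the
 * select() passes x through directly in that range.
 */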

static const char *kTanh = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x;
  if (i < arrayLength(&inp)) {
    let x: f32 = inp[i];
    out[i] = tanh(x);
  }
}
)";

static const char *kShaderHadamard = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let idx = GlobalInvocationID.x;
  if (idx < arrayLength(&A)) {
    C[idx] = A[idx] * B[idx];
  }
}
)";

static const char *kShaderResidual = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let idx = GlobalInvocationID.x;
  if (idx < arrayLength(&A)) {
    C[idx] = A[idx] + B[idx];
  }
}
)";

/* LayerNorm
 * v1:
 * - No caching of mean/std for the backward pass
 * - No parallel reduction
 * - One thread per row, for rows 1..N
 */
// TODO(avh): Allow larger virtual 1D workgroups by making use of y / z
// dimensions and calculating the threadID accordingly.
static const char *kShaderLayerNorm1 = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> weight: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> bias: array<{{precision}}>;
@group(0) @binding(3) var<storage, read_write> out: array<{{precision}}>;
@group(0) @binding(4) var<uniform> params: Params;

struct Params {
  N: u32,
  C: u32,
};

@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>,
        @builtin(local_invocation_id) LocalInvocationID: vec3<u32>,
        @builtin(workgroup_id) WorkgroupID: vec3<u32>) {
  let idx: u32 = GlobalInvocationID.x;

  if (idx >= params.N) { return; }

  let C: u32 = params.C;

  // Calculate mean
  var sum: f32 = 0.0;
  for (var i: u32 = 0; i < C; i = i + 1) {
    sum += inp[idx * C + i];
  }
  let mean_val: f32 = sum / f32(C);

  // Calculate rstd (reciprocal standard deviation)
  sum = 0.0;
  for (var i: u32 = 0; i < C; i = i + 1) {
    let diff: f32 = inp[idx * C + i] - mean_val;
    sum += diff * diff;
  }
  let rstd_val: f32 = 1.0 / sqrt(sum / f32(C) + 1e-5);

  // Normalize, then apply the learned scale and shift
  for (var i: u32 = 0; i < C; i = i + 1) {
    let n: f32 = rstd_val * (inp[idx * C + i] - mean_val);
    out[idx * C + i] = n * weight[i] + bias[i];
  }
}
)";
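
/* Reference: for each row x of length C, the kernel above computes standard
 * LayerNorm with a learned elementwise scale and shift:
 *
 *   mean  = (1/C) * sum_i x_i
 *   rstd  = 1 / sqrt((1/C) * sum_i (x_i - mean)^2 + 1e-5)
 *   out_i = rstd * (x_i - mean) * weight_i + bias_i
 */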

// matrix multiplication (naive implementation)
static const char *kShaderMatMul1 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x / {{N}};
  let j: u32 = GlobalInvocationID.x % {{N}};
  if (i < {{M}} && j < {{N}}) {
    var sum: f32 = 0.0;
    for (var k: u32 = 0; k < {{K}}; k = k + 1) {
      sum = sum + A[i * {{K}} + k] * B[k * {{N}} + j];
    }
    C[i * {{N}} + j] = sum;
  }
}
)";
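
/* Reference: both matmul kernels compute C = A * B for row-major matrices,
 *
 *   C[i][j] = sum_k A[i][k] * B[k][j],   A: M x K,  B: K x N,  C: M x N,
 *
 * with {{M}}, {{K}}, {{N}} substituted at shader-generation time by
 * MatmulShader below. The naive version assigns one thread per output
 * element; the tiled version that follows stages blocks of A and B in
 * workgroup memory to reuse each global memory load.
 */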

/* Tiled matrix multiplication: each workgroup cooperatively loads square
 * tiles of A and B into workgroup (shared) memory so that every value read
 * from global memory is reused workgroupSizeX times. The 16x16 tile size is
 * an assumed fixed choice; it could also be made a template parameter like
 * {{workgroupSize}}.
 */
static const char *kShaderMatMul2 = R"(
const workgroupSizeX: u32 = 16u;
const workgroupSizeY: u32 = 16u;
@group(0) @binding(0) var<storage, read_write> A: array<f32>;
@group(0) @binding(1) var<storage, read_write> B: array<f32>;
@group(0) @binding(2) var<storage, read_write> C: array<f32>;
var<workgroup> tileA: array<f32, workgroupSizeY * workgroupSizeX>;
var<workgroup> tileB: array<f32, workgroupSizeY * workgroupSizeX>;
@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1)
fn main(
    @builtin(global_invocation_id) global_id : vec3<u32>,
    @builtin(local_invocation_id) local_id : vec3<u32>
) {
  // Each thread computes one element of C. Rows map to the y dimension so
  // that threads within a workgroup access consecutive columns.
  let row = global_id.y;
  let col = global_id.x;
  var result: f32 = 0.0;
  for (var i = 0u; i < {{K}}; i = i + workgroupSizeX) {
    // Load tiles into shared memory. The workgroup arrays are 1D, so 2D tile
    // coordinates are flattened manually. Out-of-bounds lanes load 0.0 rather
    // than returning early, keeping the barriers in uniform control flow.
    tileA[local_id.y * workgroupSizeX + local_id.x] =
        select(0.0, A[row * {{K}} + i + local_id.x],
               row < {{M}} && i + local_id.x < {{K}});
    tileB[local_id.y * workgroupSizeX + local_id.x] =
        select(0.0, B[(i + local_id.y) * {{N}} + col],
               i + local_id.y < {{K}} && col < {{N}});
    // Synchronize to make sure the tile is loaded
    workgroupBarrier();
    // Perform partial dot product for the current tile
    for (var k = 0u; k < workgroupSizeX; k = k + 1u) {
      result = result + tileA[local_id.y * workgroupSizeX + k]
                      * tileB[k * workgroupSizeX + local_id.x];
    }
    // Synchronize before loading the next tile
    workgroupBarrier();
  }
  if (row < {{M}} && col < {{N}}) {
    C[row * {{N}} + col] = result;
  }
}
)";

/* Generates a KernelCode instance for all matmul kernels - pass in
 * the template code via `shaderRaw`.
 *
 * This is intended to be run ahead of time, so it is not performance
 * critical.
 */
KernelCode MatmulShader(size_t workgroupSize, const char *shaderRaw,
                        NumType precision, size_t M, size_t K, size_t N) {
  KernelCode shader = {shaderRaw, workgroupSize, precision};
  replaceAll(shader.data, "{{M}}", std::to_string(M));
  replaceAll(shader.data, "{{K}}", std::to_string(K));
  replaceAll(shader.data, "{{N}}", std::to_string(N));
  return shader;
}
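
/* Example (hypothetical usage; assumes the KernelCode constructor fills in
 * {{workgroupSize}} and {{precision}} from its arguments, and that kf32 is
 * the f32 NumType defined in gpu.h):
 *
 *   KernelCode code = MatmulShader(256, kShaderMatMul1, kf32, 1024, 1024, 1024);
 *   // code.data now contains WGSL with all template placeholders resolved.
 */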

/* Softmax
 * v1:
 * - equivalent to naive softmax, with one thread per row
 */
static const char *kShaderSoftmax1 = R"(
@group(0) @binding(0) var<storage, read_write> inp : array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out : array<{{precision}}>;
@group(0) @binding(2) var<uniform> params : Params;
struct Params {
  N: u32,
  C: u32,
};
const NEG_INFINITY: f32 = -3.0e38; // WGSL has trouble representing -3.4028235e+38
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
  let N : u32 = params.N;
  let C : u32 = params.C;
  let i : u32 = global_id.x;
  if (i < N) {
    let inp_row_start : u32 = i * C;
    var maxval : f32 = NEG_INFINITY;
    // Find the maximum value in the row
    for (var j : u32 = 0u; j < C; j++) {
      let val : f32 = inp[inp_row_start + j];
      if (val > maxval) {
        maxval = val;
      }
    }
    var sum : f32 = 0.0;
    // Compute the exponentials and sum them
    for (var j : u32 = 0u; j < C; j++) {
      let exp_val : f32 = exp(inp[inp_row_start + j] - maxval);
      out[inp_row_start + j] = exp_val;
      sum += exp_val;
    }
    // Normalize the row to get probabilities, multiplying by the reciprocal
    // rather than dividing inside the loop
    let norm : f32 = 1.0f / sum;
    for (var j : u32 = 0u; j < C; j++) {
      out[inp_row_start + j] *= norm;
    }
  }
}
)";
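
/* Reference: the kernel above computes the numerically stable softmax per
 * row, subtracting the row maximum before exponentiating so exp() cannot
 * overflow:
 *
 *   softmax(x)_j = exp(x_j - max_k x_k) / sum_j exp(x_j - max_k x_k)
 */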

} // namespace gpu

#endif // KERNELS_H