@@ -118,75 +118,6 @@ fn main(@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>,
118118}
119119)" ;
120120
// matrix multiplication (naive implementation)
// One thread per output element: C[i, j] = dot(row i of A, column j of B).
// {{precision}} and {{workgroupSize}} are substituted when the KernelCode is
// built; {{M}}, {{K}}, {{N}} are substituted by MatmulShader().
static const char *kShaderMatMul1 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let i: u32 = GlobalInvocationID.x / {{N}};
    let j: u32 = GlobalInvocationID.x % {{N}};
    if (i < {{M}} && j < {{N}}) {
        // Accumulator must match the storage precision: with f16 buffers a
        // hard-coded f32 accumulator would not type-check in WGSL.
        var sum: {{precision}} = 0.0;
        for (var k: u32 = 0; k < {{K}}; k = k + 1) {
            sum = sum + A[i * {{K}} + k] * B[k * {{N}} + j];
        }
        C[i * {{N}} + j] = sum;
    }
}
)";
140-
// matrix multiplication (workgroup-tiled implementation)
//
// workgroupSizeX / workgroupSizeY must be defined (or textually substituted)
// before this shader compiles; the tiling below assumes square tiles, i.e.
// workgroupSizeX == workgroupSizeY — TODO confirm against the dispatch site.
// {{M}}, {{K}}, {{N}} are substituted by MatmulShader().
static const char *kShaderMatMul2 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<f32>;
@group(0) @binding(1) var<storage, read_write> B: array<f32>;
@group(0) @binding(2) var<storage, read_write> C: array<f32>;
var<workgroup> tileA: array<f32, workgroupSizeY * workgroupSizeX>;
var<workgroup> tileB: array<f32, workgroupSizeY * workgroupSizeX>;
@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1)
fn matmul(
    @builtin(global_invocation_id) global_id : vec3<u32>,
    @builtin(local_invocation_id) local_id : vec3<u32>,
    @builtin(workgroup_id) workgroup_id : vec3<u32>
) {
    let row = global_id.x;
    let col = global_id.y;
    var result: f32 = 0.0;
    // March over K one tile at a time. No early return here: every thread of
    // the workgroup must reach both workgroupBarrier() calls (WGSL requires
    // uniform control flow at barriers), so out-of-range threads load zeros
    // and merely skip the final store.
    for (var t = 0u; t < {{K}}; t = t + workgroupSizeX) {
        // Cooperative load of one tile of A and one tile of B into workgroup
        // memory. The arrays are 1-D, so index them as base + offset.
        let aCol = t + local_id.y;
        let bRow = t + local_id.x;
        var aVal: f32 = 0.0;
        if (row < {{M}} && aCol < {{K}}) {
            aVal = A[row * {{K}} + aCol];
        }
        var bVal: f32 = 0.0;
        if (bRow < {{K}} && col < {{N}}) {
            bVal = B[bRow * {{N}} + col];
        }
        tileA[local_id.x * workgroupSizeY + local_id.y] = aVal;
        tileB[local_id.x * workgroupSizeY + local_id.y] = bVal;
        // Make the freshly loaded tile visible to the whole workgroup.
        workgroupBarrier();
        // Partial dot product over the current tile.
        for (var k = 0u; k < workgroupSizeX; k = k + 1u) {
            result = result + tileA[local_id.x * workgroupSizeY + k]
                            * tileB[k * workgroupSizeY + local_id.y];
        }
        // Don't overwrite the tile while other threads are still reading it.
        workgroupBarrier();
    }
    if (row < {{M}} && col < {{N}}) {
        C[row * {{N}} + col] = result;
    }
}
)";
175-
176- /* Generates KernelCode instance for all matmul kernels - pass in
177- * the template code via `shaderRaw`.
178- *
179- * This is intended to be run ahead of time, so is not performance critical.
180- * */
181- KernelCode MatmulShader (size_t workgroupSize, const char *shaderRaw,
182- NumType precision, size_t M, size_t K, size_t N) {
183- KernelCode shader = {shaderRaw, workgroupSize, precision};
184- replaceAll (shader.data , " {{M}}" , std::to_string (M));
185- replaceAll (shader.data , " {{K}}" , std::to_string (K));
186- replaceAll (shader.data , " {{N}}" , std::to_string (N));
187- return shader;
188- }
189-
190121/* Softmax
191122 * v1:
192123 * - equivalent to naive softmax with one thread per row
0 commit comments