Skip to content

Commit e87c2fe

Browse files
committed
create skeleton for kernel library + llm.c ports, move deprecated experimental code to experimental/legacy/
1 parent 4241606 commit e87c2fe

18 files changed

Lines changed: 351 additions & 75 deletions

File tree

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
CXX=clang++
2-
GPUCPP ?= $(PWD)/../../..
2+
GPUCPP ?= $(PWD)/../..
33
LIBDIR ?= $(GPUCPP)/third_party/lib
44
LIBSPEC ?= . $(GPUCPP)/source
55
NUM_JOBS?=$(shell nproc)
6-
TARGET=tanh
6+
TARGET=test_kernels
77
ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/null 2>&1 ; echo $$?),0)
88
STDLIB :=
99
else
@@ -22,6 +22,9 @@ run_setup: check-python
2222
build/$(TARGET): run.cpp
2323
mkdir -p build && $(CXX) $(FLAGS) -DNDEBUG -o ./build/$(TARGET)
2424

25+
debug: run.cpp
26+
mkdir -p build && $(CXX) $(FLAGS) -g -o ./build/$(TARGET)
27+
2528
clean:
2629
read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*
2730

experimental/kernels/kernels.h

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
#ifndef KERNELS_H
2+
#define KERNELS_H
3+
4+
#include "gpu.h"
5+
6+
namespace gpu {
7+
8+
9+
// GELU activation (tanh approximation from the GPT-2 / llm.c lineage),
// applied elementwise over the input buffer.
static const char *kShaderGelu = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let i: u32 = GlobalInvocationID.x;
    if (i < arrayLength(&inp)) {
        let x: f32 = inp[i];
        // select is more stable for larger values of x
        out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
                 * (x + .044715 * x * x * x))), x, x > 10.0);
    }
}
)";
25+
26+
27+
// Elementwise hyperbolic tangent: out[i] = tanh(inp[i]).
// BUG FIX: the kernel previously computed tan(x), not tanh(x), despite the
// kernel's name and its use as an activation function.
static const char *kTanh = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let i: u32 = GlobalInvocationID.x;
    if (i < arrayLength(&inp)) {
        let x: f32 = inp[i];
        out[i] = tanh(x);
    }
}
)";
40+
41+
// Elementwise (Hadamard) product: C[i] = A[i] * B[i].
static const char *kShaderHadamard = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let idx = GlobalInvocationID.x;
    if (idx < arrayLength(&A)) {
        C[idx] = A[idx] * B[idx];
    }
}
)";
54+
55+
// Residual connection: elementwise sum C[i] = A[i] + B[i].
static const char *kShaderResidual = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let idx = GlobalInvocationID.x;
    if (idx < arrayLength(&A)) {
        C[idx] = A[idx] + B[idx];
    }
}
)";
68+
69+
/* LayerNorm over a (N, C) input: one thread per row.
 * v1:
 * - No caching mean/std for backwards
 * - No parallel reduction
 * - Simple 1 thread for each 1..N
 */
// TODO(avh): Allow larger virtual 1D workgroups by making use of y / z
// dimensions and calculating the threadID accordingly.
static const char *kShaderLayerNorm1 = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> weight: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> bias: array<{{precision}}>;
@group(0) @binding(3) var<storage, read_write> out: array<{{precision}}>;
@group(0) @binding(4) var<uniform> params: Params;

struct Params {
    N: u32,
    C: u32,
};

@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>,
        @builtin(local_invocation_id) LocalInvocationID: vec3<u32>,
        @builtin(workgroup_id) WorkgroupID: vec3<u32>) {
    let idx: u32 = GlobalInvocationID.x;

    if (idx >= params.N) { return; }

    let C: u32 = params.C;

    // Calculate mean
    var sum: f32 = 0.0;
    for (var i: u32 = 0; i < C; i = i + 1) {
        sum += inp[idx * C + i];
    }
    let mean_val: f32 = sum / f32(C);

    // Calculate rstd
    sum = 0.0;
    for (var i: u32 = 0; i < C; i = i + 1) {
        let diff: f32 = inp[idx * C + i] - mean_val;
        sum += diff * diff;
    }
    let rstd_val: f32 = 1.0 / sqrt(sum / f32(C) + 1e-5);

    for (var i: u32 = 0; i < C; i = i + 1) {
        let n: f32 = rstd_val * (inp[idx * C + i] - mean_val);
        out[idx * C + i] = n * weight[i] + bias[i];
    }
}
)";
120+
121+
// Matrix multiplication C = A @ B, naive implementation: one thread per
// output element, mapped from a 1-D dispatch ({{M}}, {{K}}, {{N}} are
// substituted by MatmulShader ahead of time).
static const char *kShaderMatMul1 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> B: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> C: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let i: u32 = GlobalInvocationID.x / {{N}};
    let j: u32 = GlobalInvocationID.x % {{N}};
    if (i < {{M}} && j < {{N}}) {
        var sum: f32 = 0.0;
        for (var k: u32 = 0; k < {{K}}; k = k + 1) {
            sum = sum + A[i * {{K}} + k] * B[k * {{N}} + j];
        }
        C[i * {{N}} + j] = sum;
    }
}
)";
140+
141+
// Tiled matrix multiplication using workgroup (shared) memory.
// NOTE(review): this shader does not appear usable as written — to confirm:
//  - workgroupSizeX / workgroupSizeY are never declared or template-substituted,
//  - tileA/tileB are declared 1-D but indexed 2-D (tileA[y][x]),
//  - A, B, C are 1-D storage arrays but indexed 2-D (A[row][i]).
// Preserved as-is as a sketch; kShaderMatMul1 is the working kernel.
static const char *kShaderMatMul2 = R"(
@group(0) @binding(0) var<storage, read_write> A: array<f32>;
@group(0) @binding(1) var<storage, read_write> B: array<f32>;
@group(0) @binding(2) var<storage, read_write> C: array<f32>;
var<workgroup> tileA: array<f32, workgroupSizeY * workgroupSizeX>;
var<workgroup> tileB: array<f32, workgroupSizeY * workgroupSizeX>;
@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1)
fn matmul(
    @builtin(global_invocation_id) global_id : vec3<u32>,
    @builtin(local_invocation_id) local_id : vec3<u32>,
    @builtin(workgroup_id) workgroup_id : vec3<u32>
) {
    let row = global_id.x;
    let col = global_id.y;
    if (row >= {{M}} || col >= {{N}}) {
        return;
    }
    var result: f32 = 0.0;
    for (var i = 0u; i < {{K}}; i = i + workgroupSizeX) {
        // Load tiles into shared memory
        tileA[local_id.y][local_id.x] = A[row][i + local_id.x];
        tileB[local_id.y][local_id.x] = B[i + local_id.y][col];
        // Synchronize to make sure the tile is loaded
        workgroupBarrier();
        // Perform partial dot product for the current tile
        for (var k = 0u; k < workgroupSizeX; k = k + 1u) {
            result = result + tileA[local_id.y][k] * tileB[k][local_id.x];
        }
        // Synchronize before loading the next tile
        workgroupBarrier();
    }
    C[row][col] = result;
}
)";
175+
176+
/* Generates KernelCode instance for all matmul kernels - pass in
177+
* the template code via `shaderRaw`.
178+
*
179+
* This is intended to be run ahead of time, so is not performance critical.
180+
* */
181+
KernelCode MatmulShader(size_t workgroupSize, const char *shaderRaw,
182+
NumType precision, size_t M, size_t K, size_t N) {
183+
KernelCode shader = {shaderRaw, workgroupSize, precision};
184+
replaceAll(shader.data, "{{M}}", std::to_string(M));
185+
replaceAll(shader.data, "{{K}}", std::to_string(K));
186+
replaceAll(shader.data, "{{N}}", std::to_string(N));
187+
return shader;
188+
}
189+
190+
/* Softmax over each row of a (N, C) input.
 * v1:
 * - equivalent to naive softmax with one thread per row
 *
 * BUG FIX: `norm` (the reciprocal of the row sum) was computed but never
 * used — the normalization loop divided by `sum` for every element instead.
 * It now multiplies by `norm`, doing one division per row instead of C.
 */
static const char *kShaderSoftmax1 = R"(
@group(0) @binding(0) var<storage, read_write> inp : array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out : array<{{precision}}>;
@group(0) @binding(2) var<uniform> params : Params;
struct Params {
    N: u32,
    C: u32,
};
const NEG_INFINITY: f32 = -3.0e38; // WGSL has problem representing -3.4028235e+38
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
    let N : u32 = params.N;
    let C : u32 = params.C;
    let i : u32 = global_id.x;
    if (i < N) {
        let inp_row_start : u32 = i * C;
        var maxval : f32 = NEG_INFINITY;
        // Find the maximum value in the row
        for (var j : u32 = 0u; j < C; j++) {
            let val : f32 = inp[inp_row_start + j];
            if (val > maxval) {
                maxval = val;
            }
        }
        var sum : f32 = 0.0;
        // Compute the exponentials and sum them
        for (var j : u32 = 0u; j < C; j++) {
            let exp_val : f32 = exp(inp[inp_row_start + j] - maxval);
            out[inp_row_start + j] = exp_val;
            sum += exp_val;
        }
        // Normalize the row to get probabilities
        let norm : f32 = 1.0f / sum;
        for (var j : u32 = 0u; j < C; j++) {
            out[inp_row_start + j] *= norm;
        }
    }
}
)";
233+
234+
} // namespace gpu
235+
236+
#endif // KERNELS_H

experimental/kernels/run.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#include <array>
2+
#include <future>
3+
#include <memory>
4+
#include <random>
5+
6+
#include "gpu.h"
7+
#include "utils/array_utils.h"
8+
#include "utils/logging.h"
9+
10+
#include "llmc/reference_impls.h"
11+
#include "kernels.h"
12+
13+
using namespace gpu;
14+
15+
16+
// Runs the GPU LayerNorm kernel on a small (N, C) batch of random
// integer-valued inputs and checks the result against the CPU reference
// implementation from llmc/reference_impls.h.
void testLayerNorm(Context &ctx) {
  // Uniform parameters consumed by kShaderLayerNorm1.
  struct LNParam {
    uint32_t N; // check
    uint32_t C;
  };
  constexpr size_t N = 6;    // number of rows
  constexpr size_t C = 3072; // channels per row
  std::mt19937 rng(31415);   // fixed seed for reproducibility
  std::array<float, N * C> inputArr;
  std::array<float, N * C> outputArr;
  std::array<float, C> weightArr;
  std::array<float, C> biasArr;
  // Same draw order as before: input, then weight, then bias.
  randint(inputArr, rng, 0, 3);
  randint(weightArr, rng, 0, 5); // populate randomly
  randint(biasArr, rng, 0, 5);
  Tensor input = createTensor(ctx, {N, C}, kf32, inputArr.data());
  Tensor weight = createTensor(ctx, {C}, kf32, weightArr.data());
  Tensor bias = createTensor(ctx, {C}, kf32, biasArr.data());
  Tensor output = createTensor(ctx, {N, C}, kf32, outputArr.data());
  LNParam params = {N, C};
  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  Kernel op = createKernel(ctx, {kShaderLayerNorm1, 256, kf32},
                           Bindings{input, weight, bias, output},
                           /* n threads */ {N, 1, 1}, params);
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  LOG(kDefLog, kInfo, "%s",
      show<float, N, C>(inputArr, "LayerNorm Input").c_str());
  LOG(kDefLog, kInfo, "%s",
      show<float, 1, C>(weightArr, "LayerNorm Weight").c_str());
  LOG(kDefLog, kInfo, "%s",
      show<float, 1, C>(biasArr, "LayerNorm Bias").c_str());
  LOG(kDefLog, kInfo, "%s",
      show<float, N, C>(outputArr, "LayerNorm Output").c_str());
  // Compare against the CPU reference.
  std::array<float, N * C> refOutputArr;
  ref::layernorm_forward_cpu(refOutputArr.data(), inputArr.data(),
                             weightArr.data(), biasArr.data(), N, 1, C);
  LOG(kDefLog, kInfo, "%s",
      show<float, N, C>(refOutputArr,
                        "LayerNorm Reference Implementation Output")
          .c_str());
  bool passed = isclose(outputArr.data(), refOutputArr.data(), N * C);
  assert(passed);
  LOG(kDefLog, kInfo, "LayerNorm passed? %d", passed);
}
63+
64+
// Runs the GPU softmax kernel over (B*T) rows of C channels and checks the
// result against the CPU reference implementation.
void testSoftmax(Context &ctx) {
  // Uniform parameters consumed by kShaderSoftmax1.
  struct SoftmaxParam {
    uint32_t N; // number of rows
    uint32_t C; // channels per row
  };
  static constexpr size_t B = 6;    // batch size
  static constexpr size_t T = 8;    // token index
  static constexpr size_t C = 3072; // input channels
  std::array<float, B * T * C> inputArr;
  std::array<float, B * T * C> outputArr;
  std::mt19937 gen(31415);
  randint(inputArr, gen, 0, 3);
  Tensor input = createTensor(ctx, {B * T, C}, kf32, inputArr.data());
  Tensor output = createTensor(ctx, {B * T, C}, kf32, outputArr.data());
  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  Kernel op = createKernel(
      ctx, {kShaderSoftmax1, 256, kf32}, Bindings{input, output},
      Shape{cdiv(B * T, 256), 1, 1}, SoftmaxParam{B * T, C});
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  LOG(kDefLog, kInfo, "%s",
      show<float, B * T, C>(inputArr, "Softmax Input").c_str());
  LOG(kDefLog, kInfo, "%s",
      show<float, B * T, C>(outputArr, "Softmax Output").c_str());
  std::array<float, B * T * C> refOutputArr;
  ref::softmax_forward_cpu(refOutputArr.data(), inputArr.data(), B * T, C);
  LOG(kDefLog, kInfo, "%s",
      show<float, B * T, C>(refOutputArr, "Softmax reference Output").c_str());
  // BUG FIX: B * T * C is size_t; printing it with %d is undefined behavior
  // on LP64 platforms. Use %zu instead.
  LOG(kDefLog, kInfo, "number of elements: %zu", B * T * C);
  bool passed = isclose(outputArr.data(), refOutputArr.data(), B * T * C);
  assert(passed);
  LOG(kDefLog, kInfo, "Softmax passed? %d", passed);
}
99+
100+
101+
// Entry point: creates one GPU context and runs every kernel test against it.
int main(int argc, char **argv) {
  Context ctx = createContext();
  testLayerNorm(ctx);
  testSoftmax(ctx);
  LOG(kDefLog, kInfo, "Done with all tests");
  return 0;
}
108+
109+

experimental/kernels/tanh/CMakeLists.txt

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)