Skip to content

Commit ca80110

Browse files
Increase the buffer_size limit for matmul
1 parent bdba854 commit ca80110

3 files changed

Lines changed: 73 additions & 17 deletions

File tree

experimental/kernels/kernels.h

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -282,17 +282,31 @@ fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
282282
let T : u32 = params.T;
283283
let C : u32 = params.C;
284284
let OC : u32 = params.OC;
285+
// N == B*T == global_id.x
285286
let b : u32 = global_id.x / T;
286287
let t : u32 = global_id.x % T;
287-
if (b < B && t < T) {
288-
let bt : u32 = b * T + t;
289-
for (var o : u32 = 0u; o < OC; o++) {
290-
var val : {{precision}} = bias[o];
291-
for (var i : u32 = 0u; i < C; i++) {
292-
val += inp[bt * C + i] * weight[o * C + i];
293-
}
294-
out[bt * OC + o] = val;
295-
}
288+
if (arrayLength(&bias) == 1) {
289+
if (b < B && t < T) {
290+
let bt : u32 = global_id.x;
291+
for (var o : u32 = 0u; o < OC; o++) {
292+
var val : {{precision}} = 0;
293+
for (var i : u32 = 0u; i < C; i++) {
294+
val += inp[bt * C + i] * weight[o * C + i];
295+
}
296+
out[bt * OC + o] = val;
297+
}
298+
}
299+
} else {
300+
if (b < B && t < T) {
301+
let bt : u32 = global_id.x;
302+
for (var o : u32 = 0u; o < OC; o++) {
303+
var val : {{precision}} = bias[o];
304+
for (var i : u32 = 0u; i < C; i++) {
305+
val += inp[bt * C + i] * weight[o * C + i];
306+
}
307+
out[bt * OC + o] = val;
308+
}
309+
}
296310
}
297311
}
298312
)";

experimental/kernels/kernels_c.cpp

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,57 @@ void MATMUL_FORWARD_GPU(float* out,
171171
unsigned long c = static_cast<unsigned long>(C);
172172
unsigned long oc = static_cast<unsigned long>(OC);
173173
setLogLevel(kError);
174-
Context ctx = createContext();
175-
Tensor inp_t = createTensor(ctx, Shape{b * t * c}, kf32, inp);
176-
Tensor weight_t = createTensor(ctx, Shape{oc * c}, kf32, weight);
177-
Tensor bias_t = createTensor(ctx, Shape{oc}, kf32, bias);
178-
Tensor out_t = createTensor(ctx, Shape{b * t * oc}, kf32);
174+
// See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp
175+
WGPURequiredLimits requiredLimits = {
176+
.limits = {
177+
.maxTextureDimension1D=8192,
178+
.maxTextureDimension2D=8192,
179+
.maxTextureDimension3D=2048,
180+
.maxTextureArrayLayers=256,
181+
.maxBindGroups=4,
182+
.maxBindGroupsPlusVertexBuffers=24,
183+
.maxBindingsPerBindGroup=1000,
184+
.maxDynamicUniformBuffersPerPipelineLayout=8,
185+
.maxDynamicStorageBuffersPerPipelineLayout=4,
186+
.maxSampledTexturesPerShaderStage=16,
187+
.maxSamplersPerShaderStage=16,
188+
.maxStorageBuffersPerShaderStage=8,
189+
.maxStorageTexturesPerShaderStage=4,
190+
.maxUniformBuffersPerShaderStage=12,
191+
.maxUniformBufferBindingSize=65536,
192+
.maxStorageBufferBindingSize=1073741824,
193+
.minUniformBufferOffsetAlignment=256,
194+
.minStorageBufferOffsetAlignment=256,
195+
.maxVertexBuffers=8,
196+
.maxBufferSize=0x80000000,
197+
.maxVertexAttributes=16,
198+
.maxVertexBufferArrayStride=2048,
199+
.maxInterStageShaderComponents=64,
200+
.maxInterStageShaderVariables=16,
201+
.maxColorAttachments=8,
202+
.maxColorAttachmentBytesPerSample=32,
203+
.maxComputeWorkgroupStorageSize=16384,
204+
.maxComputeInvocationsPerWorkgroup=256,
205+
.maxComputeWorkgroupSizeX=256,
206+
.maxComputeWorkgroupSizeY=256,
207+
.maxComputeWorkgroupSizeZ=64,
208+
.maxComputeWorkgroupsPerDimension=65535
209+
},
210+
.nextInChain = nullptr
211+
};
212+
Context ctx = createContext({},{},{
213+
.requiredLimits = &requiredLimits
214+
});
215+
216+
Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32, inp);
217+
Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32, weight);
218+
Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32, bias);
219+
Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
179220
std::promise<void> promise;
180221
std::future<void> future = promise.get_future();
222+
assert ( (b*t) % 256 == 0 );
181223
Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
182-
Bindings{inp_t, weight_t, bias_t, out_t},
224+
Bindings{inp_i, weight_i, bias_i, out_o},
183225
/* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
184226
/* params */
185227
MatmulParams{
@@ -190,7 +232,7 @@ void MATMUL_FORWARD_GPU(float* out,
190232
});
191233
dispatchKernel(ctx, op, promise);
192234
wait(ctx, future);
193-
toCPU(ctx, out_t, out, b * t * oc * sizeof(float));
235+
toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
194236
}
195237

196238
void MATMUL_BACKWARD_GPU(float* dinp, float* dweight, float* dbias,

experimental/kernels/kernels_c.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ extern "C" {
1010
#define USE_GPU_FOR_LAYERNORM_FORWARD 1
1111
// -- Note: atomicAdd should be used with i32 or u32 not f32.
1212
// #define USE_GPU_FOR_LAYERNORM_BACKWARD 1
13-
// #define USE_GPU_FOR_MATMUL_FORWARD 1
13+
#define USE_GPU_FOR_MATMUL_FORWARD 1
1414
// #define USE_GPU_FOR_MATMUL_BACKWARD 1
1515
#define USE_GPU_FOR_ATTENTION_FORWARD 1
1616
// #define USE_GPU_FOR_ATTENTION_BACKWARD 1

0 commit comments

Comments (0)