Skip to content

Commit 05eede8

Browse files
Increase the limit of buffer_size for encoder
1 parent ca80110 commit 05eede8

2 files changed

Lines changed: 53 additions & 43 deletions

File tree

experimental/kernels/kernels_c.cpp

Lines changed: 52 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,45 @@ using namespace gpu; // createContext, createTensor, createKernel,
1212

1313
#define VOCAB_SIZE 50257
1414

15+
// See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp
16+
#define LIMITS_BUFFER_SIZE_1GB { \
17+
.limits = { \
18+
.maxTextureDimension1D=8192, \
19+
.maxTextureDimension2D=8192, \
20+
.maxTextureDimension3D=2048, \
21+
.maxTextureArrayLayers=256, \
22+
.maxBindGroups=4, \
23+
.maxBindGroupsPlusVertexBuffers=24, \
24+
.maxBindingsPerBindGroup=1000, \
25+
.maxDynamicUniformBuffersPerPipelineLayout=8, \
26+
.maxDynamicStorageBuffersPerPipelineLayout=4, \
27+
.maxSampledTexturesPerShaderStage=16, \
28+
.maxSamplersPerShaderStage=16, \
29+
.maxStorageBuffersPerShaderStage=8, \
30+
.maxStorageTexturesPerShaderStage=4, \
31+
.maxUniformBuffersPerShaderStage=12, \
32+
.maxUniformBufferBindingSize=65536, \
33+
.maxStorageBufferBindingSize=1073741824, \
34+
.minUniformBufferOffsetAlignment=256, \
35+
.minStorageBufferOffsetAlignment=256, \
36+
.maxVertexBuffers=8, \
37+
.maxBufferSize=0x80000000, \
38+
.maxVertexAttributes=16, \
39+
.maxVertexBufferArrayStride=2048, \
40+
.maxInterStageShaderComponents=64, \
41+
.maxInterStageShaderVariables=16, \
42+
.maxColorAttachments=8, \
43+
.maxColorAttachmentBytesPerSample=32, \
44+
.maxComputeWorkgroupStorageSize=16384, \
45+
.maxComputeInvocationsPerWorkgroup=256, \
46+
.maxComputeWorkgroupSizeX=256, \
47+
.maxComputeWorkgroupSizeY=256, \
48+
.maxComputeWorkgroupSizeZ=64, \
49+
.maxComputeWorkgroupsPerDimension=65535 \
50+
}, \
51+
.nextInChain = nullptr \
52+
}
53+
1554
void ENCODER_FORWARD_GPU(float* out,
1655
int* inp, float* wte, float* wpe,
1756
int B, int T, int C){
@@ -25,7 +64,10 @@ void ENCODER_FORWARD_GPU(float* out,
2564
uint32_t C;
2665
};
2766
setLogLevel(kError);
28-
Context ctx = createContext();
67+
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
68+
Context ctx = createContext({},{},{
69+
.requiredLimits = &requiredLimits
70+
});
2971
Tensor input = createTensor(ctx, Shape{b * t}, ki32, inp);
3072
Tensor wte_t = createTensor(ctx, Shape{v, c}, kf32, wte);
3173
Tensor wpe_t = createTensor(ctx, Shape{t, c}, kf32, wpe);
@@ -59,7 +101,10 @@ void ENCODER_BACKWARD_GPU(float* dwte, float* dwpe,
59101
uint32_t C;
60102
};
61103
setLogLevel(kError);
62-
Context ctx = createContext();
104+
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
105+
Context ctx = createContext({},{},{
106+
.requiredLimits = &requiredLimits
107+
});
63108
Tensor dwte_t = createTensor(ctx, Shape{v, c}, kf32, dwte);
64109
Tensor dwpe_t = createTensor(ctx, Shape{t, c}, kf32, dwpe);
65110
Tensor dout_t = createTensor(ctx, Shape{b * t * c}, kf32, dout);
@@ -171,44 +216,7 @@ void MATMUL_FORWARD_GPU(float* out,
171216
unsigned long c = static_cast<unsigned long>(C);
172217
unsigned long oc = static_cast<unsigned long>(OC);
173218
setLogLevel(kError);
174-
// See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp
175-
WGPURequiredLimits requiredLimits = {
176-
.limits = {
177-
.maxTextureDimension1D=8192,
178-
.maxTextureDimension2D=8192,
179-
.maxTextureDimension3D=2048,
180-
.maxTextureArrayLayers=256,
181-
.maxBindGroups=4,
182-
.maxBindGroupsPlusVertexBuffers=24,
183-
.maxBindingsPerBindGroup=1000,
184-
.maxDynamicUniformBuffersPerPipelineLayout=8,
185-
.maxDynamicStorageBuffersPerPipelineLayout=4,
186-
.maxSampledTexturesPerShaderStage=16,
187-
.maxSamplersPerShaderStage=16,
188-
.maxStorageBuffersPerShaderStage=8,
189-
.maxStorageTexturesPerShaderStage=4,
190-
.maxUniformBuffersPerShaderStage=12,
191-
.maxUniformBufferBindingSize=65536,
192-
.maxStorageBufferBindingSize=1073741824,
193-
.minUniformBufferOffsetAlignment=256,
194-
.minStorageBufferOffsetAlignment=256,
195-
.maxVertexBuffers=8,
196-
.maxBufferSize=0x80000000,
197-
.maxVertexAttributes=16,
198-
.maxVertexBufferArrayStride=2048,
199-
.maxInterStageShaderComponents=64,
200-
.maxInterStageShaderVariables=16,
201-
.maxColorAttachments=8,
202-
.maxColorAttachmentBytesPerSample=32,
203-
.maxComputeWorkgroupStorageSize=16384,
204-
.maxComputeInvocationsPerWorkgroup=256,
205-
.maxComputeWorkgroupSizeX=256,
206-
.maxComputeWorkgroupSizeY=256,
207-
.maxComputeWorkgroupSizeZ=64,
208-
.maxComputeWorkgroupsPerDimension=65535
209-
},
210-
.nextInChain = nullptr
211-
};
219+
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
212220
Context ctx = createContext({},{},{
213221
.requiredLimits = &requiredLimits
214222
});
@@ -249,7 +257,10 @@ void MATMUL_BACKWARD_GPU(float* dinp, float* dweight, float* dbias,
249257
unsigned long c = static_cast<unsigned long>(C);
250258
unsigned long oc = static_cast<unsigned long>(OC);
251259
setLogLevel(kError);
252-
Context ctx = createContext();
260+
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
261+
Context ctx = createContext({},{},{
262+
.requiredLimits = &requiredLimits
263+
});
253264
Tensor dinp_t = createTensor(ctx, Shape{b * t * c}, kf32, dinp);
254265
Tensor dweight_t = createTensor(ctx, Shape{oc * c}, kf32, dweight);
255266
Tensor dbias_t = createTensor(ctx, Shape{oc}, kf32, dbias);

experimental/kernels/kernels_c.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ extern "C" {
44

55
// -- USE_GPU_FOR_* are the GPU/CPU switching flags for the kernels in llm.c. --
66

7-
// -- Note: Binding size (154389504) of encoder-Buffer is larger than the maximum binding size (134217728).
8-
// #define USE_GPU_FOR_ENCODER_FORWARD 1
7+
#define USE_GPU_FOR_ENCODER_FORWARD 1
98
// #define USE_GPU_FOR_ENCODER_BACKWARD 1
109
#define USE_GPU_FOR_LAYERNORM_FORWARD 1
1110
// -- Note: atomicAdd should be used with i32 or u32 not f32.

0 commit comments

Comments
 (0)