@@ -171,15 +171,57 @@ void MATMUL_FORWARD_GPU(float* out,
171171 unsigned long c = static_cast <unsigned long >(C);
172172 unsigned long oc = static_cast <unsigned long >(OC);
173173 setLogLevel (kError );
174- Context ctx = createContext ();
175- Tensor inp_t = createTensor (ctx, Shape{b * t * c}, kf32, inp);
176- Tensor weight_t = createTensor (ctx, Shape{oc * c}, kf32, weight);
177- Tensor bias_t = createTensor (ctx, Shape{oc}, kf32, bias);
178- Tensor out_t = createTensor (ctx, Shape{b * t * oc}, kf32);
174+ // See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp
175+ WGPURequiredLimits requiredLimits = {
176+ .limits = {
177+ .maxTextureDimension1D =8192 ,
178+ .maxTextureDimension2D =8192 ,
179+ .maxTextureDimension3D =2048 ,
180+ .maxTextureArrayLayers =256 ,
181+ .maxBindGroups =4 ,
182+ .maxBindGroupsPlusVertexBuffers =24 ,
183+ .maxBindingsPerBindGroup =1000 ,
184+ .maxDynamicUniformBuffersPerPipelineLayout =8 ,
185+ .maxDynamicStorageBuffersPerPipelineLayout =4 ,
186+ .maxSampledTexturesPerShaderStage =16 ,
187+ .maxSamplersPerShaderStage =16 ,
188+ .maxStorageBuffersPerShaderStage =8 ,
189+ .maxStorageTexturesPerShaderStage =4 ,
190+ .maxUniformBuffersPerShaderStage =12 ,
191+ .maxUniformBufferBindingSize =65536 ,
192+ .maxStorageBufferBindingSize =1073741824 ,
193+ .minUniformBufferOffsetAlignment =256 ,
194+ .minStorageBufferOffsetAlignment =256 ,
195+ .maxVertexBuffers =8 ,
196+ .maxBufferSize =0x80000000 ,
197+ .maxVertexAttributes =16 ,
198+ .maxVertexBufferArrayStride =2048 ,
199+ .maxInterStageShaderComponents =64 ,
200+ .maxInterStageShaderVariables =16 ,
201+ .maxColorAttachments =8 ,
202+ .maxColorAttachmentBytesPerSample =32 ,
203+ .maxComputeWorkgroupStorageSize =16384 ,
204+ .maxComputeInvocationsPerWorkgroup =256 ,
205+ .maxComputeWorkgroupSizeX =256 ,
206+ .maxComputeWorkgroupSizeY =256 ,
207+ .maxComputeWorkgroupSizeZ =64 ,
208+ .maxComputeWorkgroupsPerDimension =65535
209+ },
210+ .nextInChain = nullptr
211+ };
212+ Context ctx = createContext ({},{},{
213+ .requiredLimits = &requiredLimits
214+ });
215+
216+ Tensor inp_i = createTensor (ctx, Shape{b * t * c}, kf32, inp);
217+ Tensor weight_i = createTensor (ctx, Shape{oc * c}, kf32, weight);
218+ Tensor bias_i = bias == NULL ? createTensor (ctx, Shape{1 }, kf32) : createTensor (ctx, Shape{oc}, kf32, bias);
219+ Tensor out_o = createTensor (ctx, Shape{b * t * oc}, kf32);
179220 std::promise<void > promise;
180221 std::future<void > future = promise.get_future ();
222+ assert ( (b*t) % 256 == 0 );
181223 Kernel op = createKernel (ctx, {kShaderMatmul , 256 , kf32},
182- Bindings{inp_t , weight_t , bias_t , out_t },
224+ Bindings{inp_i, weight_i, bias_i, out_o },
183225 /* nWorkgroups */ {cdiv (b * t, 256 ), 1 , 1 },
184226 /* params */
185227 MatmulParams{
@@ -190,7 +232,7 @@ void MATMUL_FORWARD_GPU(float* out,
190232 });
191233 dispatchKernel (ctx, op, promise);
192234 wait (ctx, future);
193- toCPU (ctx, out_t , out, b * t * oc * sizeof (float ));
235+ toCPU (ctx, out_o , out, b * t * oc * sizeof (float ));
194236}
195237
196238void MATMUL_BACKWARD_GPU (float * dinp, float * dweight, float * dbias,
0 commit comments