Skip to content

Commit 6e3a240

Browse files
Add python bindings
1 parent 9a42592 commit 6e3a240

3 files changed

Lines changed: 177 additions & 0 deletions

File tree

bindings/python/Makefile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
CXX=clang++
PYTHON=python3
GPUCPP ?= $(PWD)/../..
LIBDIR ?= $(GPUCPP)/third_party/lib
LIBSPEC ?= . $(GPUCPP)/source

# Use the default standard library when it can already compile C++17
# <array>; otherwise fall back to libc++ (older GNU toolchains).
ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/null 2>&1 ; echo $$?),0)
STDLIB :=
else
STDLIB := -stdlib=libc++
endif

# Use $(PYTHON) consistently (was hard-coded python3) so overriding
# PYTHON=... actually affects the include/link flags, and use the valid
# python3-config option --includes (--include is not a recognized flag).
FLAGS=-shared -fPIC -std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib -ldawn \
	`$(PYTHON) -m pybind11 --includes` \
	`$(PYTHON)-config --includes --ldflags --embed`

SUFFIX=$(shell $(PYTHON)-config --extension-suffix)

gpu_cpp$(SUFFIX): gpu_cpp.cpp
	$(CXX) $(FLAGS) -o $@ $<
# install_name_tool only exists (and is only needed) on macOS; guard it so
# the build does not fail on Linux.
ifeq ($(shell uname),Darwin)
	install_name_tool -change @rpath/libdawn.dylib $(LIBDIR)/libdawn.dylib gpu_cpp$(SUFFIX)
endif

test: test_gpu_cpp.py gpu_cpp$(SUFFIX)
	$(PYTHON) test_gpu_cpp.py

.PHONY: test

bindings/python/gpu_cpp.cpp

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#include "gpu.hpp"

#include <array>
#include <cstdio>
#include <future>
#include <memory>
5+
6+
using namespace gpu;
7+
8+
#include <pybind11/pybind11.h>
9+
#include <pybind11/numpy.h>
10+
#include <pybind11/stl.h>
11+
12+
namespace py = pybind11;
13+
14+
// Convert a Python-side list of dimensions into a gpu.cpp Shape.
// Only ranks 1-5 are mapped; a rank of 0 or greater than 5 silently
// degrades to Shape{0} — callers should validate dimensionality upstream.
// NOTE(review): negative dims are not rejected here; the cast would wrap
// to a huge size — confirm whether Python-side validation covers this.
Shape vector_to_shape(const std::vector<int> &dims) {
  // static_cast<size_t> keeps the element type portable: `unsigned long`
  // (the original cast) is only 32 bits on LLP64 platforms such as
  // Windows, while Shape dimensions are presumed size_t — TODO confirm
  // against the Shape declaration in gpu.hpp.
  switch (dims.size()) {
  case 1:
    return Shape{static_cast<size_t>(dims[0])};
  case 2:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1])};
  case 3:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1]),
                 static_cast<size_t>(dims[2])};
  case 4:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1]),
                 static_cast<size_t>(dims[2]), static_cast<size_t>(dims[3])};
  case 5:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1]),
                 static_cast<size_t>(dims[2]), static_cast<size_t>(dims[3]),
                 static_cast<size_t>(dims[4])};
  }
  // Unsupported rank: fall back to a rank-1 zero shape.
  return Shape{0};
}
34+
35+
// Heap-allocate a WebGPU Context for Python; ownership transfers to the
// Python side via pybind11's take_ownership policy.
Context* py_createContext() {
  Context *ctx = new Context(createContext());
  return ctx;
}
38+
39+
// Build a KernelCode from WGSL source, a workgroup size, and a precision
// value (a NumType passed from Python as a plain int).
KernelCode* py_createKernelCode(const std::string &pData, size_t workgroupSize, int precision) {
  const NumType dtype = static_cast<NumType>(precision);
  return new KernelCode(pData, workgroupSize, dtype);
}
42+
43+
// Assemble a Kernel from Python-side tensor bindings and view offsets.
// The Python lists are materialized into contiguous C++ vectors so raw
// pointers can be handed to createKernel().
Kernel* py_createKernel(Context *ctx, const KernelCode *code,
                        const py::list& dataBindings_py,
                        const py::list& viewOffsets_py,
                        const std::vector<int> &totalWorkgroups){
  std::vector<Tensor> tensors;
  tensors.reserve(dataBindings_py.size());
  for (const auto &obj : dataBindings_py) {
    tensors.push_back(obj.cast<Tensor>());
  }

  std::vector<size_t> offsets;
  offsets.reserve(viewOffsets_py.size());
  for (const auto &obj : viewOffsets_py) {
    offsets.push_back(obj.cast<size_t>());
  }

  return new Kernel(createKernel(*ctx, *code, tensors.data(), tensors.size(),
                                 offsets.data(),
                                 vector_to_shape(totalWorkgroups)));
}
59+
60+
// Allocate a GPU tensor with the given dimensions and dtype (NumType as
// int); the returned pointer is owned by Python (take_ownership).
Tensor* py_createTensor(Context *ctx, const std::vector<int> &dims, int dtype) {
  const Shape shape = vector_to_shape(dims);
  return new Tensor(createTensor(*ctx, shape, static_cast<NumType>(dtype)));
}
63+
64+
// Copy a tensor's contents back to the host as a 1-D float32 numpy array
// whose length is derived from the tensor's byte size.
py::array_t<float> py_toCPU_float(Context *ctx, Tensor* tensor) {
  const size_t numBytes = tensor->data.size;
  py::array_t<float> host(numBytes / sizeof(float));
  py::buffer_info info = host.request();
  toCPU(*ctx, *tensor, static_cast<float *>(info.ptr), numBytes);
  return host;
}
70+
71+
72+
// Upload a numpy float32 array into an existing GPU tensor.
// NOTE(review): no size check is performed here — presumably the array is
// at least as large as the tensor; confirm against toGPU's contract.
void py_toGPU_float(Context *ctx, py::array_t<float> array, Tensor *tensor) {
  py::buffer_info info = array.request();
  toGPU(*ctx, static_cast<float *>(info.ptr), *tensor);
}
77+
78+
79+
// Pairs the promise handed to dispatchKernel with the future the Python
// side later waits on; one instance is created per dispatch.
struct GpuAsync {
  std::promise<void> promise;
  // `promise` is declared first, so it is fully constructed before this
  // default member initializer runs.
  std::future<void> future{promise.get_future()};
};
85+
86+
// Launch `kernel` and return a heap-allocated GpuAsync tracking its
// completion; pybind11's take_ownership policy deletes the pointer.
GpuAsync* py_dispatchKernel(Context *ctx, Kernel *kernel) {
  // Hold the allocation in a unique_ptr until dispatch succeeds so a
  // throwing dispatchKernel cannot leak it (the original raw `new` would).
  auto async = std::make_unique<GpuAsync>();
  dispatchKernel(*ctx, *kernel, async->promise);
  return async.release();
}
91+
92+
// Block until the dispatch associated with `async` has completed.
void py_wait(Context *ctx, GpuAsync* async) {
  std::future<void> &fut = async->future;
  wait(*ctx, fut);
}
95+
96+
// Module definition: exposes the thin wrappers above to Python as gpu_cpp.
// Factory functions use take_ownership so Python's GC deletes the
// heap-allocated wrappers they return.
PYBIND11_MODULE(gpu_cpp, m) {
  m.doc() = "gpu.cpp plugin";
  // Opaque handle types: no constructors or methods are exposed; Python
  // only receives these from the factories below and passes them back in.
  py::class_<Context>(m, "Context");
  py::class_<Tensor>(m, "Tensor");
  py::class_<Kernel>(m, "Kernel");
  py::class_<KernelCode>(m, "KernelCode");
  py::class_<GpuAsync>(m, "GpuAsync");
  m.def("create_context", &py_createContext, py::return_value_policy::take_ownership);
  m.def("create_tensor", &py_createTensor, py::return_value_policy::take_ownership);
  m.def("create_kernel", &py_createKernel, py::return_value_policy::take_ownership);
  m.def("create_kernel_code", &py_createKernelCode, py::return_value_policy::take_ownership);
  m.def("dispatch_kernel", &py_dispatchKernel, py::return_value_policy::take_ownership);
  m.def("wait", &py_wait, "Wait for GPU");
  m.def("to_cpu_float", &py_toCPU_float);
  m.def("to_gpu_float", &py_toGPU_float);
  // kf32 is exported as a plain int; Python passes it back as the dtype /
  // precision argument of create_tensor and create_kernel_code.
  m.attr("kf32") = (int)kf32;
}

bindings/python/test_gpu_cpp.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Smoke test for the gpu_cpp pybind11 bindings.

Runs a GELU compute shader over 12 float32 values on the GPU and prints
the result copied back to the host.
"""
import gpu_cpp as gpu
import numpy as np

ctx = gpu.create_context()

N = 12

# Renamed from `input` to avoid shadowing the Python builtin.
input_tensor = gpu.create_tensor(ctx, [N], gpu.kf32)
output = gpu.create_tensor(ctx, [N], gpu.kf32)
# NOTE(review): `dummy` below reuses @binding(1) already taken by `out`.
# It is never referenced by main(), which is presumably why validation
# tolerates it — confirm whether the declaration can simply be dropped.
kernel_code = gpu.create_kernel_code(
    """
    const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
    @group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
    @group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
    @group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;
    @compute @workgroup_size({{workgroupSize}})
    fn main(
        @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
        let i: u32 = GlobalInvocationID.x;
        if (i < arrayLength(&inp)) {
            let x: f32 = inp[i];
            out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
                * (x + .044715 * x * x * x))), x, x > 10.0);
        }
    }
    """,
    256,
    gpu.kf32,
)

# Two bindings (input, output) with zero view offsets; 12x1x1 workgroups.
kernel = gpu.create_kernel(ctx, kernel_code, [input_tensor, output], [0, 0], [12, 1, 1])

gpu.to_gpu_float(ctx, np.array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], np.float32), input_tensor)

gpu_async = gpu.dispatch_kernel(ctx, kernel)

gpu.wait(ctx, gpu_async)

print(gpu.to_cpu_float(ctx, output))

0 commit comments

Comments
 (0)