Skip to content

Commit 6e3a240

Browse files
Add python bindings
1 parent 9a42592 commit 6e3a240

3 files changed

Lines changed: 177 additions & 0 deletions

File tree

bindings/python/Makefile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
CXX=clang++
PYTHON=python3
GPUCPP ?= $(PWD)/../..
LIBDIR ?= $(GPUCPP)/third_party/lib
LIBSPEC ?= . $(GPUCPP)/source

# Use the default standard library when it can already compile C++17
# <array>; otherwise fall back to libc++ (older GNU toolchains).
ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/null 2>&1 ; echo $$?),0)
STDLIB :=
else
STDLIB := -stdlib=libc++
endif

# Use $(PYTHON) consistently (was hard-coded python3) so overriding
# PYTHON=... actually affects the include/link flags, and use the valid
# python3-config option --includes (--include is not a recognized flag).
FLAGS=-shared -fPIC -std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(GPUCPP)/third_party/lib -ldawn \
	`$(PYTHON) -m pybind11 --includes` \
	`$(PYTHON)-config --includes --ldflags --embed`

SUFFIX=$(shell $(PYTHON)-config --extension-suffix)

gpu_cpp$(SUFFIX): gpu_cpp.cpp
	$(CXX) $(FLAGS) -o $@ $<
# install_name_tool only exists (and is only needed) on macOS; guard it so
# the build does not fail on Linux.
ifeq ($(shell uname),Darwin)
	install_name_tool -change @rpath/libdawn.dylib $(LIBDIR)/libdawn.dylib gpu_cpp$(SUFFIX)
endif

test: test_gpu_cpp.py gpu_cpp$(SUFFIX)
	$(PYTHON) test_gpu_cpp.py

.PHONY: test

bindings/python/gpu_cpp.cpp

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#include "gpu.hpp"

#include <array>
#include <cstdio>
#include <future>
#include <memory>
5+
6+
using namespace gpu;
7+
8+
#include <pybind11/pybind11.h>
9+
#include <pybind11/numpy.h>
10+
#include <pybind11/stl.h>
11+
12+
namespace py = pybind11;
13+
14+
// Convert a Python-side list of dimensions into a gpu.cpp Shape.
// Only ranks 1-5 are mapped; a rank of 0 or greater than 5 silently
// degrades to Shape{0} — callers should validate dimensionality upstream.
// NOTE(review): negative dims are not rejected here; the cast would wrap
// to a huge size — confirm whether Python-side validation covers this.
Shape vector_to_shape(const std::vector<int> &dims) {
  // static_cast<size_t> keeps the element type portable: `unsigned long`
  // (the original cast) is only 32 bits on LLP64 platforms such as
  // Windows, while Shape dimensions are presumed size_t — TODO confirm
  // against the Shape declaration in gpu.hpp.
  switch (dims.size()) {
  case 1:
    return Shape{static_cast<size_t>(dims[0])};
  case 2:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1])};
  case 3:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1]),
                 static_cast<size_t>(dims[2])};
  case 4:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1]),
                 static_cast<size_t>(dims[2]), static_cast<size_t>(dims[3])};
  case 5:
    return Shape{static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1]),
                 static_cast<size_t>(dims[2]), static_cast<size_t>(dims[3]),
                 static_cast<size_t>(dims[4])};
  }
  // Unsupported rank: fall back to a rank-1 zero shape.
  return Shape{0};
}
34+
35+
// Heap-allocate a WebGPU Context for Python; ownership transfers to the
// Python side via pybind11's take_ownership policy.
Context* py_createContext() {
  Context *ctx = new Context(createContext());
  return ctx;
}
38+
39+
// Build a KernelCode from WGSL source, a workgroup size, and a precision
// value (a NumType passed from Python as a plain int).
KernelCode* py_createKernelCode(const std::string &pData, size_t workgroupSize, int precision) {
  const NumType dtype = static_cast<NumType>(precision);
  return new KernelCode(pData, workgroupSize, dtype);
}
42+
43+
// Assemble a Kernel from Python-side tensor bindings and view offsets.
// The Python lists are materialized into contiguous C++ vectors so raw
// pointers can be handed to createKernel().
Kernel* py_createKernel(Context *ctx, const KernelCode *code,
                        const py::list& dataBindings_py,
                        const py::list& viewOffsets_py,
                        const std::vector<int> &totalWorkgroups){
  std::vector<Tensor> tensors;
  tensors.reserve(dataBindings_py.size());
  for (const auto &obj : dataBindings_py) {
    tensors.push_back(obj.cast<Tensor>());
  }

  std::vector<size_t> offsets;
  offsets.reserve(viewOffsets_py.size());
  for (const auto &obj : viewOffsets_py) {
    offsets.push_back(obj.cast<size_t>());
  }

  return new Kernel(createKernel(*ctx, *code, tensors.data(), tensors.size(),
                                 offsets.data(),
                                 vector_to_shape(totalWorkgroups)));
}
59+
60+
// Allocate a GPU tensor with the given dimensions and dtype (NumType as
// int); the returned pointer is owned by Python (take_ownership).
Tensor* py_createTensor(Context *ctx, const std::vector<int> &dims, int dtype) {
  const Shape shape = vector_to_shape(dims);
  return new Tensor(createTensor(*ctx, shape, static_cast<NumType>(dtype)));
}
63+
64+
// Copy a tensor's contents back to the host as a 1-D float32 numpy array
// whose length is derived from the tensor's byte size.
py::array_t<float> py_toCPU_float(Context *ctx, Tensor* tensor) {
  const size_t numBytes = tensor->data.size;
  py::array_t<float> host(numBytes / sizeof(float));
  py::buffer_info info = host.request();
  toCPU(*ctx, *tensor, static_cast<float *>(info.ptr), numBytes);
  return host;
}
70+
71+
72+
// Upload a numpy float32 array into an existing GPU tensor.
// NOTE(review): no size check is performed here — presumably the array is
// at least as large as the tensor; confirm against toGPU's contract.
void py_toGPU_float(Context *ctx, py::array_t<float> array, Tensor *tensor) {
  py::buffer_info info = array.request();
  toGPU(*ctx, static_cast<float *>(info.ptr), *tensor);
}
77+
78+
79+
// Pairs the promise handed to dispatchKernel with the future the Python
// side later waits on; one instance is created per dispatch.
struct GpuAsync {
  std::promise<void> promise;
  // `promise` is declared first, so it is fully constructed before this
  // default member initializer runs.
  std::future<void> future{promise.get_future()};
};
85+
86+
// Launch `kernel` and return a heap-allocated GpuAsync tracking its
// completion; pybind11's take_ownership policy deletes the pointer.
GpuAsync* py_dispatchKernel(Context *ctx, Kernel *kernel) {
  // Hold the allocation in a unique_ptr until dispatch succeeds so a
  // throwing dispatchKernel cannot leak it (the original raw `new` would).
  auto async = std::make_unique<GpuAsync>();
  dispatchKernel(*ctx, *kernel, async->promise);
  return async.release();
}
91+
92+
// Block until the dispatch associated with `async` has completed.
void py_wait(Context *ctx, GpuAsync* async) {
  std::future<void> &fut = async->future;
  wait(*ctx, fut);
}
95+
96+
// Module definition: exposes the thin wrappers above to Python as gpu_cpp.
// Factory functions use take_ownership so Python's GC deletes the
// heap-allocated wrappers they return.
PYBIND11_MODULE(gpu_cpp, m) {
  m.doc() = "gpu.cpp plugin";
  // Opaque handle types: no constructors or methods are exposed; Python
  // only receives these from the factories below and passes them back in.
  py::class_<Context>(m, "Context");
  py::class_<Tensor>(m, "Tensor");
  py::class_<Kernel>(m, "Kernel");
  py::class_<KernelCode>(m, "KernelCode");
  py::class_<GpuAsync>(m, "GpuAsync");
  m.def("create_context", &py_createContext, py::return_value_policy::take_ownership);
  m.def("create_tensor", &py_createTensor, py::return_value_policy::take_ownership);
  m.def("create_kernel", &py_createKernel, py::return_value_policy::take_ownership);
  m.def("create_kernel_code", &py_createKernelCode, py::return_value_policy::take_ownership);
  m.def("dispatch_kernel", &py_dispatchKernel, py::return_value_policy::take_ownership);
  m.def("wait", &py_wait, "Wait for GPU");
  m.def("to_cpu_float", &py_toCPU_float);
  m.def("to_gpu_float", &py_toGPU_float);
  // kf32 is exported as a plain int; Python passes it back as the dtype /
  // precision argument of create_tensor and create_kernel_code.
  m.attr("kf32") = (int)kf32;
}

bindings/python/test_gpu_cpp.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Smoke test for the gpu_cpp pybind11 bindings.

Runs a GELU compute shader over 12 float32 values on the GPU and prints
the result copied back to the host.
"""
import gpu_cpp as gpu
import numpy as np

ctx = gpu.create_context()

N = 12

# Renamed from `input` to avoid shadowing the Python builtin.
input_tensor = gpu.create_tensor(ctx, [N], gpu.kf32)
output = gpu.create_tensor(ctx, [N], gpu.kf32)
# NOTE(review): `dummy` below reuses @binding(1) already taken by `out`.
# It is never referenced by main(), which is presumably why validation
# tolerates it — confirm whether the declaration can simply be dropped.
kernel_code = gpu.create_kernel_code(
    """
    const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
    @group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
    @group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
    @group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;
    @compute @workgroup_size({{workgroupSize}})
    fn main(
        @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
        let i: u32 = GlobalInvocationID.x;
        if (i < arrayLength(&inp)) {
            let x: f32 = inp[i];
            out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
                * (x + .044715 * x * x * x))), x, x > 10.0);
        }
    }
    """,
    256,
    gpu.kf32,
)

# Two bindings (input, output) with zero view offsets; 12x1x1 workgroups.
kernel = gpu.create_kernel(ctx, kernel_code, [input_tensor, output], [0, 0], [12, 1, 1])

gpu.to_gpu_float(ctx, np.array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], np.float32), input_tensor)

gpu_async = gpu.dispatch_kernel(ctx, kernel)

gpu.wait(ctx, gpu_async)

print(gpu.to_cpu_float(ctx, output))

0 commit comments

Comments
 (0)