Skip to content

Commit 3228b1b

Browse files
Add haskell bindings
1 parent 6e3a240 commit 3228b1b

8 files changed

Lines changed: 390 additions & 1 deletion

File tree

bindings/haskell/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Revision history for gpu-cpp
2+
3+
## 0.1.0.0 -- 2024-12-28
4+
5+
* First version.

bindings/haskell/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
all:
2+
cabal configure --extra-include-dirs=$(PWD)/../.. --extra-include-dirs=$(PWD)/../../third_party/headers --extra-lib-dirs=$(PWD)/../../third_party/lib
3+
cabal build .

bindings/haskell/app/Main.hs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
module Main where
2+
3+
import GpuCpp.Types
4+
import GpuCpp
5+
import qualified Data.Vector.Storable as V
6+
import Foreign.C.Types
7+
8+
main :: IO ()
9+
main = do
10+
context <- createContext
11+
input <- createTensor context [12] kf32
12+
output <- createTensor context [12] kf32
13+
kernelCode <- createKernelCode
14+
(
15+
"const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)\n" <>
16+
"@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;\n" <>
17+
"@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;\n" <>
18+
"@group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;\n" <>
19+
"@compute @workgroup_size({{workgroupSize}})\n" <>
20+
"fn main(\n" <>
21+
" @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {\n" <>
22+
" let i: u32 = GlobalInvocationID.x;\n" <>
23+
" if (i < arrayLength(&inp)) {\n" <>
24+
" let x: f32 = inp[i];\n" <>
25+
" out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR \n" <>
26+
" * (x + .044715 * x * x * x))), x, x > 10.0);\n" <>
27+
" }\n" <>
28+
"}\n"
29+
)
30+
256
31+
kf32
32+
kernel <- createKernel context kernelCode [input, output] [0,0] [12,1,1]
33+
toGpu context (V.fromList [1 :: CFloat,2,3,4,1,2,3,4,1,2,3,4]) input
34+
async <- dispatchKernel context kernel
35+
wait context async
36+
vec <- toCpu context output :: IO (V.Vector CFloat)
37+
print vec

bindings/haskell/gpu-cpp.cabal

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
cabal-version: 3.0
2+
name: gpu-cpp
3+
version: 0.1.0.0
4+
license: BSD-3-Clause
5+
author: Junji Hashimoto
6+
maintainer: junji.hashimoto@gmail.com
7+
category: Math
8+
build-type: Simple
9+
10+
extra-doc-files: CHANGELOG.md
11+
12+
common warnings
13+
ghc-options: -Wall
14+
15+
library
16+
import: warnings
17+
exposed-modules: GpuCpp
18+
, GpuCpp.Types
19+
build-depends: base ^>=4.18.1.0
20+
, inline-c
21+
, inline-c-cpp
22+
, containers
23+
, template-haskell
24+
, safe-exceptions
25+
, vector
26+
hs-source-dirs: src
27+
default-language: Haskell2010
28+
ghc-options: -optcxx-std=c++17
29+
extra-libraries: dawn
30+
31+
executable gpu-cpp
32+
import: warnings
33+
main-is: Main.hs
34+
build-depends: base ^>=4.18.1.0
35+
, gpu-cpp
36+
, vector
37+
hs-source-dirs: app
38+
default-language: Haskell2010
39+
40+
test-suite gpu-cpp-test
41+
import: warnings
42+
default-language: Haskell2010
43+
type: exitcode-stdio-1.0
44+
hs-source-dirs: test
45+
main-is: Main.hs
46+
build-depends: base ^>=4.18.1.0
47+
, gpu-cpp
48+
, vector
49+
, hspec

bindings/haskell/src/GpuCpp.hs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
{-# LANGUAGE DataKinds #-}
2+
{-# LANGUAGE PolyKinds #-}
3+
{-# LANGUAGE TemplateHaskell #-}
4+
{-# LANGUAGE QuasiQuotes #-}
5+
{-# LANGUAGE OverloadedStrings #-}
6+
{-# LANGUAGE ScopedTypeVariables #-}
7+
{-# LANGUAGE TypeApplications #-}
8+
{-# LANGUAGE MultiParamTypeClasses #-}
9+
{-# LANGUAGE FlexibleInstances #-}
10+
11+
module GpuCpp where
12+
13+
import qualified Language.C.Inline.Cpp as C
14+
import qualified Language.C.Inline.Cpp.Unsafe as C
15+
import qualified Language.C.Inline.Context as C
16+
import Foreign.C.String
17+
import Foreign.C.Types
18+
import GHC.Int
19+
import GHC.ForeignPtr(mallocPlainForeignPtrBytes)
20+
import Foreign
21+
import Control.Monad (forM_)
22+
import GpuCpp.Types
23+
import Control.Exception.Safe (bracket)
24+
import qualified Data.Vector.Storable as V
25+
26+
C.context $ C.cppCtx <> mempty { C.ctxTypesTable = typeTable }
27+
28+
C.include "<gpu.hpp>"
29+
C.include "<future>"
30+
C.include "<vector>"
31+
32+
[C.emitBlock|
33+
struct GpuAsync {
34+
std::promise<void> promise;
35+
std::future<void> future;
36+
GpuAsync(): future(promise.get_future()){
37+
}
38+
};
39+
40+
gpu::Shape vector_to_shape(const std::vector<int64_t> &dims) {
41+
switch(dims.size()){
42+
case 1:
43+
return gpu::Shape{(unsigned long)dims[0]};
44+
break;
45+
case 2:
46+
return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1]};
47+
break;
48+
case 3:
49+
return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2]};
50+
break;
51+
case 4:
52+
return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2],(unsigned long)dims[3]};
53+
break;
54+
case 5:
55+
return gpu::Shape{(unsigned long)dims[0],(unsigned long)dims[1],(unsigned long)dims[2],(unsigned long)dims[3],(unsigned long)dims[4]};
56+
break;
57+
}
58+
return gpu::Shape{0};
59+
}
60+
|]
61+
62+
kf32 :: CInt
63+
kf32 = [C.pure| int { (int)gpu::kf32 } |]
64+
65+
createContext :: IO (ForeignPtr Context)
66+
createContext =
67+
[C.throwBlock| gpu::Context* { return new gpu::Context(gpu::createContext()); }|] >>=
68+
newForeignPtr
69+
[C.funPtr| void deleteContext(gpu::Context* ptr) { delete ptr; }|]
70+
71+
72+
createKernelCode :: String -> CInt -> CInt -> IO (ForeignPtr KernelCode)
73+
createKernelCode kernelString workgroupSize precision =
74+
withCString kernelString $ \pData ->
75+
[C.throwBlock| gpu::KernelCode* { return new gpu::KernelCode($(char* pData), $(int workgroupSize), (gpu::NumType)$(int precision)); }|] >>=
76+
newForeignPtr
77+
[C.funPtr| void deleteKernelCode(gpu::KernelCode* ptr) { delete ptr; }|]
78+
79+
80+
dispatchKernel :: ForeignPtr Context -> ForeignPtr Kernel -> IO (ForeignPtr GpuAsync)
81+
dispatchKernel context kernel =
82+
withForeignPtr context $ \c ->
83+
withForeignPtr kernel $ \k ->
84+
[C.throwBlock| GpuAsync* {
85+
auto async = new GpuAsync();
86+
gpu::dispatchKernel(*$(gpu::Context* c), *$(gpu::Kernel* k), async->promise);
87+
return async; }|] >>=
88+
newForeignPtr
89+
[C.funPtr| void deleteGpuAsync(GpuAsync* ptr) { delete ptr; }|]
90+
91+
wait :: ForeignPtr Context -> ForeignPtr GpuAsync -> IO ()
92+
wait context async =
93+
withForeignPtr context $ \c ->
94+
withForeignPtr async $ \a ->
95+
[C.throwBlock| void {
96+
gpu::wait(*$(gpu::Context* c), $(GpuAsync* a)->future);
97+
}|]
98+
99+
instance WithVector CInt Int64 where
100+
withVector shape func =
101+
bracket
102+
(do
103+
let len = fromIntegral $ length shape
104+
vec <- [C.throwBlock| std::vector<int64_t>* {
105+
return new std::vector<int64_t>($(int len));
106+
}|]
107+
ptr <- [C.throwBlock| int64_t* {
108+
return $(std::vector<int64_t>* vec)->data();
109+
}|]
110+
pokeArray ptr (map fromIntegral shape)
111+
return vec
112+
)
113+
(\vec -> [C.block| void { delete $(std::vector<int64_t>* vec); }|])
114+
(\vec -> func vec)
115+
116+
instance WithVector CInt CSize where
117+
withVector shape func =
118+
bracket
119+
(do
120+
let len = fromIntegral $ length shape
121+
vec <- [C.throwBlock| std::vector<size_t>* {
122+
return new std::vector<size_t>($(int len));
123+
}|]
124+
ptr <- [C.throwBlock| size_t* {
125+
return $(std::vector<size_t>* vec)->data();
126+
}|]
127+
pokeArray ptr (map fromIntegral shape)
128+
return vec
129+
)
130+
(\vec -> [C.block| void { delete $(std::vector<size_t>* vec); }|])
131+
(\vec -> func vec)
132+
133+
instance WithVector (Ptr Tensor) Tensor where
134+
withVector ptrs func =
135+
bracket (do
136+
vec <- [C.throwBlock| std::vector<gpu::Tensor>* { return new std::vector<gpu::Tensor>(); }|]
137+
forM_ ptrs $ do
138+
\ptr -> [C.throwBlock| void { $(std::vector<gpu::Tensor>* vec)->push_back(*$(gpu::Tensor* ptr)); }|]
139+
return vec
140+
)
141+
(\vec -> [C.block| void { delete $(std::vector<gpu::Tensor>* vec); }|])
142+
(\vec -> func vec)
143+
144+
withForeignPtrs :: [ForeignPtr a] -> ([Ptr a] -> IO b) -> IO b
145+
withForeignPtrs [] func = func []
146+
withForeignPtrs (x:xs) func =
147+
withForeignPtr x $ \x' ->
148+
withForeignPtrs xs $ \xs' ->
149+
func (x':xs')
150+
151+
createKernel :: ForeignPtr Context -> ForeignPtr KernelCode -> [ForeignPtr Tensor] -> [Int] -> [Int] -> IO (ForeignPtr Kernel)
152+
createKernel context kernelCode dataBindings viewOffsets totalWorkgroups =
153+
withForeignPtr context $ \c ->
154+
withForeignPtr kernelCode $ \k ->
155+
withForeignPtrs dataBindings $ \b ->
156+
withVector b $ \b' ->
157+
withVector @CInt (map fromIntegral viewOffsets) $ \v ->
158+
withVector @CInt (map fromIntegral totalWorkgroups) $ \w ->
159+
[C.throwBlock| gpu::Kernel* {
160+
return new gpu::Kernel(gpu::createKernel(
161+
*$(gpu::Context* c),
162+
*$(gpu::KernelCode* k),
163+
$(std::vector<gpu::Tensor>* b')->data(),
164+
$(std::vector<gpu::Tensor>* b')->size(),
165+
$(std::vector<size_t>* v)->data(),
166+
vector_to_shape(*$(std::vector<int64_t>* w))));
167+
}|] >>=
168+
newForeignPtr
169+
[C.funPtr| void deleteKernel(gpu::Kernel* ptr) { delete ptr; }|]
170+
171+
createTensor :: ForeignPtr Context -> [CInt] -> CInt -> IO (ForeignPtr Tensor)
172+
createTensor context shape dtype =
173+
withVector shape $ \s ->
174+
withForeignPtr context $ \c ->
175+
[C.throwBlock| gpu::Tensor* {
176+
return new gpu::Tensor(gpu::createTensor(*$(gpu::Context* c), vector_to_shape(*$(std::vector<int64_t>* s)), (gpu::NumType)$(int dtype)));
177+
}|] >>=
178+
newForeignPtr
179+
[C.funPtr| void deleteTensor(gpu::Tensor* ptr) { delete ptr; }|]
180+
181+
createVector :: forall a. Storable a => Int -> IO (V.Vector a)
182+
createVector n = do
183+
ptr <- mallocPlainForeignPtrBytes (n * sizeOf (undefined :: a))
184+
return $ V.unsafeFromForeignPtr ptr 0 n
185+
186+
instance GpuStorable CFloat where
187+
toGpu context array tensor =
188+
withForeignPtr context $ \c ->
189+
withForeignPtr tensor $ \t ->
190+
V.unsafeWith array $ \ptr ->
191+
[C.throwBlock| void {
192+
gpu::toGPU(*$(gpu::Context* c), $(float* ptr), *$(gpu::Tensor* t));
193+
}|]
194+
toCpu context tensor =
195+
withForeignPtr context $ \c ->
196+
withForeignPtr tensor $ \t -> do
197+
(size :: CInt) <- [C.block| int {
198+
size_t u = sizeof(float);
199+
size_t len = $(gpu::Tensor* t)->data.size;
200+
return len/u;
201+
}|]
202+
array <- createVector (fromIntegral size)
203+
V.unsafeWith array $ \ptr ->
204+
[C.throwBlock| void {
205+
gpu::toCPU(*$(gpu::Context* c), *$(gpu::Tensor* t), $(float* ptr), $(int size) * sizeof(float));
206+
}|]
207+
return array
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{-# LANGUAGE DataKinds #-}
2+
{-# LANGUAGE PolyKinds #-}
3+
{-# LANGUAGE TemplateHaskell #-}
4+
{-# LANGUAGE QuasiQuotes #-}
5+
{-# LANGUAGE OverloadedStrings #-}
6+
{-# LANGUAGE MultiParamTypeClasses #-}
7+
8+
module GpuCpp.Types where
9+
10+
import qualified Language.C.Types as C
11+
import qualified Language.Haskell.TH.Lib as TH
12+
import qualified Data.Map as Map
13+
import Foreign
14+
import qualified Data.Vector.Storable as V
15+
16+
data Context
17+
data Tensor
18+
data Kernel
19+
data KernelCode
20+
data GpuAsync
21+
data StdVector a
22+
23+
typeTable :: Map.Map C.TypeSpecifier TH.TypeQ
24+
typeTable = Map.fromList [
25+
(C.TypeName "gpu::Context", [t|Context|])
26+
, (C.TypeName "gpu::Tensor", [t|Tensor|])
27+
, (C.TypeName "gpu::Kernel", [t|Kernel|])
28+
, (C.TypeName "gpu::KernelCode", [t|KernelCode|])
29+
, (C.TypeName "GpuAsync", [t|GpuAsync|])
30+
, (C.TypeName "std::vector", [t|StdVector|])
31+
]
32+
33+
34+
class WithVector a b where
35+
withVector :: [a] -> (Ptr (StdVector b) -> IO c) -> IO c
36+
37+
class GpuStorable a where
38+
toGpu :: ForeignPtr Context -> V.Vector a -> ForeignPtr Tensor -> IO ()
39+
toCpu :: ForeignPtr Context -> ForeignPtr Tensor -> IO (V.Vector a)
40+

bindings/haskell/test/Main.hs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
module Main (main) where
2+
3+
import Test.Hspec
4+
import GpuCpp.Types
5+
import GpuCpp
6+
import qualified Data.Vector.Storable as V
7+
import Foreign.C.Types
8+
9+
gelu :: String
10+
gelu= "const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)\n" <>
11+
"@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;\n" <>
12+
"@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;\n" <>
13+
"@group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;\n" <>
14+
"@compute @workgroup_size({{workgroupSize}})\n" <>
15+
"fn main(\n" <>
16+
" @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {\n" <>
17+
" let i: u32 = GlobalInvocationID.x;\n" <>
18+
" if (i < arrayLength(&inp)) {\n" <>
19+
" let x: f32 = inp[i];\n" <>
20+
" out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR \n" <>
21+
" * (x + .044715 * x * x * x))), x, x > 10.0);\n" <>
22+
" }\n" <>
23+
"}\n"
24+
25+
main :: IO ()
26+
main = do
27+
hspec $ do
28+
describe "toCPU and toGPU" $ do
29+
it "writes and reads back" $ do
30+
context <- createContext
31+
input <- createTensor context [12] kf32
32+
toGpu context (V.fromList [1 :: CFloat,2,3,4,1,2,3,4,1,2,3,4]) input
33+
output <- toCpu context input :: IO (V.Vector CFloat)
34+
V.toList output `shouldBe` [1,2,3,4,1,2,3,4,1,2,3,4]
35+
describe "call kernel" $ do
36+
it "gelu" $ do
37+
context <- createContext
38+
input <- createTensor context [12] kf32
39+
output <- createTensor context [12] kf32
40+
kernelCode <- createKernelCode gelu 256 kf32
41+
kernel <- createKernel context kernelCode [input, output] [0,0] [12,1,1]
42+
toGpu context (V.fromList [1 :: CFloat,2,3,4,1,2,3,4,1,2,3,4]) input
43+
async <- dispatchKernel context kernel
44+
wait context async
45+
vec <- toCpu context output :: IO (V.Vector CFloat)
46+
V.toList (V.zipWith (\a b -> abs (a - b))
47+
vec
48+
(V.fromList [0.841192,1.9545977,2.9963627,3.9999297,0.841192,1.9545977,2.9963627,3.9999297,0.841192,1.9545977,2.9963627,3.9999297]))
49+
`shouldSatisfy` all (< 0.001)

0 commit comments

Comments
 (0)