Skip to content

Commit 1723b03

Browse files
authored
Merge pull request #17 from JuliaGPU/jps/cuda-ipc
Optimize CuArrayDeviceProc with IPC and DtoD
2 parents 3a3aeba + 575dc00 commit 1723b03

5 files changed

Lines changed: 111 additions & 56 deletions

File tree

.gitlab-ci.yml

Lines changed: 0 additions & 31 deletions
This file was deleted.

Project.toml

Lines changed: 5 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -1,19 +1,21 @@
11
name = "DaggerGPU"
22
uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
33
authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>"]
4-
version = "0.1.2"
4+
version = "0.1.3"
55

66
[deps]
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
88
Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
99
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
1010
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
11+
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
1112
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
13+
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1214

1315
[compat]
1416
Adapt = "1, 2, 3"
15-
Dagger = "0.10, 0.11"
16-
KernelAbstractions = "0.4"
17+
Dagger = "0.13.3"
18+
KernelAbstractions = "0.5, 0.6, 0.7"
1719
Requires = "1"
1820
julia = "1"
1921

src/DaggerGPU.jl

Lines changed: 26 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -1,9 +1,11 @@
11
module DaggerGPU
22

3-
using Dagger, Requires, Adapt
3+
using Dagger, MemPool, Requires, Adapt
44
using Distributed
55
using KernelAbstractions
66

7+
import Dagger: Chunk
8+
79
macro gpuproc(PROC, T)
810
quote
911
# Assume that we can run anything
@@ -14,8 +16,28 @@ macro gpuproc(PROC, T)
1416
Dagger.iscompatible_arg(proc::Dagger.ThreadProc, opts, x::$T) = false
1517

1618
# Adapt to/from the appropriate type
17-
Dagger.move(from_proc::OSProc, to_proc::$PROC, x) = adapt($T, x)
18-
Dagger.move(from_proc::$PROC, to_proc::OSProc, x) = adapt(Array, x)
19+
function Dagger.move(from_proc::OSProc, to_proc::$PROC, x::Chunk)
20+
from_pid = from_proc.pid
21+
to_pid = Dagger.get_parent(to_proc).pid
22+
@assert myid() == to_pid
23+
adapt($T, remotecall_fetch(from_pid, x) do x
24+
poolget(x.handle)
25+
end)
26+
end
27+
function Dagger.move(from_proc::$PROC, to_proc::OSProc, x::Chunk)
28+
from_pid = Dagger.get_parent(from_proc).pid
29+
to_pid = to_proc.pid
30+
@assert myid() == to_pid
31+
remotecall_fetch(from_pid, x) do x
32+
adapt(Array, poolget(x.handle))
33+
end
34+
end
35+
function Dagger.move(from_proc::OSProc, to_proc::$PROC, x)
36+
adapt($T, x)
37+
end
38+
function Dagger.move(from_proc::$PROC, to_proc::OSProc, x)
39+
adapt(Array, x)
40+
end
1941
end
2042
end
2143

@@ -31,7 +53,7 @@ function __init__()
3153
@require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
3254
include("cu.jl")
3355
end
34-
@require ROCArrays="ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc" begin
56+
@require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin
3557
include("roc.jl")
3658
end
3759
end

src/cu.jl

Lines changed: 76 additions & 14 deletions
Original file line number · Diff line number · Diff line change
@@ -1,42 +1,104 @@
11
using .CUDA
22
import .CUDA: CuDevice, CuContext, devices, attribute
33

4+
using UUIDs
5+
46
export CuArrayDeviceProc
57

68
"Represents a single CUDA GPU device."
79
struct CuArrayDeviceProc <: Dagger.Processor
810
owner::Int
9-
#ctx::CuContext
1011
device::Int
12+
device_uuid::UUID
1113
end
1214
@gpuproc(CuArrayDeviceProc, CuArray)
13-
#= FIXME: DtoD copies and CUDA IPC
14-
function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
15-
if from === to
16-
return x
15+
Dagger.get_parent(proc::CuArrayDeviceProc) = Dagger.OSProc(proc.owner)
16+
17+
# function can_access(this, peer)
18+
# status = Ref{Cint}()
19+
# CUDA.cuDeviceCanAccessPeer(status, this, peer)
20+
# return status[] == 1
21+
# end
22+
23+
function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.Chunk{T}) where T<:CuArray
24+
if from == to
25+
# Same process and GPU, no change
26+
poolget(x.handle)
27+
elseif from.owner == to.owner
28+
# Same process but different GPUs, use DtoD copy
29+
from_arr = poolget(x.handle)
30+
to_arr = CUDA.device!(to.device) do
31+
CuArray{T,N}(undef, size)
32+
end
33+
copyto!(to_arr, from_arr)
34+
to_arr
35+
elseif Dagger.system_uuid(from.owner) == Dagger.system_uuid(to.owner)
36+
# Same node, we can use IPC
37+
ipc_handle, eT, shape = remotecall_fetch(from.owner, x.handle) do h
38+
arr = poolget(h)
39+
ipc_handle_ref = Ref{CUDA.CUipcMemHandle}()
40+
GC.@preserve arr begin
41+
CUDA.cuIpcGetMemHandle(ipc_handle_ref, pointer(arr))
42+
end
43+
(ipc_handle_ref[], eltype(arr), size(arr))
44+
end
45+
r_ptr = Ref{CUDA.CUdeviceptr}()
46+
CUDA.device!(from.device) do # FIXME: Assumes that device IDs are identical across processes
47+
CUDA.cuIpcOpenMemHandle(r_ptr, ipc_handle, CUDA.CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
48+
end
49+
ptr = Base.unsafe_convert(CUDA.CuPtr{eT}, r_ptr[])
50+
arr = unsafe_wrap(CuArray, ptr, shape; own=false)
51+
finalizer(arr) do arr
52+
CUDA.cuIpcCloseMemHandle(pointer(arr))
53+
end
54+
if from.device_uuid != to.device_uuid
55+
CUDA.device!(to.device) do
56+
to_arr = similar(arr)
57+
copyto!(to_arr, arr)
58+
to_arr
59+
end
60+
else
61+
arr
62+
end
1763
else
18-
error("Not implemented")
64+
# Different node, use DtoH, serialization, HtoD
65+
# TODO UCX
66+
CuArray(remotecall_fetch(from.owner, x.handle) do h
67+
Array(poolget(h))
68+
end)
1969
end
2070
end
21-
=#
71+
2272
function Dagger.execute!(proc::CuArrayDeviceProc, func, args...)
23-
fetch(Threads.@spawn begin
24-
task_local_storage(:processor, proc)
73+
tls = Dagger.get_tls()
74+
task = Threads.@spawn begin
75+
Dagger.set_tls!(tls)
2576
CUDA.device!(proc.device)
2677
CUDA.@sync func(args...)
27-
end)
78+
end
79+
try
80+
fetch(task)
81+
catch err
82+
@static if VERSION >= v"1.1"
83+
stk = Base.catch_stack(task)
84+
err, frames = stk[1]
85+
rethrow(CapturedException(err, frames))
86+
else
87+
rethrow(task.result)
88+
end
89+
end
2890
end
2991
Base.show(io::IO, proc::CuArrayDeviceProc) =
30-
print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device)")
92+
print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device), uuid $(proc.device_uuid)")
3193

3294
processor(::Val{:CUDA}) = CuArrayDeviceProc
3395
cancompute(::Val{:CUDA}) = CUDA.has_cuda()
3496
kernel_backend(::CuArrayDeviceProc) = CUDADevice()
3597

3698
if CUDA.has_cuda()
3799
for dev in devices()
38-
Dagger.add_callback!(proc -> begin
39-
return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev.handle)
40-
end)
100+
Dagger.add_processor_callback!("cuarray_device_$(dev.handle)") do
101+
CuArrayDeviceProc(Distributed.myid(), dev.handle, CUDA.uuid(dev))
102+
end
41103
end
42104
end

src/roc.jl

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -10,10 +10,10 @@ Dagger.execute!(proc::ROCArrayProc, func, args...) = func(args...)
1010

1111
processor(::Val{:ROC}) = ROCArrayProc
1212
cancompute(::Val{:ROC}) = AMDGPU.configured
13-
# FIXME: kernel_backend(::ROCDevice) = ROCArrayProc
13+
kernel_backend(::ROCDevice) = ROCArrayProc
1414

1515
if AMDGPU.configured
16-
Dagger.add_callback!(proc -> begin
17-
return ROCArrayProc(AMDGPU.get_default_agent())
18-
end)
16+
Dagger.add_processor_callback!("rocarray_device_default") do
17+
ROCArrayProc(AMDGPU.get_default_agent())
18+
end
1919
end

0 commit comments

Comments (0)