Skip to content

Commit 8f9d530

Browse files
authored
Merge pull request #33 from JuliaGPU/jps/fix-multigpu-move
Fix multi-GPU data movement
2 parents 319a71e + 10f3c85 commit 8f9d530

3 files changed

Lines changed: 64 additions & 28 deletions

File tree

Project.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@ MetalExt = "Metal"
2626
ROCExt = "AMDGPU"
2727

2828
[compat]
29-
AMDGPU = "0.4"
30-
Adapt = "1, 2, 3"
31-
CUDA = "3, 4"
29+
AMDGPU = "0.8.1"
30+
Adapt = "1, 2, 3, 4"
31+
CUDA = "3, 4, 5"
3232
Dagger = "0.17, 0.18"
3333
KernelAbstractions = "0.9"
3434
MemPool = "0.3, 0.4"
35-
Metal = "0.3, 0.4"
35+
Metal = "0.3, 0.4, 0.5"
3636
Requires = "1"
3737
julia = "1.7"

ext/CUDAExt.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,24 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.C
8080
end
8181
end
8282

# Move data for a task scheduled on CUDA GPU `to_proc` when the input is a
# `CuArray` that is already resident on *some* device. If it already lives on
# the target device it is passed through untouched; otherwise the data is
# copied onto the target device.
# NOTE(review): the `from_proc::CPUProc` signature with a `CuArray` payload
# appears intentional — the array arrives via a CPU-side processor but is
# GPU-resident; confirm against Dagger's move dispatch rules.
# TODO: No extra allocations here
function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray)
    # `to_proc.device` is a 0-based device ordinal; index into the 1-based
    # collected device list to find the target device.
    target_dev = collect(CUDA.devices())[to_proc.device + 1]
    CUDA.device(x) == target_dev && return x
    # Cross-device: allocate on the target device and copy over.
    return DaggerGPU.with_device(to_proc) do
        dest = similar(x)
        copyto!(dest, x)
        dest
    end
end
# Move a GPU-resident `CuArray` back to host memory for a CPU processor:
# allocate a host `Array` of matching eltype/shape and copy the device data in.
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{T,N}) where {T,N}
    host = Array{T,N}(undef, size(x))
    copyto!(host, x)
    return host
end
83101
function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
84102
@nospecialize f args kwargs
85103
tls = Dagger.get_tls()

test/runtests.jl

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -79,28 +79,37 @@ end
7979
CuArrayDeviceProc
8080
end
8181
@test DaggerGPU.processor(:CUDA) === cuproc
82-
b = generate_thunks()
83-
c = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=1)) do
84-
@test fetch(Dagger.@spawn isongpu(b))
85-
Dagger.@spawn sum(b)
82+
ndevices = length(collect(CUDA.devices()))
83+
84+
@testset "Arrays (GPU $gpu)" for gpu in 1:min(ndevices, 2)
85+
b = generate_thunks()
86+
c = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=gpu)) do
87+
@test fetch(Dagger.@spawn isongpu(b))
88+
Dagger.@spawn sum(b)
89+
end
90+
@test !fetch(Dagger.@spawn isongpu(b))
91+
@test fetch(Dagger.@spawn identity(c)) == 20
8692
end
87-
@test !fetch(Dagger.@spawn isongpu(b))
88-
@test fetch(Dagger.@spawn identity(c)) == 20
8993

90-
@testset "KernelAbstractions" begin
94+
@testset "KernelAbstractions (GPU $gpu)" for gpu in 1:min(ndevices, 2)
9195
A = rand(Float32, 8)
92-
DA, T = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=1)) do
96+
DA, T = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=gpu)) do
9397
fetch(Dagger.@spawn fill_thunk(A, 2.3f0))
9498
end
9599
@test all(DA .== 2.3f0)
96100
@test T <: CuArray
97101

98-
A = CUDA.rand(128)
99-
B = CUDA.zeros(128)
100-
Dagger.with_options(;scope=Dagger.scope(worker=1,cuda_gpu=1)) do
102+
local A, B
103+
CUDA.device!(gpu-1) do
104+
A = CUDA.rand(128)
105+
B = CUDA.zeros(128)
106+
end
107+
Dagger.with_options(;scope=Dagger.scope(worker=1,cuda_gpu=gpu)) do
101108
fetch(Dagger.@spawn Kernel(copy_kernel)(B, A; ndrange=length(A)))
102109
end
103-
@test all(B .== A)
110+
CUDA.device!(gpu-1) do
111+
@test all(B .== A)
112+
end
104113
end
105114
end
106115
end
@@ -115,28 +124,37 @@ end
115124
ROCArrayDeviceProc
116125
end
117126
@test DaggerGPU.processor(:ROC) === rocproc
118-
b = generate_thunks()
119-
c = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=1)) do
120-
@test fetch(Dagger.@spawn isongpu(b))
121-
Dagger.@spawn sum(b)
127+
ndevices = length(AMDGPU.devices())
128+
129+
@testset "Arrays (GPU $gpu)" for gpu in 1:min(ndevices, 2)
130+
b = generate_thunks()
131+
c = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=gpu)) do
132+
@test fetch(Dagger.@spawn isongpu(b))
133+
Dagger.@spawn sum(b)
134+
end
135+
@test !fetch(Dagger.@spawn isongpu(b))
136+
@test fetch(Dagger.@spawn identity(c)) == 20
122137
end
123-
@test !fetch(Dagger.@spawn isongpu(b))
124-
@test fetch(Dagger.@spawn identity(c)) == 20
125138

126-
@testset "KernelAbstractions" begin
139+
@testset "KernelAbstractions (GPU $gpu)" for gpu in 1:min(ndevices, 2)
127140
A = rand(Float32, 8)
128-
DA, T = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=1)) do
141+
DA, T = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=gpu)) do
129142
fetch(Dagger.@spawn fill_thunk(A, 2.3f0))
130143
end
131144
@test all(DA .== 2.3f0)
132145
@test T <: ROCArray
133146

134-
A = AMDGPU.rand(128)
135-
B = AMDGPU.zeros(128)
136-
Dagger.with_options(;scope=Dagger.scope(worker=1,rocm_gpu=1)) do
147+
local A, B
148+
AMDGPU.device!(AMDGPU.devices()[gpu]) do
149+
A = AMDGPU.rand(128)
150+
B = AMDGPU.zeros(128)
151+
end
152+
Dagger.with_options(;scope=Dagger.scope(worker=1,rocm_gpu=gpu)) do
137153
fetch(Dagger.@spawn Kernel(copy_kernel)(B, A; ndrange=length(A)))
138154
end
139-
@test all(B .== A)
155+
AMDGPU.device!(AMDGPU.devices()[gpu]) do
156+
@test all(B .== A)
157+
end
140158
end
141159
end
142160
end

0 commit comments

Comments
 (0)