Skip to content

Commit 8f9d530

Browse files
authored
Merge pull request #33 from JuliaGPU/jps/fix-multigpu-move
Fix multi-GPU data movement
2 parents 319a71e + 10f3c85 commit 8f9d530

3 files changed

Lines changed: 64 additions & 28 deletions

File tree

Project.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@ MetalExt = "Metal"
2626
ROCExt = "AMDGPU"
2727

2828
[compat]
29-
AMDGPU = "0.4"
30-
Adapt = "1, 2, 3"
31-
CUDA = "3, 4"
29+
AMDGPU = "0.8.1"
30+
Adapt = "1, 2, 3, 4"
31+
CUDA = "3, 4, 5"
3232
Dagger = "0.17, 0.18"
3333
KernelAbstractions = "0.9"
3434
MemPool = "0.3, 0.4"
35-
Metal = "0.3, 0.4"
35+
Metal = "0.3, 0.4, 0.5"
3636
Requires = "1"
3737
julia = "1.7"

ext/CUDAExt.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,24 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.C
8080
end
8181
end
8282

# Move data for a task scheduled on CUDA GPU `to_proc` when the input is a
# `CuArray` that is already resident on *some* device. If it already lives on
# the target device it is passed through untouched; otherwise the data is
# copied onto the target device.
# NOTE(review): the `from_proc::CPUProc` signature with a `CuArray` payload
# appears intentional — the array arrives via a CPU-side processor but is
# GPU-resident; confirm against Dagger's move dispatch rules.
# TODO: No extra allocations here
function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray)
    # `to_proc.device` is a 0-based device ordinal; index into the 1-based
    # collected device list to find the target device.
    target_dev = collect(CUDA.devices())[to_proc.device + 1]
    CUDA.device(x) == target_dev && return x
    # Cross-device: allocate on the target device and copy over.
    return DaggerGPU.with_device(to_proc) do
        dest = similar(x)
        copyto!(dest, x)
        dest
    end
end
# Move a GPU-resident `CuArray` back to host memory for a CPU processor:
# allocate a host `Array` of matching eltype/shape and copy the device data in.
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{T,N}) where {T,N}
    host = Array{T,N}(undef, size(x))
    copyto!(host, x)
    return host
end
83101
function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
84102
@nospecialize f args kwargs
85103
tls = Dagger.get_tls()

test/runtests.jl

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -79,28 +79,37 @@ end
7979
CuArrayDeviceProc
8080
end
8181
@test DaggerGPU.processor(:CUDA) === cuproc
82-
b = generate_thunks()
83-
c = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=1)) do
84-
@test fetch(Dagger.@spawn isongpu(b))
85-
Dagger.@spawn sum(b)
82+
ndevices = length(collect(CUDA.devices()))
83+
84+
@testset "Arrays (GPU $gpu)" for gpu in 1:min(ndevices, 2)
85+
b = generate_thunks()
86+
c = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=gpu)) do
87+
@test fetch(Dagger.@spawn isongpu(b))
88+
Dagger.@spawn sum(b)
89+
end
90+
@test !fetch(Dagger.@spawn isongpu(b))
91+
@test fetch(Dagger.@spawn identity(c)) == 20
8692
end
87-
@test !fetch(Dagger.@spawn isongpu(b))
88-
@test fetch(Dagger.@spawn identity(c)) == 20
8993

90-
@testset "KernelAbstractions" begin
94+
@testset "KernelAbstractions (GPU $gpu)" for gpu in 1:min(ndevices, 2)
9195
A = rand(Float32, 8)
92-
DA, T = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=1)) do
96+
DA, T = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=gpu)) do
9397
fetch(Dagger.@spawn fill_thunk(A, 2.3f0))
9498
end
9599
@test all(DA .== 2.3f0)
96100
@test T <: CuArray
97101

98-
A = CUDA.rand(128)
99-
B = CUDA.zeros(128)
100-
Dagger.with_options(;scope=Dagger.scope(worker=1,cuda_gpu=1)) do
102+
local A, B
103+
CUDA.device!(gpu-1) do
104+
A = CUDA.rand(128)
105+
B = CUDA.zeros(128)
106+
end
107+
Dagger.with_options(;scope=Dagger.scope(worker=1,cuda_gpu=gpu)) do
101108
fetch(Dagger.@spawn Kernel(copy_kernel)(B, A; ndrange=length(A)))
102109
end
103-
@test all(B .== A)
110+
CUDA.device!(gpu-1) do
111+
@test all(B .== A)
112+
end
104113
end
105114
end
106115
end
@@ -115,28 +124,37 @@ end
115124
ROCArrayDeviceProc
116125
end
117126
@test DaggerGPU.processor(:ROC) === rocproc
118-
b = generate_thunks()
119-
c = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=1)) do
120-
@test fetch(Dagger.@spawn isongpu(b))
121-
Dagger.@spawn sum(b)
127+
ndevices = length(AMDGPU.devices())
128+
129+
@testset "Arrays (GPU $gpu)" for gpu in 1:min(ndevices, 2)
130+
b = generate_thunks()
131+
c = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=gpu)) do
132+
@test fetch(Dagger.@spawn isongpu(b))
133+
Dagger.@spawn sum(b)
134+
end
135+
@test !fetch(Dagger.@spawn isongpu(b))
136+
@test fetch(Dagger.@spawn identity(c)) == 20
122137
end
123-
@test !fetch(Dagger.@spawn isongpu(b))
124-
@test fetch(Dagger.@spawn identity(c)) == 20
125138

126-
@testset "KernelAbstractions" begin
139+
@testset "KernelAbstractions (GPU $gpu)" for gpu in 1:min(ndevices, 2)
127140
A = rand(Float32, 8)
128-
DA, T = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=1)) do
141+
DA, T = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=gpu)) do
129142
fetch(Dagger.@spawn fill_thunk(A, 2.3f0))
130143
end
131144
@test all(DA .== 2.3f0)
132145
@test T <: ROCArray
133146

134-
A = AMDGPU.rand(128)
135-
B = AMDGPU.zeros(128)
136-
Dagger.with_options(;scope=Dagger.scope(worker=1,rocm_gpu=1)) do
147+
local A, B
148+
AMDGPU.device!(AMDGPU.devices()[gpu]) do
149+
A = AMDGPU.rand(128)
150+
B = AMDGPU.zeros(128)
151+
end
152+
Dagger.with_options(;scope=Dagger.scope(worker=1,rocm_gpu=gpu)) do
137153
fetch(Dagger.@spawn Kernel(copy_kernel)(B, A; ndrange=length(A)))
138154
end
139-
@test all(B .== A)
155+
AMDGPU.device!(AMDGPU.devices()[gpu]) do
156+
@test all(B .== A)
157+
end
140158
end
141159
end
142160
end

0 commit comments

Comments
 (0)