Skip to content

Commit 1723b03

Browse files
authored
Merge pull request #17 from JuliaGPU/jps/cuda-ipc
Optimize CuArrayDeviceProc with IPC and DtoD
2 parents 3a3aeba + 575dc00 commit 1723b03

5 files changed

Lines changed: 111 additions & 56 deletions

File tree

.gitlab-ci.yml

Lines changed: 0 additions & 31 deletions
This file was deleted.

Project.toml

Lines changed: 5 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -1,19 +1,21 @@
11
name = "DaggerGPU"
22
uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
33
authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>"]
4-
version = "0.1.2"
4+
version = "0.1.3"
55

66
[deps]
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
88
Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
99
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
1010
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
11+
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
1112
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
13+
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1214

1315
[compat]
1416
Adapt = "1, 2, 3"
15-
Dagger = "0.10, 0.11"
16-
KernelAbstractions = "0.4"
17+
Dagger = "0.13.3"
18+
KernelAbstractions = "0.5, 0.6, 0.7"
1719
Requires = "1"
1820
julia = "1"
1921

src/DaggerGPU.jl

Lines changed: 26 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -1,9 +1,11 @@
11
module DaggerGPU
22

3-
using Dagger, Requires, Adapt
3+
using Dagger, MemPool, Requires, Adapt
44
using Distributed
55
using KernelAbstractions
66

7+
import Dagger: Chunk
8+
79
macro gpuproc(PROC, T)
810
quote
911
# Assume that we can run anything
@@ -14,8 +16,28 @@ macro gpuproc(PROC, T)
1416
Dagger.iscompatible_arg(proc::Dagger.ThreadProc, opts, x::$T) = false
1517

1618
# Adapt to/from the appropriate type
17-
Dagger.move(from_proc::OSProc, to_proc::$PROC, x) = adapt($T, x)
18-
Dagger.move(from_proc::$PROC, to_proc::OSProc, x) = adapt(Array, x)
19+
function Dagger.move(from_proc::OSProc, to_proc::$PROC, x::Chunk)
20+
from_pid = from_proc.pid
21+
to_pid = Dagger.get_parent(to_proc).pid
22+
@assert myid() == to_pid
23+
adapt($T, remotecall_fetch(from_pid, x) do x
24+
poolget(x.handle)
25+
end)
26+
end
27+
function Dagger.move(from_proc::$PROC, to_proc::OSProc, x::Chunk)
28+
from_pid = Dagger.get_parent(from_proc).pid
29+
to_pid = to_proc.pid
30+
@assert myid() == to_pid
31+
remotecall_fetch(from_pid, x) do x
32+
adapt(Array, poolget(x.handle))
33+
end
34+
end
35+
function Dagger.move(from_proc::OSProc, to_proc::$PROC, x)
36+
adapt($T, x)
37+
end
38+
function Dagger.move(from_proc::$PROC, to_proc::OSProc, x)
39+
adapt(Array, x)
40+
end
1941
end
2042
end
2143

@@ -31,7 +53,7 @@ function __init__()
3153
@require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
3254
include("cu.jl")
3355
end
34-
@require ROCArrays="ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc" begin
56+
@require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin
3557
include("roc.jl")
3658
end
3759
end

src/cu.jl

Lines changed: 76 additions & 14 deletions
Original file line number · Diff line number · Diff line change
@@ -1,42 +1,104 @@
11
using .CUDA
22
import .CUDA: CuDevice, CuContext, devices, attribute
33

4+
using UUIDs
5+
46
export CuArrayDeviceProc
57

68
"Represents a single CUDA GPU device."
79
struct CuArrayDeviceProc <: Dagger.Processor
810
owner::Int
9-
#ctx::CuContext
1011
device::Int
12+
device_uuid::UUID
1113
end
1214
@gpuproc(CuArrayDeviceProc, CuArray)
13-
#= FIXME: DtoD copies and CUDA IPC
14-
function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
15-
if from === to
16-
return x
15+
Dagger.get_parent(proc::CuArrayDeviceProc) = Dagger.OSProc(proc.owner)
16+
17+
# function can_access(this, peer)
18+
# status = Ref{Cint}()
19+
# CUDA.cuDeviceCanAccessPeer(status, this, peer)
20+
# return status[] == 1
21+
# end
22+
23+
function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.Chunk{T}) where T<:CuArray
24+
if from == to
25+
# Same process and GPU, no change
26+
poolget(x.handle)
27+
elseif from.owner == to.owner
28+
# Same process but different GPUs, use DtoD copy
29+
from_arr = poolget(x.handle)
30+
to_arr = CUDA.device!(to.device) do
31+
CuArray{T,N}(undef, size)
32+
end
33+
copyto!(to_arr, from_arr)
34+
to_arr
35+
elseif Dagger.system_uuid(from.owner) == Dagger.system_uuid(to.owner)
36+
# Same node, we can use IPC
37+
ipc_handle, eT, shape = remotecall_fetch(from.owner, x.handle) do h
38+
arr = poolget(h)
39+
ipc_handle_ref = Ref{CUDA.CUipcMemHandle}()
40+
GC.@preserve arr begin
41+
CUDA.cuIpcGetMemHandle(ipc_handle_ref, pointer(arr))
42+
end
43+
(ipc_handle_ref[], eltype(arr), size(arr))
44+
end
45+
r_ptr = Ref{CUDA.CUdeviceptr}()
46+
CUDA.device!(from.device) do # FIXME: Assumes that device IDs are identical across processes
47+
CUDA.cuIpcOpenMemHandle(r_ptr, ipc_handle, CUDA.CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
48+
end
49+
ptr = Base.unsafe_convert(CUDA.CuPtr{eT}, r_ptr[])
50+
arr = unsafe_wrap(CuArray, ptr, shape; own=false)
51+
finalizer(arr) do arr
52+
CUDA.cuIpcCloseMemHandle(pointer(arr))
53+
end
54+
if from.device_uuid != to.device_uuid
55+
CUDA.device!(to.device) do
56+
to_arr = similar(arr)
57+
copyto!(to_arr, arr)
58+
to_arr
59+
end
60+
else
61+
arr
62+
end
1763
else
18-
error("Not implemented")
64+
# Different node, use DtoH, serialization, HtoD
65+
# TODO UCX
66+
CuArray(remotecall_fetch(from.owner, x.handle) do h
67+
Array(poolget(h))
68+
end)
1969
end
2070
end
21-
=#
71+
2272
function Dagger.execute!(proc::CuArrayDeviceProc, func, args...)
23-
fetch(Threads.@spawn begin
24-
task_local_storage(:processor, proc)
73+
tls = Dagger.get_tls()
74+
task = Threads.@spawn begin
75+
Dagger.set_tls!(tls)
2576
CUDA.device!(proc.device)
2677
CUDA.@sync func(args...)
27-
end)
78+
end
79+
try
80+
fetch(task)
81+
catch err
82+
@static if VERSION >= v"1.1"
83+
stk = Base.catch_stack(task)
84+
err, frames = stk[1]
85+
rethrow(CapturedException(err, frames))
86+
else
87+
rethrow(task.result)
88+
end
89+
end
2890
end
2991
Base.show(io::IO, proc::CuArrayDeviceProc) =
30-
print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device)")
92+
print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device), uuid $(proc.device_uuid)")
3193

3294
processor(::Val{:CUDA}) = CuArrayDeviceProc
3395
cancompute(::Val{:CUDA}) = CUDA.has_cuda()
3496
kernel_backend(::CuArrayDeviceProc) = CUDADevice()
3597

3698
if CUDA.has_cuda()
3799
for dev in devices()
38-
Dagger.add_callback!(proc -> begin
39-
return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev.handle)
40-
end)
100+
Dagger.add_processor_callback!("cuarray_device_$(dev.handle)") do
101+
CuArrayDeviceProc(Distributed.myid(), dev.handle, CUDA.uuid(dev))
102+
end
41103
end
42104
end

src/roc.jl

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -10,10 +10,10 @@ Dagger.execute!(proc::ROCArrayProc, func, args...) = func(args...)
1010

1111
processor(::Val{:ROC}) = ROCArrayProc
1212
cancompute(::Val{:ROC}) = AMDGPU.configured
13-
# FIXME: kernel_backend(::ROCDevice) = ROCArrayProc
13+
kernel_backend(::ROCDevice) = ROCArrayProc
1414

1515
if AMDGPU.configured
16-
Dagger.add_callback!(proc -> begin
17-
return ROCArrayProc(AMDGPU.get_default_agent())
18-
end)
16+
Dagger.add_processor_callback!("rocarray_device_default") do
17+
ROCArrayProc(AMDGPU.get_default_agent())
18+
end
1919
end

0 commit comments

Comments (0)