Skip to content

Commit 3052ded

Browse files
committed
Update to Dagger 0.9
Change CuArrays to CUDA, ROCArrays to AMDGPU. Don't let GPU arrays be scheduled on the CPU. Split CuArrayProc into Device and SM processors.
1 parent b1ca6d9 commit 3052ded

10 files changed

Lines changed: 147 additions & 202 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Manifest.toml

Manifest.toml

Lines changed: 0 additions & 128 deletions
This file was deleted.

Project.toml

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,22 +1,24 @@
11
name = "DaggerGPU"
22
uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
33
authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>"]
4-
version = "0.1.0"
4+
version = "0.1.1"
55

66
[deps]
7+
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
78
Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
9+
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
810
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
911

1012
[compat]
11-
Dagger = "0.8"
12-
Requires = "1.0"
13+
Dagger = "0.9"
14+
Requires = "1"
1315
julia = "1"
1416

1517
[extras]
16-
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
18+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
19+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
1720
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
18-
ROCArrays = "ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc"
1921
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2022

2123
[targets]
22-
test = ["CuArrays", "Distributed", "ROCArrays", "Test"]
24+
test = ["AMDGPU", "CUDA", "Distributed", "Test"]

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,10 @@
22

33
**GPU integrations for Dagger.jl**
44

5-
DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CuArrays.jl and ROCArrays.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CuArrays.jl/ROCArrays.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
5+
DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CUDA.jl and AMDGPU.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
66

77
DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide your:
88
- The complete error message and backtrace
99
- Julia version
1010
- GPU vendor and model
11-
- CuArrays/ROCArrays version(s)
11+
- CUDA/AMDGPU version(s)

src/DaggerGPU.jl

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,20 @@
11
module DaggerGPU
22

3-
using Dagger, Requires
3+
using Dagger, Requires, Adapt
4+
using Distributed
45

56
macro gpuproc(PROC, T)
67
quote
7-
Dagger.iscompatible(proc::$PROC, opts, x::AbstractArray{AT}) where AT =
8-
isbitstype(AT)
9-
Dagger.move(ctx, from_proc::OSProc, to_proc::$PROC, x::AbstractArray) =
10-
$T(x)
11-
Dagger.move(ctx, from_proc::$PROC, to_proc::OSProc, x) = x
12-
Dagger.move(ctx, from_proc::$PROC, to_proc::OSProc, x::$T) =
13-
collect(x)
14-
Dagger.execute!(proc::$PROC, func, args...) = func(args...)
8+
# Assume that we can run anything
9+
Dagger.iscompatible_func(proc::$PROC, opts, f) = true
10+
Dagger.iscompatible_arg(proc::$PROC, opts, x) = true
11+
12+
# CPUs shouldn't process our array type
13+
Dagger.iscompatible_arg(proc::Dagger.ThreadProc, opts, x::$T) = false
14+
15+
# Adapt to/from the appropriate type
16+
Dagger.move(ctx, from_proc::OSProc, to_proc::$PROC, x) = adapt($T, x)
17+
Dagger.move(ctx, from_proc::$PROC, to_proc::OSProc, x) = adapt(Array, x)
1518
end
1619
end
1720

@@ -21,11 +24,11 @@ cancompute(kind::Symbol) = cancompute(Val(kind))
2124
cancompute(::Val) = false
2225

2326
function __init__()
24-
@require CuArrays="3a865a2d-5b23-5a0f-bc46-62713ec82fae" begin
25-
include("cuarrays.jl")
27+
@require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
28+
include("cu.jl")
2629
end
2730
@require ROCArrays="ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc" begin
28-
include("rocarrays.jl")
31+
include("roc.jl")
2932
end
3033
end
3134

src/cu.jl

Lines changed: 65 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,65 @@
1+
using .CUDA
2+
import .CUDA: CuDevice, CuContext, devices, attribute
3+
4+
export CuArrayProc, CuArrayDeviceProc, CuArraySMProc
5+
6+
"Represents a single CUDA GPU device."
7+
struct CuArrayDeviceProc <: Dagger.Processor
8+
owner::Int
9+
#ctx::CuContext
10+
device::CuDevice
11+
end
12+
@gpuproc(CuArrayDeviceProc, CuArray)
13+
const CuArrayProc = CuArrayDeviceProc
14+
#= FIXME: CUDA IPC
15+
function Dagger.move(ctx, from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
16+
if from === to
17+
return x
18+
else
19+
error("Not implemented")
20+
end
21+
end
22+
=#
23+
function Dagger.execute!(proc::CuArrayDeviceProc, func, args...)
24+
#CUDA.context!(proc.ctx)
25+
CUDA.@sync func(args...)
26+
end
27+
28+
"Represents a single CUDA GPU Streaming Multiprocessor."
29+
struct CuArraySMProc <: Dagger.Processor
30+
owner::Int
31+
#ctx::CuContext
32+
device::CuDevice
33+
sm::Int
34+
end
35+
@gpuproc(CuArraySMProc, CuArray)
36+
#= FIXME: CUDA IPC
37+
function Dagger.move(ctx, from::CuArraySMProc, to::CuArraySMProc, x)
38+
if from.device === to.device
39+
return x
40+
else
41+
error("Not implemented")
42+
end
43+
end
44+
=#
45+
function Dagger.execute!(proc::CuArraySMProc, func, args...)
46+
#CUDA.context!(proc.ctx)
47+
CUDA.@sync func(args...)
48+
end
49+
50+
processor(::Val{:CUDA}) = CuArrayDeviceProc
51+
cancompute(::Val{:CUDA}) = CUDA.has_cuda()
52+
# TODO: CuArraySMProc
53+
54+
if CUDA.has_cuda()
55+
for dev in devices()
56+
Dagger.add_callback!(proc -> begin
57+
return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev)
58+
end)
59+
for i in 1:attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
60+
Dagger.add_callback!(proc -> begin
61+
return CuArraySMProc(Distributed.myid(), #=CuContext(dev),=# dev, i)
62+
end)
63+
end
64+
end
65+
end

src/cuarrays.jl

Lines changed: 0 additions & 17 deletions
This file was deleted.

src/roc.jl

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,18 @@
1+
using .AMDGPU
2+
3+
struct ROCArrayProc <: Dagger.Processor
4+
device
5+
end
6+
7+
@gpuproc(ROCArrayProc, ROCArray)
8+
9+
Dagger.execute!(proc::ROCArrayProc, func, args...) = func(args...)
10+
11+
processor(::Val{:ROC}) = ROCArrayProc
12+
cancompute(::Val{:ROC}) = AMDGPU.configured
13+
14+
if AMDGPU.configured
15+
Dagger.add_callback!(proc -> begin
16+
return ROCArrayProc(AMDGPU.get_default_agent())
17+
end)
18+
end

src/rocarrays.jl

Lines changed: 0 additions & 17 deletions
This file was deleted.

0 commit comments

Comments (0)