Skip to content

Commit c8941db

Browse files
authored
Initial support for Metal GPUs (#19)
Fix CPU tests
1 parent 2bfacdf commit c8941db

5 files changed

Lines changed: 74 additions & 5 deletions

File tree

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ julia = "1"
2424
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
2525
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
2626
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
27+
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
2728
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2829

2930
[targets]

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
**GPU integrations for Dagger.jl**
44

5-
DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CUDA.jl and AMDGPU.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayDeviceProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
5+
DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA, AMD, and Apple GPUs, via CUDA.jl, AMDGPU.jl, and Metal.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl/Metal.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayDeviceProc`/`DaggerGPU.ROCArrayProc`/`DaggerGPU.MtlArrayDeviceProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
66

77
DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide:
88
- The complete error message and backtrace

src/DaggerGPU.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ function __init__()
5656
@require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin
5757
include("roc.jl")
5858
end
59+
@require Metal="dde4c033-4e86-420c-a63e-0dd931031962" begin
60+
include("metal.jl")
61+
end
5962
end
6063

6164
end

src/metal.jl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
using .Metal
2+
import .Metal: MtlArray, MtlDevice
3+
4+
"""
    MtlArrayDeviceProc <: Dagger.Processor

Dagger processor representing a single Apple Metal GPU device owned by a
specific Distributed worker process.
"""
struct MtlArrayDeviceProc <: Dagger.Processor
    owner::Int         # ID of the Distributed worker that owns this processor
    device::MtlDevice  # the Metal GPU device this processor executes on
end
8+
9+
# Generate the standard DaggerGPU move/execute plumbing for this
# processor/array-type pair (same pattern as the CUDA and ROC backends).
@gpuproc(MtlArrayDeviceProc, MtlArray)
# The parent of a GPU processor is the OS process (worker) that owns it.
Dagger.get_parent(proc::MtlArrayDeviceProc) = Dagger.OSProc(proc.owner)
11+
12+
"""
    Dagger.execute!(proc::MtlArrayDeviceProc, func, args...)

Execute `func(args...)` on behalf of this Metal GPU processor.

The call runs on a freshly spawned task so that Dagger's task-local
state can be propagated into it; `Metal.@sync` blocks until all GPU
work enqueued by `func` has completed. If the task fails, the original
exception and its backtrace are recovered and rethrown as a
`CapturedException` so the caller sees the real error rather than a
bare task-failure wrapper.
"""
function Dagger.execute!(proc::MtlArrayDeviceProc, func, args...)
    # Capture Dagger's task-local state so it survives the task boundary.
    tls = Dagger.get_tls()
    task = Threads.@spawn begin
        Dagger.set_tls!(tls)
        # Wait for all GPU operations launched by `func` to finish.
        Metal.@sync func(args...)
    end

    try
        fetch(task)
    catch err
        @static if VERSION >= v"1.1"
            # NOTE(review): `Base.catch_stack` is internal and was renamed
            # to `current_exceptions` in Julia 1.7 — consider migrating once
            # the minimum supported Julia version allows.
            stk = Base.catch_stack(task)
            err, frames = stk[1]
            rethrow(CapturedException(err, frames))
        else
            # Pre-1.1 Julia has no exception stacks; rethrow the task's
            # stored failure result directly.
            rethrow(task.result)
        end
    end
end
31+
32+
# Human-readable display for a Metal processor, e.g.
# "MtlArrayDeviceProc on worker 1, device (Apple M1)".
function Base.show(io::IO, proc::MtlArrayDeviceProc)
    print(io, "MtlArrayDeviceProc on worker ", proc.owner,
              ", device (", proc.device.name, ")")
end
35+
36+
# Backend-registration hooks used by DaggerGPU's generic dispatch:
# map the `:Metal` backend symbol to its processor type.
processor(::Val{:Metal}) = MtlArrayDeviceProc
# Metal computation is possible whenever at least one GPU device is present.
cancompute(::Val{:Metal}) = length(Metal.devices()) >= 1
# NOTE(review): other backends appear to return a KernelAbstractions backend
# here; this returns the raw `MtlDevice` — confirm callers accept that.
kernel_backend(::MtlArrayDeviceProc) = Metal.current_device()
39+
40+
# At load time, register this worker's current Metal GPU with Dagger's
# scheduler — but only when a device is actually present.
if length(Metal.devices()) >= 1
    Dagger.add_processor_callback!("metal_device") do
        MtlArrayDeviceProc(Distributed.myid(), Metal.current_device())
    end
end

test/runtests.jl

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ using Test
33
addprocs(2, exeflags="--project")
44

55
@everywhere begin
6+
using CUDA, AMDGPU, Metal, KernelAbstractions
67
using Distributed, Dagger, DaggerGPU
7-
using CUDA, AMDGPU, KernelAbstractions
88
end
99
@everywhere begin
1010
function myfunc(X)
@@ -29,13 +29,15 @@ function generate_thunks()
2929
delayed((xs...)->[sum(xs)])(as...)
3030
end
3131

32-
@test DaggerGPU.cancompute(:CUDA) || DaggerGPU.cancompute(:ROC)
32+
@test DaggerGPU.cancompute(:CUDA) ||
33+
DaggerGPU.cancompute(:ROC) ||
34+
DaggerGPU.cancompute(:Metal)
3335

3436
@testset "CPU" begin
3537
@testset "KernelAbstractions" begin
3638
A = rand(Float32, 8)
37-
_A = collect(delayed(fill_thunk)(A, 2.3))
38-
@test all(_A .== 2.3)
39+
_A = collect(delayed(fill_thunk)(A, 2.3f0))
40+
@test all(_A .== 2.3f0)
3941
end
4042
end
4143

@@ -89,3 +91,22 @@ end
8991
=#
9092
end
9193
end
94+
95+
# End-to-end scheduling test for the Metal backend; skipped on machines
# without an Apple GPU.
@testset "Metal" begin
    if !DaggerGPU.cancompute(:Metal)
        @warn "No Metal devices available, skipping tests"
    else
        metalproc = DaggerGPU.processor(:Metal)
        b = generate_thunks()
        # Restrict the GPU-side thunks to the Metal processor.
        opts = Dagger.Sch.ThunkOptions(;proclist = [metalproc])
        # NOTE(review): `c_pre` is never collected or referenced below —
        # confirm whether it is intentional warm-up or dead code.
        c_pre = delayed(myfunc; options = opts)(b)
        c = delayed(sum; options = opts)(b)

        # Move the result back to a CPU thread processor for verification.
        opts = Dagger.Sch.ThunkOptions(;proclist = [Dagger.ThreadProc])
        d = delayed(identity; options = opts)(c)
        @test collect(d) == 20

        # It seems KernelAbstractions does not support Metal.jl.
        @test_skip "KernelAbstractions"
    end
end

0 commit comments

Comments
 (0)