@@ -16,6 +16,10 @@ The default implementations of the above (on a `AbstractWorkerPool`) require fie
1616 - `channel::Channel{Int}`
1717 - `workers::Set{Int}`
1818where `channel` contains free worker pids and `workers` is the set of all workers associated with this pool.
19+
20+ The default implementations of the above handle dead workers by removing them
21+ from the pool. Because workers can die at any time, the results of functions
22+ like `length` or `isready` may be stale once returned; relying on them is not thread-safe.
1923"""
2024abstract type AbstractWorkerPool end
2125
@@ -71,7 +75,43 @@ deserialize(S::AbstractSerializer, t::Type{T}) where {T<:WorkerPool} = T(deseria
7175
# Associate worker `w` with `pool`: record it in the set of all workers and queue
# its pid on the channel of free workers. Returns `pool`.
function wp_local_push!(pool::AbstractWorkerPool, w::Int)
    push!(pool.workers, w)
    put!(pool.channel, w)
    return pool
end
# Number of workers currently recorded in `pool.workers`.
function wp_local_length(pool::AbstractWorkerPool)
    return length(pool.workers)
end
74- wp_local_isready (pool:: AbstractWorkerPool ) = isready (pool. channel)
78+
# Return `true` when `id_in_procs(worker)` holds. Otherwise drop `worker` from
# `pool.workers` and return `false`.
function check_valid_worker!(pool::AbstractWorkerPool, worker)
    id_in_procs(worker) && return true

    # The worker set has no lock of its own, so the Channel's lock is borrowed to
    # make the removal thread-safe.
    @lock pool.channel delete!(pool.workers, worker)
    return false
end
89+
# `true` only when `pool` holds no workers and is the default worker pool itself.
# The identity check is evaluated only after the length check has passed.
function default_and_empty(pool::AbstractWorkerPool)
    has_no_workers = length(pool) == 0
    return has_no_workers && pool === default_worker_pool()
end
93+
# Report whether a `take!()` on `pool` could proceed without blocking. Workers
# queued on the channel that fail `check_valid_worker!` are discarded on the way.
function wp_local_isready(pool::AbstractWorkerPool)
    # An empty default pool does not block take!(), so it counts as ready.
    default_and_empty(pool) && return true

    # Hold the channel lock for the whole scan so that nobody else can touch the
    # channel in the meantime; without it take!() could end up blocking.
    @lock pool.channel begin
        found_valid = false
        while !found_valid && isready(pool.channel)
            candidate = take!(pool.channel)
            found_valid = check_valid_worker!(pool, candidate)
            # A valid worker goes straight back onto the channel and ends the scan.
            found_valid && put!(pool.channel, candidate)
        end

        return isready(pool.channel)
    end
end
75115
76116function wp_local_put! (pool:: AbstractWorkerPool , w:: Int )
77117 # In case of default_worker_pool, the master is implicitly considered a worker, i.e.,
@@ -101,29 +141,39 @@ function wp_local_take!(pool::AbstractWorkerPool)
101141 # Find an active worker
102142 worker = 0
103143 while true
104- if length (pool) == 0
105- if pool === default_worker_pool ()
106- # No workers, the master process is used as a worker
107- worker = 1
108- break
109- else
110- throw (ErrorException (" No active worker available in pool" ))
111- end
144+ if default_and_empty (pool)
145+ # No workers, the master process is used as a worker
146+ worker = 1
147+ break
148+ elseif length (pool) == 0
149+ throw (ErrorException (" No active worker available in pool" ))
112150 end
113151
114152 worker = take! (pool. channel)
115- if id_in_procs ( worker)
153+ if check_valid_worker! (pool, worker)
116154 break
117- else
118- delete! (pool. workers, worker) # Remove invalid worker from pool
119155 end
120156 end
121157 return worker
122158end
123159
# Block until `pool` has a worker that passes `check_valid_worker!`, then return
# `nothing`. Workers that fail the check are dropped while waiting.
function wp_local_wait(pool::AbstractWorkerPool)
    # An empty default pool does not block take!(), so there is nothing to wait for.
    default_and_empty(pool) && return nothing

    # The channel is read directly rather than through take!(::AbstractWorkerPool),
    # which throws on an empty pool; take!() on the channel instead waits for as
    # long as it takes until a worker becomes available.
    found_valid = false
    while !found_valid
        candidate = take!(pool.channel)
        found_valid = check_valid_worker!(pool, candidate)
        # Hand a valid worker back so that it remains available to callers.
        found_valid && put!(pool.channel, candidate)
    end
    return nothing
end
128178
129179function remotecall_pool (rc_f, f, pool:: AbstractWorkerPool , args... ; kwargs... )
0 commit comments