@@ -891,6 +891,8 @@ const HDR_COOKIE_LEN = 16
# Lockable-wrapped registries. `map_pid_wrkr` is keyed by `Int` with
# `Worker`/`LocalProcess` values (presumably the worker pid -> worker object
# mapping; confirm against the code that populates it).
891891const map_pid_wrkr = Lockable (Dict {Int, Union{Worker, LocalProcess}} ())
892892const map_sock_wrkr = Lockable (IdDict ())
893893const map_del_wrkr = Lockable (Set {Int} ())
# Scoped value that `deregister_worker` binds to the exiting worker's pid while
# the worker-exited callbacks run. `getstatus` reads it so that it can accept a
# pid that is no longer in `procs()`. The default is -1.
894+ const _exited_callback_pid = ScopedValue {Int} (- 1 )
# Per-worker statuses. `setstatus!` stores a `Dict{Module, Any}` under each pid,
# and `deregister_worker` deletes the pid's entry after the worker-exited
# callbacks have been run.
895+ const map_pid_statuses = Lockable (Dict {Int, Any} ())
# Callback registries: arbitrary keys (`Any`) mapped to callables.
894896const worker_starting_callbacks = Dict {Any, Base.Callable} ()
895897const worker_started_callbacks = Dict {Any, Base.Callable} ()
896898const worker_exiting_callbacks = Dict {Any, Base.Callable} ()
@@ -1042,9 +1044,9 @@ segfaulting etc). Chooses and returns a unique key for the callback if `key` is
10421044not specified.
10431045
10441046The callback will be called with the worker ID and the final
1045- `Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an
1046- enum, a value of `WorkerState_terminated` means a graceful exit and a value of
1047- `WorkerState_exterminated` means the worker died unexpectedly.
1047+ `Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is
1048+ an enum, a value of `WorkerState_terminated` means a graceful exit and a value
1049+ of `WorkerState_exterminated` means the worker died unexpectedly.
10481050
10491051All worker-exited callbacks will be executed concurrently. If a callback throws
10501052an exception it will be caught and printed.
@@ -1238,6 +1240,112 @@ Identical to [`workers()`](@ref) except that the current worker is filtered out.
12381240"""
function other_workers()
    # Resolve our own id once, then drop it from the worker list.
    self_id = myid()
    return filter(pid -> pid != self_id, workers())
end
12401242
"""
    @setstatus! x
    @setstatus! x pid

Record `x` as the status of worker `pid` on behalf of the module the macro is
invoked from. When `pid` is omitted the current worker is used. Any
serializable object is accepted for `x`, but small values are recommended so
that they stay cheap to send over a network.

Because statuses are keyed by the calling `Module`, several libraries can each
keep their own status for the same worker without overwriting one another.

Typical uses are checking what a worker is busy with at a given moment, and
(together with a worker-exited callback, see
[`add_worker_exited_callback`](@ref)) finding out what a worker was last doing
before it died: statuses are still readable inside worker-exited callbacks,
before the worker is fully deregistered.

# Examples
```julia-repl
julia> DistributedNext.@setstatus! "working on dataset 42"
"working on dataset 42"

julia> DistributedNext.@getstatus
"working on dataset 42"
```

See also [`setstatus!`](@ref), the function form that takes the module key explicitly.
"""
macro setstatus!(x)
    # `x` is escaped so it is evaluated in the caller's scope; the calling
    # module object itself is spliced in as the status key.
    return Expr(:call, :setstatus!, esc(x), __module__)
end

macro setstatus!(x, pid)
    # Same as the one-argument form, with the caller's `pid` expression
    # forwarded as the target worker.
    return Expr(:call, :setstatus!, esc(x), __module__, esc(pid))
end
1280+
"""
    setstatus!(x, mod::Module, pid::Int=myid())

Function form of [`@setstatus!`](@ref). Sets the status for module `mod` on
worker `pid` to `x`.

The statuses are stored on process 1. When called from any other process the
request is forwarded to process 1 for the same `pid`, so the status of the
requested worker is the one that gets updated, not the caller's.

Throws an `ArgumentError` if `pid` is not a known process.
"""
function setstatus!(x, mod::Module, pid::Int=myid())
    if !id_in_procs(pid)
        throw(ArgumentError(" Worker $(pid) does not exist, cannot set its status"))
    end

    if myid() == 1
        @lock map_pid_statuses begin
            # Each pid gets its own per-module table, created on first use.
            statuses = get!(map_pid_statuses[], pid, Dict{Module, Any}())
            statuses[mod] = x
        end
    else
        # Forward the requested `pid`, not `myid()`: otherwise an explicit
        # `pid` argument would be ignored whenever this runs off the master.
        remotecall_fetch(setstatus!, 1, x, mod, pid)
    end
end
1301+
# Look up the status stored for (`pid`, `mod`) in `map_pid_statuses` while
# holding its lock. Yields `nothing` when either the pid or the module has no
# entry.
function _getstatus(pid, mod)
    @lock map_pid_statuses begin
        per_module = get(map_pid_statuses[], pid, nothing)
        if per_module === nothing
            nothing
        else
            get(per_module, mod, nothing)
        end
    end
end
1308+
"""
    @getstatus
    @getstatus pid

Return the status that the calling module has recorded for worker `pid`. When
`pid` is omitted the current worker is used. The result is `nothing` if no
status was ever set explicitly with [`@setstatus!`](@ref).

See also [`getstatus`](@ref) for the function form.
"""
macro getstatus()
    # The calling module object is spliced in as the lookup key.
    return Expr(:call, :getstatus, __module__)
end

macro getstatus(pid)
    # `pid` is escaped so it is evaluated in the caller's scope.
    return Expr(:call, :getstatus, __module__, esc(pid))
end
1327+
"""
    getstatus(mod::Module, pid::Int=myid())

Function form of [`@getstatus`](@ref). Looks up the status recorded for module
`mod` on worker `pid`, returning `nothing` if none was set.

Throws an `ArgumentError` if `pid` is neither a known process nor the worker
whose worker-exited callbacks are currently running.
"""
function getstatus(mod::Module, pid::Int=myid())
    # Worker-exited callbacks may ask for the status of the worker that just
    # exited, which by then is no longer in procs(). Such a pid is accepted
    # when it matches the one bound to `_exited_callback_pid`.
    pid_is_valid = id_in_procs(pid) || _exited_callback_pid[] == pid
    if !pid_is_valid
        throw(ArgumentError(" Worker $(pid) does not exist, cannot get its status"))
    end

    # Process 1 answers from its own table; every other process asks process 1.
    return myid() == 1 ? _getstatus(pid, mod) : remotecall_fetch(getstatus, 1, mod, pid)
end
1348+
12411349function cluster_mgmt_from_master_check ()
12421350 if myid () != 1
12431351 throw (ErrorException (" Only process 1 can add and remove workers" ))
@@ -1463,15 +1571,22 @@ function deregister_worker(pg, pid)
14631571 end
14641572 end
14651573
1466- # Call callbacks on the master
14671574 if myid () == 1
1468- for (name, callback) in worker_exited_callbacks
1469- try
1470- callback (pid, w. state)
1471- catch ex
1472- @error " Error when running worker-exited callback '$(name) '" exception= (ex, catch_backtrace ())
1473- end
1575+ params = default_addprocs_params (w. manager)
1576+ warning_interval = params[:callback_warning_interval ]
1577+
1578+ # Call callbacks on the master, with the scoped value set so that
1579+ # getstatus() can be called for the exiting worker without failing the
1580+ # pid check. We go to some effort to make sure this works after
1581+ # deregistering the worker because if it's called beforehand the worker
1582+ # will incorrectly be shown in e.g. procs().
1583+ @with _exited_callback_pid => pid begin
1584+ _run_callbacks_concurrently (" worker-exited" , worker_exited_callbacks,
1585+ warning_interval, [(pid, w. state)]; catch_exceptions= true )
14741586 end
1587+
1588+ # Delete its statuses
1589+ @lock map_pid_statuses delete! (map_pid_statuses[], pid)
14751590 end
14761591
14771592 return
0 commit comments