Skip to content

Commit 9784506

Browse files
Support MLFlow Handler for single process/multi task environment (#5728)
Signed-off-by: Sachidanand Alle <sachidanand.alle@gmail.com> The current MLFlow Handler fails when you invoke 2 train requests back to back with different URIs, or multiple train requests within the same process. This is mainly due to the use of global state, where the active experiment and active run are saved globally and shared across invocations. This will cause conflicts between 2 invocations with 2 different URIs. Fixes ------ - Use MLFlow Client to create experiments/runs instead of global functions. - Save the current run through the lifecycle of the handler. If any handler has the same experiment name and same run name, all the metrics will be merged as part of the same run (e.g. train and validation handler). - If the run name is not provided (fall back on default), then the last active run within the same experiment (sorted based on start time) is used for adding the metrics. The above two conditions will help create behavior similar to using `mlflow.active_run()` Verified --------- - Running single and multi gpu training on bundles - spleen_ct_segmentation_v0.1.0 - spleen_deepedit_annotation_v0.1.0 - swin_unetr_btcv_segmentation_v0.1.0 - Running Training workflows for both single and multi gpu in MONAI Label - Verified against running a shared/single tracking URI (where all the experiments get saved) - Verified against individual eval/mlruns per bundle/workflow > I suggest the original owner of this handler verify/test all the behaviors that are currently supported. Error Description --------- Error stack when you run two train workflows within the same process (simply one after another). 
``` [2022-12-13 21:08:11,095] [4047823] [MainThread] [ERROR] (uvicorn.error:369) - Exception in ASGI application Traceback (most recent call last): File "/localhome/sachi/.local/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 366, in run_asgi result = await app(self.scope, self.receive, self.send) File "/localhome/sachi/.local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 75, in __call__ return await self.app(scope, receive, send) File "/localhome/sachi/.local/lib/python3.10/site-packages/fastapi/applications.py", line 199, in __call__ await super().__call__(scope, receive, send) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/applications.py", line 112, in __call__ await self.middleware_stack(scope, receive, send) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 181, in __call__ raise exc from None File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 159, in __call__ await self.app(scope, receive, _send) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 78, in __call__ await self.app(scope, receive, send) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/exceptions.py", line 82, in __call__ raise exc from None File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/exceptions.py", line 71, in __call__ await self.app(scope, receive, sender) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/routing.py", line 580, in __call__ await route.handle(scope, receive, send) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/routing.py", line 241, in handle await self.app(scope, receive, send) File "/localhome/sachi/.local/lib/python3.10/site-packages/starlette/routing.py", line 52, in app response = await func(request) File "/localhome/sachi/.local/lib/python3.10/site-packages/fastapi/routing.py", line 
219, in app raw_response = await run_endpoint_function( File "/localhome/sachi/.local/lib/python3.10/site-packages/fastapi/routing.py", line 152, in run_endpoint_function return await dependant.call(**values) File "/localhome/sachi/Projects/monailabel/monailabel/endpoints/train.py", line 96, in api_run_model return run_model(model, params, run_sync, enqueue) File "/localhome/sachi/Projects/monailabel/monailabel/endpoints/train.py", line 55, in run_model res, detail = AsyncTask.run("train", request=request, params=params, force_sync=run_sync, enqueue=enqueue) File "/localhome/sachi/Projects/monailabel/monailabel/utils/async_tasks/task.py", line 43, in run return instance.train(request), None File "/localhome/sachi/Projects/monailabel/monailabel/interfaces/app.py", line 422, in train result = task(request, self.datastore()) File "/localhome/sachi/Projects/monailabel/monailabel/tasks/train/basic_train.py", line 458, in __call__ res = self.train(0, world_size, req, datalist) File "/localhome/sachi/Projects/monailabel/monailabel/tasks/train/basic_train.py", line 545, in train context.trainer.run() File "/localhome/sachi/Projects/MONAI/monai/engines/trainer.py", line 53, in run super().run() File "/localhome/sachi/Projects/MONAI/monai/engines/workflow.py", line 281, in run super().run(data=self.data_loader, max_epochs=self.state.max_epochs) File "/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 892, in run return self._internal_run() File "/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 935, in _internal_run return next(self._internal_run_generator) File "/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 993, in _internal_run_as_gen self._handle_exception(e) File "/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 636, in _handle_exception self._fire_event(Events.EXCEPTION_RAISED, e) File 
"/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 425, in _fire_event func(*first, *(event_args + others), **kwargs) File "/localhome/sachi/Projects/MONAI/monai/handlers/stats_handler.py", line 181, in exception_raised raise e File "/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 946, in _internal_run_as_gen self._fire_event(Events.STARTED) File "/localhome/sachi/.local/lib/python3.10/site-packages/ignite/engine/engine.py", line 425, in _fire_event func(*first, *(event_args + others), **kwargs) File "/localhome/sachi/Projects/MONAI/monai/handlers/mlflow_handler.py", line 183, in start self._delete_exist_param_in_dict(attrs) File "/localhome/sachi/Projects/MONAI/monai/handlers/mlflow_handler.py", line 141, in _delete_exist_param_in_dict log_data = self.client.get_run(cur_run.info.run_id).data File "/localhome/sachi/.local/lib/python3.10/site-packages/mlflow/tracking/client.py", line 150, in get_run return self._tracking_client.get_run(run_id) File "/localhome/sachi/.local/lib/python3.10/site-packages/mlflow/tracking/_tracking_service/client.py", line 72, in get_run return self.store.get_run(run_id) File "/localhome/sachi/.local/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 623, in get_run run_info = self._get_run_info(run_id) File "/localhome/sachi/.local/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 646, in _get_run_info raise MlflowException( mlflow.exceptions.MlflowException: Run '1765aea084a3417586d052d9d8240039' not found FAILED [ 72%] ``` ### Types of changes <!--- Put an `x` in all the boxes that apply, and remove the not applicable items --> - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. 
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. Signed-off-by: Sachidanand Alle <sachidanand.alle@gmail.com>
1 parent b2359b7 commit 9784506

1 file changed

Lines changed: 59 additions & 19 deletions

File tree

monai/handlers/mlflow_handler.py

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
Events, _ = optional_import("ignite.engine", IgniteInfo.OPT_IMPORT_VERSION, min_version, "Events")
2323
mlflow, _ = optional_import("mlflow")
24+
mlflow.entities, _ = optional_import("mlflow.entities")
2425

2526
if TYPE_CHECKING:
2627
from ignite.engine import Engine
@@ -52,7 +53,7 @@ class MLFlowHandler:
5253
Args:
5354
tracking_uri: connects to a tracking URI. can also set the `MLFLOW_TRACKING_URI` environment
5455
variable to have MLflow find a URI from there. in both cases, the URI can either be
55-
a HTTP/HTTPS URI for a remote server, a database connection string, or a local path
56+
an HTTP/HTTPS URI for a remote server, a database connection string, or a local path
5657
to log data to a directory. The URI defaults to path `mlruns`.
5758
for more details: https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.set_tracking_uri.
5859
iteration_log: whether to log data to MLFlow when iteration completed, default to `True`.
@@ -109,9 +110,6 @@ def __init__(
109110
optimizer_param_names: Union[str, Sequence[str]] = "lr",
110111
close_on_complete: bool = False,
111112
) -> None:
112-
if tracking_uri is not None:
113-
mlflow.set_tracking_uri(tracking_uri)
114-
115113
self.iteration_log = iteration_log
116114
self.epoch_log = epoch_log
117115
self.epoch_logger = epoch_logger
@@ -125,8 +123,10 @@ def __init__(
125123
self.experiment_param = experiment_param
126124
self.artifacts = ensure_tuple(artifacts)
127125
self.optimizer_param_names = ensure_tuple(optimizer_param_names)
128-
self.client = mlflow.MlflowClient()
126+
self.client = mlflow.MlflowClient(tracking_uri=tracking_uri if tracking_uri else None)
129127
self.close_on_complete = close_on_complete
128+
self.experiment = None
129+
self.cur_run = None
130130

131131
def _delete_exist_param_in_dict(self, param_dict: Dict) -> None:
132132
"""
@@ -135,9 +135,11 @@ def _delete_exist_param_in_dict(self, param_dict: Dict) -> None:
135135
Args:
136136
param_dict: parameter dict to be logged to mlflow.
137137
"""
138+
if self.cur_run is None:
139+
return
140+
138141
key_list = list(param_dict.keys())
139-
cur_run = mlflow.active_run()
140-
log_data = self.client.get_run(cur_run.info.run_id).data
142+
log_data = self.client.get_run(self.cur_run.info.run_id).data
141143
log_param_dict = log_data.params
142144
for key in key_list:
143145
if key in log_param_dict:
@@ -167,17 +169,52 @@ def start(self, engine: Engine) -> None:
167169
Check MLFlow status and start if not active.
168170
169171
"""
170-
mlflow.set_experiment(self.experiment_name)
171-
if mlflow.active_run() is None:
172+
self._set_experiment()
173+
if not self.experiment:
174+
raise ValueError(f"Failed to set experiment '{self.experiment_name}' as the active experiment")
175+
176+
if not self.cur_run:
172177
run_name = f"run_{time.strftime('%Y%m%d_%H%M%S')}" if self.run_name is None else self.run_name
173-
mlflow.start_run(run_name=run_name)
178+
runs = self.client.search_runs(self.experiment.experiment_id)
179+
runs = [r for r in runs if r.info.run_name == run_name or not self.run_name]
180+
if runs:
181+
self.cur_run = self.client.get_run(runs[-1].info.run_id) # pick latest active run
182+
else:
183+
self.cur_run = self.client.create_run(experiment_id=self.experiment.experiment_id, run_name=run_name)
174184

175185
if self.experiment_param:
176-
mlflow.log_params(self.experiment_param)
186+
self._log_params(self.experiment_param)
177187

178188
attrs = {attr: getattr(engine.state, attr, None) for attr in self.default_tracking_params}
179189
self._delete_exist_param_in_dict(attrs)
180-
mlflow.log_params(attrs)
190+
self._log_params(attrs)
191+
192+
def _set_experiment(self):
193+
experiment = self.experiment
194+
if not experiment:
195+
experiment = self.client.get_experiment_by_name(self.experiment_name)
196+
if not experiment:
197+
experiment_id = self.client.create_experiment(self.experiment_name)
198+
experiment = self.client.get_experiment(experiment_id)
199+
200+
if experiment.lifecycle_stage != mlflow.entities.LifecycleStage.ACTIVE:
201+
raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment")
202+
self.experiment = experiment
203+
204+
def _log_params(self, params: Dict[str, Any]) -> None:
205+
if not self.cur_run:
206+
raise ValueError("Current Run is not Active to log params")
207+
params_arr = [mlflow.entities.Param(key, str(value)) for key, value in params.items()]
208+
self.client.log_batch(run_id=self.cur_run.info.run_id, metrics=[], params=params_arr, tags=[])
209+
210+
def _log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
211+
if not self.cur_run:
212+
raise ValueError("Current Run is not Active to log metrics")
213+
214+
run_id = self.cur_run.info.run_id
215+
timestamp = int(time.time() * 1000)
216+
metrics_arr = [mlflow.entities.Metric(key, value, timestamp, step or 0) for key, value in metrics.items()]
217+
self.client.log_batch(run_id=run_id, metrics=metrics_arr, params=[], tags=[])
181218

182219
def _parse_artifacts(self):
183220
"""
@@ -202,17 +239,20 @@ def complete(self) -> None:
202239
"""
203240
Handler for train or validation/evaluation completed Event.
204241
"""
205-
if self.artifacts:
242+
if self.artifacts and self.cur_run:
206243
artifact_list = self._parse_artifacts()
207244
for artifact in artifact_list:
208-
mlflow.log_artifact(artifact)
245+
self.client.log_artifact(self.cur_run.info.run_id, artifact)
209246

210247
def close(self) -> None:
211248
"""
212249
Stop current running logger of MLFlow.
213250
214251
"""
215-
mlflow.end_run()
252+
if self.cur_run:
253+
status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED)
254+
self.client.set_terminated(self.cur_run.info.run_id, status)
255+
self.cur_run = None
216256

217257
def epoch_completed(self, engine: Engine) -> None:
218258
"""
@@ -257,11 +297,11 @@ def _default_epoch_log(self, engine: Engine) -> None:
257297
return
258298

259299
current_epoch = self.global_epoch_transform(engine.state.epoch)
260-
mlflow.log_metrics(log_dict, step=current_epoch)
300+
self._log_metrics(log_dict, step=current_epoch)
261301

262302
if self.state_attributes is not None:
263303
attrs = {attr: getattr(engine.state, attr, None) for attr in self.state_attributes}
264-
mlflow.log_metrics(attrs, step=current_epoch)
304+
self._log_metrics(attrs, step=current_epoch)
265305

266306
def _default_iteration_log(self, engine: Engine) -> None:
267307
"""
@@ -281,7 +321,7 @@ def _default_iteration_log(self, engine: Engine) -> None:
281321
if not isinstance(loss, dict):
282322
loss = {self.tag_name: loss.item() if isinstance(loss, torch.Tensor) else loss}
283323

284-
mlflow.log_metrics(loss, step=engine.state.iteration)
324+
self._log_metrics(loss, step=engine.state.iteration)
285325

286326
# If there is optimizer attr in engine, then record parameters specified in init function.
287327
if hasattr(engine, "optimizer"):
@@ -291,4 +331,4 @@ def _default_iteration_log(self, engine: Engine) -> None:
291331
f"{param_name} group_{i}": float(param_group[param_name])
292332
for i, param_group in enumerate(cur_optimizer.param_groups)
293333
}
294-
mlflow.log_metrics(params, step=engine.state.iteration)
334+
self._log_metrics(params, step=engine.state.iteration)

0 commit comments

Comments
 (0)