Skip to content

Commit e989dc7

Browse files
dongyang0122Dong Yangpre-commit-ci[bot]dongywyli
authored
Enable auto3dseg maximize gpu utils (#5733)
### Description Enable auto3dseg maximize gpu utilization based on gpus in higher API levels. ### Types of changes <!--- Put an `x` in all the boxes that apply, and remove the not applicable items --> - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. Signed-off-by: Dong Yang <dongy@dongy-mlt.client.nvidia.com> Signed-off-by: dongy <dongy@nvidia.com> Signed-off-by: Wenqi Li <wenqil@nvidia.com> Co-authored-by: Dong Yang <dongy@dongy-mlt.client.nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: dongy <dongy@nvidia.com> Co-authored-by: Wenqi Li <wenqil@nvidia.com> Co-authored-by: Wenqi Li <831580+wyli@users.noreply.github.com>
1 parent 9784506 commit e989dc7

6 files changed

Lines changed: 275 additions & 14 deletions

File tree

monai/apps/auto3dseg/auto_runner.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,9 @@ def __init__(
278278
self.set_ensemble_method(self.ensemble_method_name)
279279
self.set_num_fold(num_fold=self.num_fold)
280280

281+
self.gpu_customization = False
282+
self.gpu_customization_specs: Dict[str, Any] = {}
283+
281284
# hpo
282285
if hpo_backend.lower() != "nni":
283286
raise NotImplementedError("HPOGen backend only supports NNI")
@@ -332,6 +335,43 @@ def export_cache(self, **kwargs):
332335
self.cache, self.cache_filename, fmt="yaml", default_flow_style=None, sort_keys=False
333336
)
334337

338+
def set_gpu_customization(
339+
self, gpu_customization: bool = False, gpu_customization_specs: Optional[Dict[str, Any]] = None
340+
):
341+
"""
342+
Set options for GPU-based parameter customization/optimization.
343+
344+
Args:
345+
gpu_customization: the switch to determine whether to automatically customize/optimize bundle script/config
346+
parameters for each bundleAlgo based on gpus. Custom parameters are obtained through dummy
347+
training to simulate the actual model training process and hyperparameter optimization (HPO)
348+
experiments.
349+
gpu_customization_specs (optional): the dictionary to enable users to overwrite the HPO settings. Users can
350+
overwrite part of variables as follows or all of them. The structure is as follows.
351+
352+
.. code-block:: python
353+
354+
gpu_customization_specs = {
355+
'ALGO': {
356+
'num_trials': 6,
357+
'range_num_images_per_batch': [1, 20],
358+
'range_num_sw_batch_size': [1, 20]
359+
}
360+
}
361+
362+
ALGO: the name of the algorithm. It could be one of the algorithm names (e.g., 'dints') or 'universal', which
363+
would apply changes to all algorithms. Possible options are
364+
365+
- {``"universal"``, ``"dints"``, ``"segresnet"``, ``"segresnet2d"``, ``"swinunetr"``}.
366+
367+
num_trials: the number of HPO trials/experiments to run.
368+
range_num_images_per_batch: the range of number of images per mini-batch.
369+
range_num_sw_batch_size: the range of batch size in sliding-window inferer.
370+
"""
371+
self.gpu_customization = gpu_customization
372+
if gpu_customization_specs is not None:
373+
self.gpu_customization_specs = gpu_customization_specs
374+
335375
def set_num_fold(self, num_fold: int = 5):
336376
"""
337377
Set the number of cross validation folds for all algos.
@@ -614,7 +654,15 @@ def run(self):
614654
data_src_cfg_name=self.data_src_cfg_name,
615655
)
616656

617-
bundle_generator.generate(self.work_dir, num_fold=self.num_fold)
657+
if self.gpu_customization:
658+
bundle_generator.generate(
659+
self.work_dir,
660+
num_fold=self.num_fold,
661+
gpu_customization=self.gpu_customization,
662+
gpu_customization_specs=self.gpu_customization_specs,
663+
)
664+
else:
665+
bundle_generator.generate(self.work_dir, num_fold=self.num_fold)
618666
history = bundle_generator.get_history()
619667
export_bundle_algo_history(history)
620668
self.export_cache(algo_gen=True)

monai/apps/auto3dseg/bundle_gen.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from monai.utils import ensure_tuple
3333

3434
logger = get_logger(module_name=__name__)
35-
ALGO_HASH = os.environ.get("MONAI_ALGO_HASH", "5895e1b")
35+
ALGO_HASH = os.environ.get("MONAI_ALGO_HASH", "c812e5f")
3636

3737
__all__ = ["BundleAlgo", "BundleGen"]
3838

@@ -458,13 +458,44 @@ def get_history(self) -> List:
458458
"""get the history of the bundleAlgo object with their names/identifiers"""
459459
return self.history
460460

461-
def generate(self, output_folder=".", num_fold: int = 5):
461+
def generate(
462+
self,
463+
output_folder=".",
464+
num_fold: int = 5,
465+
gpu_customization: bool = False,
466+
gpu_customization_specs: Optional[Dict[str, Any]] = None,
467+
):
462468
"""
463469
Generate the bundle scripts/configs for each bundleAlgo
464470
465471
Args:
466472
output_folder: the output folder to save each algorithm.
467-
num_fold: the number of cross validation fold
473+
num_fold: the number of cross validation fold.
474+
gpu_customization: the switch to determine whether to automatically customize/optimize bundle script/config
475+
parameters for each bundleAlgo based on gpus. Custom parameters are obtained through dummy
476+
training to simulate the actual model training process and hyperparameter optimization (HPO)
477+
experiments.
478+
gpu_customization_specs (optional): the dictionary to enable users to overwrite the HPO settings. Users can
479+
overwrite part of variables as follows or all of them. The structure is as follows.
480+
481+
.. code-block:: python
482+
483+
gpu_customization_specs = {
484+
'ALGO': {
485+
'num_trials': 6,
486+
'range_num_images_per_batch': [1, 20],
487+
'range_num_sw_batch_size': [1, 20]
488+
}
489+
}
490+
491+
ALGO: the name of the algorithm. It could be one of the algorithm names (e.g., 'dints') or 'universal', which
492+
would apply changes to all algorithms. Possible options are
493+
494+
- {``"universal"``, ``"dints"``, ``"segresnet"``, ``"segresnet2d"``, ``"swinunetr"``}.
495+
496+
num_trials: the number of HPO trials/experiments to run.
497+
range_num_images_per_batch: the range of number of images per mini-batch.
498+
range_num_sw_batch_size: the range of batch size in sliding-window inferer.
468499
"""
469500
fold_idx = list(range(num_fold))
470501
for algo in self.algos:
@@ -475,6 +506,15 @@ def generate(self, output_folder=".", num_fold: int = 5):
475506
gen_algo.set_data_stats(data_stats)
476507
gen_algo.set_data_source(data_src_cfg)
477508
name = f"{gen_algo.name}_{f_id}"
478-
gen_algo.export_to_disk(output_folder, name, fold=f_id)
509+
if gpu_customization:
510+
gen_algo.export_to_disk(
511+
output_folder,
512+
name,
513+
fold=f_id,
514+
gpu_customization=True,
515+
gpu_customization_specs=gpu_customization_specs,
516+
)
517+
else:
518+
gen_algo.export_to_disk(output_folder, name, fold=f_id)
479519
algo_to_pickle(gen_algo, template_path=algo.template_path)
480520
self.history.append({name: gen_algo}) # track the previous, may create a persistent history

tests/min_tests.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def run_testsuit():
104104
"test_image_rw",
105105
"test_img2tensorboard",
106106
"test_integration_fast_train",
107+
"test_integration_gpu_customization",
107108
"test_integration_segmentation_3d",
108109
"test_integration_sliding_window",
109110
"test_integration_unet_2d",

tests/test_integration_autorunner.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,19 @@ def test_autorunner_ensemble(self) -> None:
125125
with skip_if_downloading_fails():
126126
runner.run()
127127

128+
@skip_if_no_cuda
129+
def test_autorunner_gpu_customization(self) -> None:
130+
work_dir = os.path.join(self.test_path, "work_dir")
131+
runner = AutoRunner(work_dir=work_dir, input=self.data_src_cfg)
132+
gpu_customization_specs = {
133+
"universal": {"num_trials": 1, "range_num_images_per_batch": [1, 2], "range_num_sw_batch_size": [1, 2]}
134+
}
135+
runner.set_gpu_customization(gpu_customization=True, gpu_customization_specs=gpu_customization_specs)
136+
runner.set_training_params(train_param) # 2 epochs
137+
runner.set_num_fold(1)
138+
with skip_if_downloading_fails():
139+
runner.run()
140+
128141
@skip_if_no_cuda
129142
@unittest.skipIf(not has_nni, "nni required")
130143
def test_autorunner_hpo(self) -> None:
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Copyright (c) MONAI Consortium
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import os
13+
import tempfile
14+
import unittest
15+
from typing import Dict, List
16+
17+
import nibabel as nib
18+
import numpy as np
19+
import torch
20+
21+
from monai.apps.auto3dseg import AlgoEnsembleBestByFold, AlgoEnsembleBestN, AlgoEnsembleBuilder, BundleGen, DataAnalyzer
22+
from monai.bundle.config_parser import ConfigParser
23+
from monai.data import create_test_image_3d
24+
from monai.utils import optional_import
25+
from monai.utils.enums import AlgoEnsembleKeys
26+
from tests.utils import SkipIfBeforePyTorchVersion, skip_if_downloading_fails, skip_if_no_cuda, skip_if_quick
27+
28+
_, has_tb = optional_import("torch.utils.tensorboard", name="SummaryWriter")
29+
30+
fake_datalist: Dict[str, List[Dict]] = {
31+
"testing": [{"image": "val_001.fake.nii.gz"}, {"image": "val_002.fake.nii.gz"}],
32+
"training": [
33+
{"fold": 0, "image": "tr_image_001.fake.nii.gz", "label": "tr_label_001.fake.nii.gz"},
34+
{"fold": 0, "image": "tr_image_002.fake.nii.gz", "label": "tr_label_002.fake.nii.gz"},
35+
{"fold": 0, "image": "tr_image_003.fake.nii.gz", "label": "tr_label_003.fake.nii.gz"},
36+
{"fold": 0, "image": "tr_image_004.fake.nii.gz", "label": "tr_label_004.fake.nii.gz"},
37+
{"fold": 1, "image": "tr_image_005.fake.nii.gz", "label": "tr_label_005.fake.nii.gz"},
38+
{"fold": 1, "image": "tr_image_006.fake.nii.gz", "label": "tr_label_006.fake.nii.gz"},
39+
{"fold": 1, "image": "tr_image_007.fake.nii.gz", "label": "tr_label_007.fake.nii.gz"},
40+
{"fold": 1, "image": "tr_image_008.fake.nii.gz", "label": "tr_label_008.fake.nii.gz"},
41+
{"fold": 2, "image": "tr_image_009.fake.nii.gz", "label": "tr_label_009.fake.nii.gz"},
42+
{"fold": 2, "image": "tr_image_010.fake.nii.gz", "label": "tr_label_010.fake.nii.gz"},
43+
{"fold": 2, "image": "tr_image_011.fake.nii.gz", "label": "tr_label_011.fake.nii.gz"},
44+
{"fold": 2, "image": "tr_image_012.fake.nii.gz", "label": "tr_label_012.fake.nii.gz"},
45+
],
46+
}
47+
48+
num_gpus = 4 if torch.cuda.device_count() > 4 else torch.cuda.device_count()
49+
train_param = (
50+
{
51+
"CUDA_VISIBLE_DEVICES": list(range(num_gpus)),
52+
"num_iterations": int(4 / num_gpus),
53+
"num_iterations_per_validation": int(4 / num_gpus),
54+
"num_images_per_batch": 2,
55+
"num_epochs": 1,
56+
"num_warmup_iterations": int(4 / num_gpus),
57+
"use_pretrain": False,
58+
"pretrained_path": "",
59+
}
60+
if torch.cuda.is_available()
61+
else {}
62+
)
63+
64+
pred_param = {"files_slices": slice(0, 1), "mode": "mean", "sigmoid": True}
65+
66+
67+
@skip_if_quick
68+
@SkipIfBeforePyTorchVersion((1, 9, 1))
69+
@unittest.skipIf(not has_tb, "no tensorboard summary writer")
70+
class TestEnsembleGpuCustomization(unittest.TestCase):
71+
def setUp(self) -> None:
72+
self.test_dir = tempfile.TemporaryDirectory()
73+
74+
@skip_if_no_cuda
75+
def test_ensemble_gpu_customization(self) -> None:
76+
test_path = self.test_dir.name
77+
78+
dataroot = os.path.join(test_path, "dataroot")
79+
work_dir = os.path.join(test_path, "workdir")
80+
81+
da_output_yaml = os.path.join(work_dir, "datastats.yaml")
82+
data_src_cfg = os.path.join(work_dir, "data_src_cfg.yaml")
83+
84+
if not os.path.isdir(dataroot):
85+
os.makedirs(dataroot)
86+
87+
if not os.path.isdir(work_dir):
88+
os.makedirs(work_dir)
89+
90+
# Generate a fake dataset
91+
for d in fake_datalist["testing"] + fake_datalist["training"]:
92+
im, seg = create_test_image_3d(24, 24, 24, rad_max=10, num_seg_classes=1)
93+
nib_image = nib.Nifti1Image(im, affine=np.eye(4))
94+
image_fpath = os.path.join(dataroot, d["image"])
95+
nib.save(nib_image, image_fpath)
96+
97+
if "label" in d:
98+
nib_image = nib.Nifti1Image(seg, affine=np.eye(4))
99+
label_fpath = os.path.join(dataroot, d["label"])
100+
nib.save(nib_image, label_fpath)
101+
102+
# write to a json file
103+
fake_json_datalist = os.path.join(dataroot, "fake_input.json")
104+
ConfigParser.export_config_file(fake_datalist, fake_json_datalist)
105+
106+
da = DataAnalyzer(fake_json_datalist, dataroot, output_path=da_output_yaml)
107+
da.get_all_case_stats()
108+
109+
data_src = {
110+
"name": "fake_data",
111+
"task": "segmentation",
112+
"modality": "MRI",
113+
"datalist": fake_json_datalist,
114+
"dataroot": dataroot,
115+
"multigpu": False,
116+
"class_names": ["label_class"],
117+
}
118+
119+
ConfigParser.export_config_file(data_src, data_src_cfg)
120+
121+
with skip_if_downloading_fails():
122+
bundle_generator = BundleGen(
123+
algo_path=work_dir, data_stats_filename=da_output_yaml, data_src_cfg_name=data_src_cfg
124+
)
125+
126+
gpu_customization_specs = {
127+
"universal": {"num_trials": 1, "range_num_images_per_batch": [1, 2], "range_num_sw_batch_size": [1, 2]}
128+
}
129+
bundle_generator.generate(
130+
work_dir, num_fold=1, gpu_customization=True, gpu_customization_specs=gpu_customization_specs
131+
)
132+
history = bundle_generator.get_history()
133+
134+
for h in history:
135+
self.assertEqual(len(h.keys()), 1, "each record should have one model")
136+
for _, algo in h.items():
137+
algo.train(train_param)
138+
139+
builder = AlgoEnsembleBuilder(history, data_src_cfg)
140+
builder.set_ensemble_method(AlgoEnsembleBestN(n_best=2))
141+
ensemble = builder.get_ensemble()
142+
preds = ensemble(pred_param)
143+
self.assertTupleEqual(preds[0].shape, (2, 24, 24, 24))
144+
145+
builder.set_ensemble_method(AlgoEnsembleBestByFold(1))
146+
ensemble = builder.get_ensemble()
147+
for algo in ensemble.get_algo_ensemble():
148+
print(algo[AlgoEnsembleKeys.ID])
149+
150+
def tearDown(self) -> None:
151+
self.test_dir.cleanup()
152+
153+
154+
if __name__ == "__main__":
155+
unittest.main()

tests/utils.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,20 @@ def skip_if_downloading_fails():
130130
if "decryption failed" in str(ssl_e):
131131
raise unittest.SkipTest(f"SSL error while downloading: {ssl_e}") from ssl_e
132132
except (RuntimeError, OSError) as rt_e:
133-
if "unexpected EOF" in str(rt_e):
133+
err_str = str(rt_e)
134+
if any(
135+
k in err_str
136+
for k in (
137+
"unexpected EOF", # incomplete download
138+
"network issue",
139+
"gdown dependency", # gdown not installed
140+
"md5 check",
141+
"limit", # HTTP Error 503: Egress is over the account limit
142+
"authenticate",
143+
)
144+
):
134145
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e # incomplete download
135-
if "network issue" in str(rt_e):
136-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
137-
if "gdown dependency" in str(rt_e): # no gdown installed
138-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
139-
if "md5 check" in str(rt_e):
140-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
141-
if "limit" in str(rt_e): # HTTP Error 503: Egress is over the account limit
142-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
146+
143147
raise rt_e
144148

145149

0 commit comments

Comments
 (0)