Skip to content

Commit e989dc7

Browse files
dongyang0122Dong Yangpre-commit-ci[bot]dongywyli
authored
Enable auto3dseg maximize gpu utils (#5733)
### Description Enable auto3dseg maximize gpu utilization based on gpus in higher API levels. ### Types of changes <!--- Put an `x` in all the boxes that apply, and remove the not applicable items --> - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. Signed-off-by: Dong Yang <dongy@dongy-mlt.client.nvidia.com> Signed-off-by: dongy <dongy@nvidia.com> Signed-off-by: Wenqi Li <wenqil@nvidia.com> Co-authored-by: Dong Yang <dongy@dongy-mlt.client.nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: dongy <dongy@nvidia.com> Co-authored-by: Wenqi Li <wenqil@nvidia.com> Co-authored-by: Wenqi Li <831580+wyli@users.noreply.github.com>
1 parent 9784506 commit e989dc7

6 files changed

Lines changed: 275 additions & 14 deletions

File tree

monai/apps/auto3dseg/auto_runner.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,9 @@ def __init__(
278278
self.set_ensemble_method(self.ensemble_method_name)
279279
self.set_num_fold(num_fold=self.num_fold)
280280

281+
self.gpu_customization = False
282+
self.gpu_customization_specs: Dict[str, Any] = {}
283+
281284
# hpo
282285
if hpo_backend.lower() != "nni":
283286
raise NotImplementedError("HPOGen backend only supports NNI")
@@ -332,6 +335,43 @@ def export_cache(self, **kwargs):
332335
self.cache, self.cache_filename, fmt="yaml", default_flow_style=None, sort_keys=False
333336
)
334337

338+
def set_gpu_customization(
339+
self, gpu_customization: bool = False, gpu_customization_specs: Optional[Dict[str, Any]] = None
340+
):
341+
"""
342+
Set options for GPU-based parameter customization/optimization.
343+
344+
Args:
345+
gpu_customization: the switch to determine whether to automatically customize/optimize bundle script/config
346+
parameters for each bundleAlgo based on gpus. Custom parameters are obtained through dummy
347+
training to simulate the actual model training process and hyperparameter optimization (HPO)
348+
experiments.
349+
gpu_customization_specs (optional): the dictionary to enable users to overwrite the HPO settings. Users can
350+
overwrite part of variables as follows or all of them. The structure is as follows.
351+
352+
.. code-block:: python
353+
354+
gpu_customization_specs = {
355+
'ALGO': {
356+
'num_trials': 6,
357+
'range_num_images_per_batch': [1, 20],
358+
'range_num_sw_batch_size': [1, 20]
359+
}
360+
}
361+
362+
ALGO: the name of the algorithm. It could be one of the algorithm names (e.g., 'dints') or 'universal', which
363+
would apply changes to all algorithms. Possible options are
364+
365+
- {``"universal"``, ``"dints"``, ``"segresnet"``, ``"segresnet2d"``, ``"swinunetr"``}.
366+
367+
num_trials: the number of HPO trials/experiments to run.
368+
range_num_images_per_batch: the range of number of images per mini-batch.
369+
range_num_sw_batch_size: the range of batch size in sliding-window inferer.
370+
"""
371+
self.gpu_customization = gpu_customization
372+
if gpu_customization_specs is not None:
373+
self.gpu_customization_specs = gpu_customization_specs
374+
335375
def set_num_fold(self, num_fold: int = 5):
336376
"""
337377
Set the number of cross validation folds for all algos.
@@ -614,7 +654,15 @@ def run(self):
614654
data_src_cfg_name=self.data_src_cfg_name,
615655
)
616656

617-
bundle_generator.generate(self.work_dir, num_fold=self.num_fold)
657+
if self.gpu_customization:
658+
bundle_generator.generate(
659+
self.work_dir,
660+
num_fold=self.num_fold,
661+
gpu_customization=self.gpu_customization,
662+
gpu_customization_specs=self.gpu_customization_specs,
663+
)
664+
else:
665+
bundle_generator.generate(self.work_dir, num_fold=self.num_fold)
618666
history = bundle_generator.get_history()
619667
export_bundle_algo_history(history)
620668
self.export_cache(algo_gen=True)

monai/apps/auto3dseg/bundle_gen.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from monai.utils import ensure_tuple
3333

3434
logger = get_logger(module_name=__name__)
35-
ALGO_HASH = os.environ.get("MONAI_ALGO_HASH", "5895e1b")
35+
ALGO_HASH = os.environ.get("MONAI_ALGO_HASH", "c812e5f")
3636

3737
__all__ = ["BundleAlgo", "BundleGen"]
3838

@@ -458,13 +458,44 @@ def get_history(self) -> List:
458458
"""get the history of the bundleAlgo object with their names/identifiers"""
459459
return self.history
460460

461-
def generate(self, output_folder=".", num_fold: int = 5):
461+
def generate(
462+
self,
463+
output_folder=".",
464+
num_fold: int = 5,
465+
gpu_customization: bool = False,
466+
gpu_customization_specs: Optional[Dict[str, Any]] = None,
467+
):
462468
"""
463469
Generate the bundle scripts/configs for each bundleAlgo
464470
465471
Args:
466472
output_folder: the output folder to save each algorithm.
467-
num_fold: the number of cross validation fold
473+
num_fold: the number of cross validation fold.
474+
gpu_customization: the switch to determine whether to automatically customize/optimize bundle script/config
475+
parameters for each bundleAlgo based on gpus. Custom parameters are obtained through dummy
476+
training to simulate the actual model training process and hyperparameter optimization (HPO)
477+
experiments.
478+
gpu_customization_specs (optional): the dictionary to enable users to overwrite the HPO settings. Users can
479+
overwrite part of variables as follows or all of them. The structure is as follows.
480+
481+
.. code-block:: python
482+
483+
gpu_customization_specs = {
484+
'ALGO': {
485+
'num_trials': 6,
486+
'range_num_images_per_batch': [1, 20],
487+
'range_num_sw_batch_size': [1, 20]
488+
}
489+
}
490+
491+
ALGO: the name of the algorithm. It could be one of the algorithm names (e.g., 'dints') or 'universal', which
492+
would apply changes to all algorithms. Possible options are
493+
494+
- {``"universal"``, ``"dints"``, ``"segresnet"``, ``"segresnet2d"``, ``"swinunetr"``}.
495+
496+
num_trials: the number of HPO trials/experiments to run.
497+
range_num_images_per_batch: the range of number of images per mini-batch.
498+
range_num_sw_batch_size: the range of batch size in sliding-window inferer.
468499
"""
469500
fold_idx = list(range(num_fold))
470501
for algo in self.algos:
@@ -475,6 +506,15 @@ def generate(self, output_folder=".", num_fold: int = 5):
475506
gen_algo.set_data_stats(data_stats)
476507
gen_algo.set_data_source(data_src_cfg)
477508
name = f"{gen_algo.name}_{f_id}"
478-
gen_algo.export_to_disk(output_folder, name, fold=f_id)
509+
if gpu_customization:
510+
gen_algo.export_to_disk(
511+
output_folder,
512+
name,
513+
fold=f_id,
514+
gpu_customization=True,
515+
gpu_customization_specs=gpu_customization_specs,
516+
)
517+
else:
518+
gen_algo.export_to_disk(output_folder, name, fold=f_id)
479519
algo_to_pickle(gen_algo, template_path=algo.template_path)
480520
self.history.append({name: gen_algo}) # track the previous, may create a persistent history

tests/min_tests.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def run_testsuit():
104104
"test_image_rw",
105105
"test_img2tensorboard",
106106
"test_integration_fast_train",
107+
"test_integration_gpu_customization",
107108
"test_integration_segmentation_3d",
108109
"test_integration_sliding_window",
109110
"test_integration_unet_2d",

tests/test_integration_autorunner.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,19 @@ def test_autorunner_ensemble(self) -> None:
125125
with skip_if_downloading_fails():
126126
runner.run()
127127

128+
@skip_if_no_cuda
129+
def test_autorunner_gpu_customization(self) -> None:
130+
work_dir = os.path.join(self.test_path, "work_dir")
131+
runner = AutoRunner(work_dir=work_dir, input=self.data_src_cfg)
132+
gpu_customization_specs = {
133+
"universal": {"num_trials": 1, "range_num_images_per_batch": [1, 2], "range_num_sw_batch_size": [1, 2]}
134+
}
135+
runner.set_gpu_customization(gpu_customization=True, gpu_customization_specs=gpu_customization_specs)
136+
runner.set_training_params(train_param) # 2 epochs
137+
runner.set_num_fold(1)
138+
with skip_if_downloading_fails():
139+
runner.run()
140+
128141
@skip_if_no_cuda
129142
@unittest.skipIf(not has_nni, "nni required")
130143
def test_autorunner_hpo(self) -> None:
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Copyright (c) MONAI Consortium
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import os
13+
import tempfile
14+
import unittest
15+
from typing import Dict, List
16+
17+
import nibabel as nib
18+
import numpy as np
19+
import torch
20+
21+
from monai.apps.auto3dseg import AlgoEnsembleBestByFold, AlgoEnsembleBestN, AlgoEnsembleBuilder, BundleGen, DataAnalyzer
22+
from monai.bundle.config_parser import ConfigParser
23+
from monai.data import create_test_image_3d
24+
from monai.utils import optional_import
25+
from monai.utils.enums import AlgoEnsembleKeys
26+
from tests.utils import SkipIfBeforePyTorchVersion, skip_if_downloading_fails, skip_if_no_cuda, skip_if_quick
27+
28+
_, has_tb = optional_import("torch.utils.tensorboard", name="SummaryWriter")
29+
30+
fake_datalist: Dict[str, List[Dict]] = {
31+
"testing": [{"image": "val_001.fake.nii.gz"}, {"image": "val_002.fake.nii.gz"}],
32+
"training": [
33+
{"fold": 0, "image": "tr_image_001.fake.nii.gz", "label": "tr_label_001.fake.nii.gz"},
34+
{"fold": 0, "image": "tr_image_002.fake.nii.gz", "label": "tr_label_002.fake.nii.gz"},
35+
{"fold": 0, "image": "tr_image_003.fake.nii.gz", "label": "tr_label_003.fake.nii.gz"},
36+
{"fold": 0, "image": "tr_image_004.fake.nii.gz", "label": "tr_label_004.fake.nii.gz"},
37+
{"fold": 1, "image": "tr_image_005.fake.nii.gz", "label": "tr_label_005.fake.nii.gz"},
38+
{"fold": 1, "image": "tr_image_006.fake.nii.gz", "label": "tr_label_006.fake.nii.gz"},
39+
{"fold": 1, "image": "tr_image_007.fake.nii.gz", "label": "tr_label_007.fake.nii.gz"},
40+
{"fold": 1, "image": "tr_image_008.fake.nii.gz", "label": "tr_label_008.fake.nii.gz"},
41+
{"fold": 2, "image": "tr_image_009.fake.nii.gz", "label": "tr_label_009.fake.nii.gz"},
42+
{"fold": 2, "image": "tr_image_010.fake.nii.gz", "label": "tr_label_010.fake.nii.gz"},
43+
{"fold": 2, "image": "tr_image_011.fake.nii.gz", "label": "tr_label_011.fake.nii.gz"},
44+
{"fold": 2, "image": "tr_image_012.fake.nii.gz", "label": "tr_label_012.fake.nii.gz"},
45+
],
46+
}
47+
48+
num_gpus = 4 if torch.cuda.device_count() > 4 else torch.cuda.device_count()
49+
train_param = (
50+
{
51+
"CUDA_VISIBLE_DEVICES": list(range(num_gpus)),
52+
"num_iterations": int(4 / num_gpus),
53+
"num_iterations_per_validation": int(4 / num_gpus),
54+
"num_images_per_batch": 2,
55+
"num_epochs": 1,
56+
"num_warmup_iterations": int(4 / num_gpus),
57+
"use_pretrain": False,
58+
"pretrained_path": "",
59+
}
60+
if torch.cuda.is_available()
61+
else {}
62+
)
63+
64+
pred_param = {"files_slices": slice(0, 1), "mode": "mean", "sigmoid": True}
65+
66+
67+
@skip_if_quick
68+
@SkipIfBeforePyTorchVersion((1, 9, 1))
69+
@unittest.skipIf(not has_tb, "no tensorboard summary writer")
70+
class TestEnsembleGpuCustomization(unittest.TestCase):
71+
def setUp(self) -> None:
72+
self.test_dir = tempfile.TemporaryDirectory()
73+
74+
@skip_if_no_cuda
75+
def test_ensemble_gpu_customization(self) -> None:
76+
test_path = self.test_dir.name
77+
78+
dataroot = os.path.join(test_path, "dataroot")
79+
work_dir = os.path.join(test_path, "workdir")
80+
81+
da_output_yaml = os.path.join(work_dir, "datastats.yaml")
82+
data_src_cfg = os.path.join(work_dir, "data_src_cfg.yaml")
83+
84+
if not os.path.isdir(dataroot):
85+
os.makedirs(dataroot)
86+
87+
if not os.path.isdir(work_dir):
88+
os.makedirs(work_dir)
89+
90+
# Generate a fake dataset
91+
for d in fake_datalist["testing"] + fake_datalist["training"]:
92+
im, seg = create_test_image_3d(24, 24, 24, rad_max=10, num_seg_classes=1)
93+
nib_image = nib.Nifti1Image(im, affine=np.eye(4))
94+
image_fpath = os.path.join(dataroot, d["image"])
95+
nib.save(nib_image, image_fpath)
96+
97+
if "label" in d:
98+
nib_image = nib.Nifti1Image(seg, affine=np.eye(4))
99+
label_fpath = os.path.join(dataroot, d["label"])
100+
nib.save(nib_image, label_fpath)
101+
102+
# write to a json file
103+
fake_json_datalist = os.path.join(dataroot, "fake_input.json")
104+
ConfigParser.export_config_file(fake_datalist, fake_json_datalist)
105+
106+
da = DataAnalyzer(fake_json_datalist, dataroot, output_path=da_output_yaml)
107+
da.get_all_case_stats()
108+
109+
data_src = {
110+
"name": "fake_data",
111+
"task": "segmentation",
112+
"modality": "MRI",
113+
"datalist": fake_json_datalist,
114+
"dataroot": dataroot,
115+
"multigpu": False,
116+
"class_names": ["label_class"],
117+
}
118+
119+
ConfigParser.export_config_file(data_src, data_src_cfg)
120+
121+
with skip_if_downloading_fails():
122+
bundle_generator = BundleGen(
123+
algo_path=work_dir, data_stats_filename=da_output_yaml, data_src_cfg_name=data_src_cfg
124+
)
125+
126+
gpu_customization_specs = {
127+
"universal": {"num_trials": 1, "range_num_images_per_batch": [1, 2], "range_num_sw_batch_size": [1, 2]}
128+
}
129+
bundle_generator.generate(
130+
work_dir, num_fold=1, gpu_customization=True, gpu_customization_specs=gpu_customization_specs
131+
)
132+
history = bundle_generator.get_history()
133+
134+
for h in history:
135+
self.assertEqual(len(h.keys()), 1, "each record should have one model")
136+
for _, algo in h.items():
137+
algo.train(train_param)
138+
139+
builder = AlgoEnsembleBuilder(history, data_src_cfg)
140+
builder.set_ensemble_method(AlgoEnsembleBestN(n_best=2))
141+
ensemble = builder.get_ensemble()
142+
preds = ensemble(pred_param)
143+
self.assertTupleEqual(preds[0].shape, (2, 24, 24, 24))
144+
145+
builder.set_ensemble_method(AlgoEnsembleBestByFold(1))
146+
ensemble = builder.get_ensemble()
147+
for algo in ensemble.get_algo_ensemble():
148+
print(algo[AlgoEnsembleKeys.ID])
149+
150+
def tearDown(self) -> None:
151+
self.test_dir.cleanup()
152+
153+
154+
if __name__ == "__main__":
155+
unittest.main()

tests/utils.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,20 @@ def skip_if_downloading_fails():
130130
if "decryption failed" in str(ssl_e):
131131
raise unittest.SkipTest(f"SSL error while downloading: {ssl_e}") from ssl_e
132132
except (RuntimeError, OSError) as rt_e:
133-
if "unexpected EOF" in str(rt_e):
133+
err_str = str(rt_e)
134+
if any(
135+
k in err_str
136+
for k in (
137+
"unexpected EOF", # incomplete download
138+
"network issue",
139+
"gdown dependency", # gdown not installed
140+
"md5 check",
141+
"limit", # HTTP Error 503: Egress is over the account limit
142+
"authenticate",
143+
)
144+
):
134145
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e # incomplete download
135-
if "network issue" in str(rt_e):
136-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
137-
if "gdown dependency" in str(rt_e): # no gdown installed
138-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
139-
if "md5 check" in str(rt_e):
140-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
141-
if "limit" in str(rt_e): # HTTP Error 503: Egress is over the account limit
142-
raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e
146+
143147
raise rt_e
144148

145149

0 commit comments

Comments
 (0)