Skip to content

Commit cf21930

Browse files
committed
base functionality
1 parent 5b56127 commit cf21930

2 files changed

Lines changed: 135 additions & 45 deletions

File tree

openml/extensions/sklearn/extension.py

Lines changed: 127 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -888,35 +888,56 @@ def _format_external_version(
888888
) -> str:
889889
return '%s==%s' % (model_package_name, model_package_version_number)
890890

891-
def _check_n_jobs(self, model: Any) -> bool:
892-
"""Returns True if the parameter settings of model are chosen s.t. the model
893-
will run on a single core (if so, openml-python can measure runtimes)"""
894-
895-
def check(param_grid, restricted_parameter_name, legal_values):
896-
if isinstance(param_grid, dict):
897-
for param, value in param_grid.items():
898-
# n_jobs is scikitlearn parameter for paralizing jobs
899-
if param.split('__')[-1] == restricted_parameter_name:
900-
# 0 = illegal value (?), 1 / None = use one core,
901-
# n = use n cores,
902-
# -1 = use all available cores -> this makes it hard to
903-
# measure runtime in a fair way
904-
if legal_values is None or value not in legal_values:
905-
return False
906-
return True
907-
elif isinstance(param_grid, list):
908-
return all(
909-
check(sub_grid, restricted_parameter_name, legal_values)
910-
for sub_grid in param_grid
911-
)
891+
@staticmethod
892+
def _check_parameter_value_recursive(param_grid: Union[Dict, List[Dict]], parameter_name: str, legal_values: Optional[List]):
893+
"""
894+
Checks within a flow (recursively) whether a given hyperparameter complies to one of the values presented in a
895+
grid. If the hyperparameter does not exist in the grid, True is returned.
912896
913-
if not (
914-
isinstance(model, sklearn.base.BaseEstimator) or self.is_hpo_class(model)
915-
):
916-
raise ValueError('model should be BaseEstimator or BaseSearchCV')
897+
Parameters
898+
----------
899+
param_grid: Union[Dict, List[Dict]]
900+
Dict mapping from hyperparameter list to value, to a list of such dicts
901+
902+
parameter_name: str
903+
The hyperparameter that needs to be inspected
904+
905+
legal_values: List
906+
The values that are accepted. None if no values are legal (the presence of the hyperparameter will trigger
907+
to return False)
917908
918-
# make sure that n_jobs is not in the parameter grid of optimization
919-
# procedure
909+
Returns
910+
-------
911+
bool
912+
True if all occurrences of the hyperparameter only have legal values, False otherwise
913+
914+
"""
915+
if isinstance(param_grid, dict):
916+
for param, value in param_grid.items():
917+
# n_jobs is scikitlearn parameter for paralizing jobs
918+
if param.split('__')[-1] == parameter_name:
919+
# 0 = illegal value (?), 1 / None = use one core,
920+
# n = use n cores,
921+
# -1 = use all available cores -> this makes it hard to
922+
# measure runtime in a fair way
923+
if legal_values is None or value not in legal_values:
924+
return False
925+
return True
926+
elif isinstance(param_grid, list):
927+
return all(
928+
SklearnExtension._check_parameter_value_recursive(sub_grid, parameter_name, legal_values)
929+
for sub_grid in param_grid
930+
)
931+
932+
def _prevent_optimize_n_jobs(self, model):
933+
"""
934+
Ensures that HPO classess will not optimize the n_jobs hyperparameter
935+
936+
Parameters:
937+
-----------
938+
model:
939+
The model that will be fitted
940+
"""
920941
if self.is_hpo_class(model):
921942
if isinstance(model, sklearn.model_selection.GridSearchCV):
922943
param_distributions = model.param_grid
@@ -934,12 +955,55 @@ def check(param_grid, restricted_parameter_name, legal_values):
934955
'{GridSearchCV, RandomizedSearchCV}. '
935956
'Should implement param check. ')
936957

937-
if not check(param_distributions, 'n_jobs', None):
958+
if not SklearnExtension._check_parameter_value_recursive(param_distributions, 'n_jobs', None):
938959
raise PyOpenMLError('openml-python should not be used to '
939960
'optimize the n_jobs parameter.')
940961

962+
def _can_measure_cputime(self, model: Any) -> bool:
    """
    Tells whether CPU time can be measured for the given model, i.e. whether
    every n_jobs setting of the model is either None or 1.

    Parameters
    ----------
    model:
        The model that will be fitted

    Returns
    -------
    bool
        True if all n_jobs parameters are set to None or 1, False otherwise

    Raises
    ------
    ValueError
        If model is neither a sklearn BaseEstimator nor accepted by
        self.is_hpo_class
    """
    supported = isinstance(model, sklearn.base.BaseEstimator) or self.is_hpo_class(model)
    if not supported:
        raise ValueError('model should be BaseEstimator or BaseSearchCV')

    # n_jobs values under which CPU time is considered measurable
    single_core_values = [1, None]
    all_params = model.get_params()
    return SklearnExtension._check_parameter_value_recursive(
        all_params, 'n_jobs', single_core_values
    )
984+
985+
def _can_measure_wallclocktime(self, model: Any) -> bool:
    """
    Returns True if the parameter settings of model are chosen s.t. the model
    will run on a preset number of cores (if so, openml-python can measure
    wallclock time)

    Parameters
    ----------
    model:
        The model that will be fitted

    Returns
    -------
    bool
        True if no n_jobs parameter (at any nesting level) is set to -1,
        False otherwise. A model that has no n_jobs parameter at all yields True.

    Raises
    ------
    ValueError
        If model is neither a sklearn BaseEstimator nor accepted by
        self.is_hpo_class
    """
    if not (
        isinstance(model, sklearn.base.BaseEstimator) or self.is_hpo_class(model)
    ):
        raise ValueError('model should be BaseEstimator or BaseSearchCV')

    # -1 means "use all available cores", which makes timings incomparable.
    # Every n_jobs occurrence is inspected directly: negating an
    # "all values are -1" check would wrongly reject models without any n_jobs
    # parameter and wrongly accept models that mix -1 with other values.
    return all(
        value != -1
        for name, value in model.get_params().items()
        if name.split('__')[-1] == 'n_jobs'
    )
9431007

9441008
################################################################################################
9451009
# Methods for performing runs with extension modules
@@ -1112,8 +1176,11 @@ def _prediction_to_probabilities(
11121176
# but not desirable if we want to upload to OpenML).
11131177

11141178
model_copy = sklearn.base.clone(model, safe=True)
1179+
# security check
1180+
self._prevent_optimize_n_jobs(model_copy)
11151181
# Runtime can be measured if the model is run sequentially
1116-
can_measure_runtime = self._check_n_jobs(model_copy)
1182+
can_measure_cputime = self._can_measure_cputime(model_copy)
1183+
can_measure_wallclocktime = self._can_measure_wallclocktime(model_copy)
11171184

11181185
train_indices, test_indices = task.get_train_test_split_indices(
11191186
repeat=rep_no, fold=fold_no, sample=sample_no)
@@ -1133,17 +1200,29 @@ def _prediction_to_probabilities(
11331200

11341201
try:
11351202
# for measuring runtime. Only available since Python 3.3
1136-
if can_measure_runtime:
1137-
modelfit_starttime = time.process_time()
1203+
modelfit_start_cputime = None
1204+
modelfit_duration_cputime = None
1205+
modelpredict_start_cputime = None
1206+
1207+
modelfit_start_walltime = None
1208+
modelfit_duration_walltime = None
1209+
modelpredict_start_walltime = None
1210+
if can_measure_cputime:
1211+
modelfit_start_cputime = time.process_time()
1212+
if can_measure_wallclocktime:
1213+
modelfit_start_walltime = time.time()
11381214

11391215
if isinstance(task, OpenMLSupervisedTask):
11401216
model_copy.fit(train_x, train_y)
11411217
elif isinstance(task, OpenMLClusteringTask):
11421218
model_copy.fit(train_x)
11431219

1144-
if can_measure_runtime:
1145-
modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
1146-
user_defined_measures['usercpu_time_millis_training'] = modelfit_duration
1220+
if can_measure_cputime:
1221+
modelfit_duration_cputime = (time.process_time() - modelfit_start_cputime) * 1000
1222+
user_defined_measures['usercpu_time_millis_training'] = modelfit_duration_cputime
1223+
elif can_measure_wallclocktime:
1224+
modelfit_duration_walltime = (time.time() - modelfit_start_walltime) * 1000
1225+
user_defined_measures['wall_clock_time_millis_training'] = modelfit_duration_walltime
11471226

11481227
except AttributeError as e:
11491228
# typically happens when training a regressor on classification task
@@ -1169,17 +1248,24 @@ def _prediction_to_probabilities(
11691248
else:
11701249
model_classes = used_estimator.classes_
11711250

1172-
if can_measure_runtime:
1173-
modelpredict_starttime = time.process_time()
1251+
if can_measure_cputime:
1252+
modelpredict_start_cputime = time.process_time()
1253+
if can_measure_wallclocktime:
1254+
modelpredict_start_walltime = time.time()
11741255

11751256
# In supervised learning this returns the predictions for Y, in clustering
11761257
# it returns the clusters
11771258
pred_y = model_copy.predict(test_x)
11781259

1179-
if can_measure_runtime:
1180-
modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
1181-
user_defined_measures['usercpu_time_millis_testing'] = modelpredict_duration
1182-
user_defined_measures['usercpu_time_millis'] = modelfit_duration + modelpredict_duration
1260+
if can_measure_cputime:
1261+
modelpredict_duration_cputime = (time.process_time() - modelpredict_start_cputime) * 1000
1262+
user_defined_measures['usercpu_time_millis_testing'] = modelpredict_duration_cputime
1263+
user_defined_measures['usercpu_time_millis'] = modelfit_duration_cputime + modelpredict_duration_cputime
1264+
if can_measure_wallclocktime:
1265+
modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000
1266+
user_defined_measures['wall_clock_time_millis_testing'] = modelpredict_duration_walltime
1267+
user_defined_measures['wall_clock_time_millis'] = modelfit_duration_walltime + \
1268+
modelpredict_duration_walltime
11831269

11841270
# add client-side calculated metrics. These is used on the server as
11851271
# consistency check, only useful for supervised tasks

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -964,14 +964,18 @@ def test_paralizable_check(self):
964964
illegal_param_dist)
965965
]
966966

967-
answers = [True, False, False, True, False, False, True, False]
967+
can_measure_cputime_answers = [True, False, False, True, False, False, True, False]
968+
can_measure_walltime_answers = [True, True, False, True, True, False, True, True]
968969

969-
for model, expected_answer in zip(legal_models, answers):
970-
self.assertEqual(self.extension._check_n_jobs(model), expected_answer)
970+
for model, allowed_cputime, allowed_walltime in zip(legal_models,
971+
can_measure_cputime_answers,
972+
can_measure_walltime_answers):
973+
self.assertEqual(self.extension._can_measure_cputime(model), allowed_cputime)
974+
self.assertEqual(self.extension._can_measure_wallclocktime(model), allowed_walltime)
971975

972976
for model in illegal_models:
973977
with self.assertRaises(PyOpenMLError):
974-
self.extension._check_n_jobs(model)
978+
self.extension._prevent_optimize_n_jobs(model)
975979

976980
def test__get_fn_arguments_with_defaults(self):
977981
if LooseVersion(sklearn.__version__) < "0.19":

0 commit comments

Comments
 (0)