Skip to content

Commit 9a0d9a8

Browse files
authored
Merge pull request #230 from openml/fix229
Check if model is parallelizable
2 parents 058cfa1 + 239bf41 commit 9a0d9a8

4 files changed

Lines changed: 80 additions & 7 deletions

File tree

openml/flows/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .flow import OpenMLFlow
2-
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
2+
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, _check_n_jobs
33
from .functions import get_flow, list_flows, flow_exists
44

55
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',

openml/flows/sklearn_converter.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,44 @@ def _serialize_cross_validator(o):
536536

537537
return ret
538538

539+
def _check_n_jobs(model):
    '''
    Check whether ``model`` is configured to run on a single core.

    Returns True if the parameter settings of model are chosen s.t. the model
    will run on a single core (in that case, openml-python can measure
    runtimes).

    Parameters
    ----------
    model : sklearn.base.BaseEstimator
        Estimator (possibly a pipeline or a hyperparameter search) whose
        ``n_jobs`` settings are inspected via ``get_params()``.

    Returns
    -------
    bool
        True if every ``n_jobs`` parameter (at any nesting depth) is 1 or
        None, False otherwise. None is accepted because scikit-learn
        documents it as meaning 1 unless a joblib parallel backend context
        is active.

    Raises
    ------
    ValueError
        If ``model`` is neither a BaseEstimator nor a BaseSearchCV.
    PyOpenMLError
        If ``model`` is a GridSearchCV / RandomizedSearchCV whose search
        space contains an ``n_jobs`` parameter.
    '''
    import warnings

    def check(param_dict, disallow_parameter=False):
        for param, value in param_dict.items():
            # n_jobs is the scikit-learn parameter for parallelizing jobs;
            # nested parameters are addressed as <component>__n_jobs
            if param.split('__')[-1] != 'n_jobs':
                continue
            if disallow_parameter:
                return False
            # 0 = illegal value (?), 1 = use one core, n = use n cores
            # -1 = use all available cores -> this makes it hard to
            # measure runtime in a fair way
            # None = scikit-learn's "unset" value, which it treats as 1
            if value is not None and value != 1:
                return False
        return True

    if not isinstance(model, sklearn.base.BaseEstimator):
        # the private BaseSearchCV class is only consulted as a fallback
        if not isinstance(model,
                          sklearn.model_selection._search.BaseSearchCV):
            raise ValueError('model should be BaseEstimator or BaseSearchCV')

    # make sure that n_jobs is not in the parameter grid of optimization
    # procedure
    if isinstance(model, sklearn.model_selection._search.BaseSearchCV):
        param_distributions = None
        if isinstance(model, sklearn.model_selection.GridSearchCV):
            param_distributions = model.param_grid
        elif isinstance(model, sklearn.model_selection.RandomizedSearchCV):
            param_distributions = model.param_distributions
        else:
            warnings.warn('Warning! Using subclass BaseSearchCV other than '
                          '{GridSearchCV, RandomizedSearchCV}. Should implement param check. ')

        # The search space is either a single mapping or a sequence of
        # mappings; an unknown search class leaves nothing to inspect.
        if param_distributions is None:
            search_spaces = []
        elif hasattr(param_distributions, 'items'):
            search_spaces = [param_distributions]
        else:
            search_spaces = list(param_distributions)

        for search_space in search_spaces:
            if not check(search_space, True):
                raise PyOpenMLError('openml-python should not be used to '
                                    'optimize the n_jobs parameter.')

    # check the parameters for n_jobs
    return check(model.get_params(), False)
539577

540578
def _deserialize_cross_validator(value, **kwargs):
541579
model_name = value['name']

openml/runs/functions.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from ..exceptions import PyOpenMLError
1212
from .. import config
13-
from ..flows import sklearn_to_flow, get_flow, flow_exists
13+
from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
1414
from ..setups import setup_exists
1515
from ..exceptions import OpenMLCacheException, OpenMLServerException
1616
from ..util import URLError, version_complies
@@ -160,6 +160,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
160160
user_defined_measures = defaultdict(lambda: defaultdict(dict))
161161

162162
rep_no = 0
163+
can_measure_runtime = version_complies(3, 3) and _check_n_jobs(model)
163164
# TODO use different iterator to only provide a single iterator (less
164165
# methods, less maintenance, less confusion)
165166
for rep in task.iterate_repeats():
@@ -174,11 +175,11 @@ def _run_task_get_arffcontent(model, task, class_labels):
174175

175176
try:
176177
# for measuring runtime. Only available since Python 3.3
177-
if version_complies(3, 3):
178+
if can_measure_runtime:
178179
modelfit_starttime = time.process_time()
179180
model_fold.fit(trainX, trainY)
180181

181-
if version_complies(3, 3):
182+
if can_measure_runtime:
182183
modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
183184
user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
184185
except AttributeError as e:
@@ -192,12 +193,12 @@ def _run_task_get_arffcontent(model, task, class_labels):
192193
else:
193194
model_classes = model_fold.classes_
194195

195-
if version_complies(3, 3):
196+
if can_measure_runtime:
196197
modelpredict_starttime = time.process_time()
197198

198199
ProbaY = model_fold.predict_proba(testX)
199200
PredY = model_fold.predict(testX)
200-
if version_complies(3, 3):
201+
if can_measure_runtime:
201202
modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
202203
user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
203204
user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration

tests/test_flows/test_sklearn.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
import sklearn.tree
2626

2727
from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
28-
from openml.flows.sklearn_converter import _format_external_version, _check_dependencies
28+
from openml.flows.sklearn_converter import _format_external_version, \
29+
_check_dependencies, _check_n_jobs
2930
from openml.exceptions import PyOpenMLError
3031

3132
this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -555,3 +556,36 @@ def test_illegal_parameter_names_featureunion(self):
555556
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
556557
]
557558
self.assertRaises(ValueError, sklearn.pipeline.FeatureUnion, transformer_list=transformer_list)
559+
560+
def test_paralizable_check(self):
    """Check _check_n_jobs on plain, pipelined and search-wrapped models.

    Models are paired with the value _check_n_jobs is expected to return;
    searches whose grid contains an n_jobs parameter must raise
    PyOpenMLError instead.
    """
    # using this model should pass the test (if param distribution is legal)
    singlecore_bagging = sklearn.ensemble.BaggingClassifier()
    # using this model should return false (if param distribution is legal)
    multicore_bagging = sklearn.ensemble.BaggingClassifier(n_jobs=5)
    # using this param distribution should raise an exception
    illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
    # using this param distribution should not raise an exception
    legal_param_dist = {"base__max_depth": [2, 3, 4]}

    # (model, expected return value) pairs; keeping them together avoids
    # the two lists drifting out of sync
    legal_cases = [
        (sklearn.ensemble.RandomForestClassifier(), True),
        (sklearn.ensemble.RandomForestClassifier(n_jobs=5), False),
        (sklearn.ensemble.RandomForestClassifier(n_jobs=-1), False),
        (sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=1))]),
         True),
        (sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=5))]),
         False),
        (sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=-1))]),
         False),
        (sklearn.model_selection.GridSearchCV(singlecore_bagging,
                                              legal_param_dist),
         True),
        (sklearn.model_selection.GridSearchCV(multicore_bagging,
                                              legal_param_dist),
         False),
    ]
    illegal_models = [
        sklearn.model_selection.GridSearchCV(singlecore_bagging,
                                             illegal_param_dist),
        sklearn.model_selection.GridSearchCV(multicore_bagging,
                                             illegal_param_dist),
    ]

    for model, expected in legal_cases:
        # assertEqual reports both values; the message names the model
        self.assertEqual(_check_n_jobs(model), expected,
                         msg='unexpected n_jobs verdict for %r' % (model,))

    for model in illegal_models:
        self.assertRaises(PyOpenMLError, _check_n_jobs, model)

0 commit comments

Comments
 (0)