model is parallelizable

janvanrijn · janvanrijn · commit 6d13a757294a · 2017-04-21T15:37:01.000+02:00
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
@@ -1,6 +1,6 @@
 from .flow import OpenMLFlow
-from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
+from .sklearn_converter import sklearn_to_flow, flow_to_sklearn,  model_is_paralizable
 from .functions import get_flow, list_flows, flow_exists
 
 __all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
-           'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']
+           'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists', 'model_is_paralizable']
diff --git a/openml/flows/sklearn_converter.py b/openml/flows/sklearn_converter.py
@@ -536,6 +536,33 @@ def _serialize_cross_validator(o):
 
     return ret
 
+def model_is_paralizable(model):
+    """Return True iff every ``n_jobs`` of `model` selects a single core.
+
+    Despite the name, True means the model runs serially, so process CPU time
+    is a fair runtime measure. Raises ValueError for non-estimators and
+    PyOpenMLError if a search object optimizes ``n_jobs`` itself.
+    """
+    def is_n_jobs(param):
+        # nested parameters are spelled ``<component>__<parameter>``
+        return param.split('__')[-1] == 'n_jobs'
+
+    if not isinstance(model, (sklearn.base.BaseEstimator,
+                              sklearn.model_selection._search.BaseSearchCV)):
+        raise ValueError('model should be BaseEstimator or BaseSearchCV')
+
+    if isinstance(model, sklearn.model_selection._search.BaseSearchCV):
+        # search space: a dict, or list of dicts, keyed by parameter name
+        space = (getattr(model, 'param_grid', None) or
+                 getattr(model, 'param_distributions', None) or {})
+        grids = [space] if isinstance(space, dict) else space
+        if any(is_n_jobs(param) for grid in grids for param in grid):
+            raise PyOpenMLError('openml-python should not be used to '
+                                'optimize the n_jobs parameter.')
+    # 1 or None (unset in newer scikit-learn) = one core; n = n cores and
+    # -1 = all cores, which makes it hard to measure runtime fairly
+    params = model.get_params()
+    return all(params[p] in (1, None) for p in params if is_n_jobs(p))
 
 def _deserialize_cross_validator(value, **kwargs):
     model_name = value['name']
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -10,7 +10,7 @@
 
 from ..exceptions import PyOpenMLError
 from .. import config
-from ..flows import sklearn_to_flow, get_flow, flow_exists
+from ..flows import sklearn_to_flow, get_flow, flow_exists, model_is_paralizable
 from ..setups import setup_exists
 from ..exceptions import OpenMLCacheException, OpenMLServerException
 from ..util import URLError, version_complies
@@ -160,6 +160,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
     user_defined_measures = defaultdict(lambda: defaultdict(dict))
 
     rep_no = 0
+    can_measure_runtime = version_complies(3, 3) and model_is_paralizable(model)
     # TODO use different iterator to only provide a single iterator (less
     # methods, less maintenance, less confusion)
     for rep in task.iterate_repeats():
@@ -174,11 +175,11 @@ def _run_task_get_arffcontent(model, task, class_labels):
 
             try:
                 # for measuring runtime. Only available since Python 3.3
-                if version_complies(3, 3):
+                if can_measure_runtime:
                     modelfit_starttime = time.process_time()
                 model_fold.fit(trainX, trainY)
 
-                if version_complies(3, 3):
+                if can_measure_runtime:
                     modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
                     user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
             except AttributeError as e:
@@ -192,12 +193,12 @@ def _run_task_get_arffcontent(model, task, class_labels):
             else:
                 model_classes = model_fold.classes_
 
-            if version_complies(3, 3):
+            if can_measure_runtime:
                 modelpredict_starttime = time.process_time()
             
             ProbaY = model_fold.predict_proba(testX)
             PredY = model_fold.predict(testX)
-            if version_complies(3, 3):
+            if can_measure_runtime:
                 modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
                 user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
                 user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
diff --git a/tests/test_flows/test_sklearn.py b/tests/test_flows/test_sklearn.py
@@ -25,7 +25,8 @@
 import sklearn.tree
 
 from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
-from openml.flows.sklearn_converter import _format_external_version, _check_dependencies
+from openml.flows.sklearn_converter import _format_external_version, \
+    _check_dependencies, model_is_paralizable
 from openml.exceptions import PyOpenMLError
 
 this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -555,3 +556,18 @@ def test_illegal_parameter_names_featureunion(self):
             ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
         ]
         self.assertRaises(ValueError, sklearn.pipeline.FeatureUnion, transformer_list=transformer_list)
+
+    def test_paralizable_check(self):
+        """Check the n_jobs verdict for plain estimators and for pipelines."""
+        forest = sklearn.ensemble.RandomForestClassifier
+        bagging = sklearn.ensemble.BaggingClassifier
+        cases = [
+            (forest(), True),
+            (forest(n_jobs=5), False),
+            (forest(n_jobs=-1), False),
+            (sklearn.pipeline.Pipeline(steps=[('bag', bagging(n_jobs=1))]), True),
+            (sklearn.pipeline.Pipeline(steps=[('bag', bagging(n_jobs=5))]), False),
+            (sklearn.pipeline.Pipeline(steps=[('bag', bagging(n_jobs=-1))]), False),
+        ]
+        for model, expected in cases:
+            self.assertEqual(model_is_paralizable(model), expected)