Skip to content

Commit f6b68d1

Browse files
authored
Merge branch 'develop' into add/#145
2 parents b9d8425 + 9a0d9a8 commit f6b68d1

9 files changed

Lines changed: 168 additions & 12 deletions

File tree

openml/__init__.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,52 @@
1818

1919
from .datasets import OpenMLDataset, OpenMLDataFeature
2020
from . import datasets
21+
from . import tasks
2122
from . import runs
2223
from . import flows
2324
from .runs import OpenMLRun
2425
from .tasks import OpenMLTask, OpenMLSplit
2526
from .flows import OpenMLFlow
2627

28+
__version__ = "0.3.0"  # package version string (PEP 396-style module attribute)
29+
30+
31+
def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
                   run_ids=None):
    """Populate the local cache for offline and parallel usage.

    Each requested entity is fetched once through its ``get_*`` accessor,
    which stores the downloaded artifact in the OpenML cache directory as
    a side effect. Arguments left as ``None`` are skipped.

    Parameters
    ----------
    task_ids : iterable of int, optional
        Ids of the tasks to download and cache.

    dataset_ids : iterable of int, optional
        Ids of the datasets to download and cache.

    flow_ids : iterable of int, optional
        Ids of the flows to download and cache.

    run_ids : iterable of int, optional
        Ids of the runs to download and cache.

    Returns
    -------
    None
    """
    if task_ids is not None:
        for task_id in task_ids:
            tasks.functions.get_task(task_id)

    if dataset_ids is not None:
        for dataset_id in dataset_ids:
            datasets.functions.get_dataset(dataset_id)

    if flow_ids is not None:
        for flow_id in flow_ids:
            flows.functions.get_flow(flow_id)

    if run_ids is not None:
        for run_id in run_ids:
            runs.functions.get_run(run_id)
2765

28-
__version__ = "0.2.1"
2966

3067
__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
3168
'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
32-
'config', 'runs', 'flows']
69+
'config', 'runs', 'flows', 'tasks']

openml/_api_calls.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,9 @@ def _parse_server_exception(response):
110110
try:
111111
server_exception = xmltodict.parse(response.text)
112112
except:
113-
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
113+
raise OpenMLServerError(('Unexpected server error. Please '
114+
'contact the developers!\nStatus code: '
115+
'%d\n' % response.status_code) + response.text)
114116

115117
code = int(server_exception['oml:error']['oml:code'])
116118
message = server_exception['oml:error']['oml:message']

openml/flows/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from .flow import OpenMLFlow
2-
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
2+
3+
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, _check_n_jobs
34
from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
45

56
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',

openml/flows/sklearn_converter.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,44 @@ def _serialize_cross_validator(o):
544544

545545
return ret
546546

547+
def _check_n_jobs(model):
    """Return True iff *model* is configured to run on a single core.

    openml-python can only measure runtimes fairly when the model runs on
    a single core, i.e. when every ``n_jobs`` (hyper)parameter equals 1.

    Parameters
    ----------
    model : sklearn.base.BaseEstimator or sklearn.model_selection._search.BaseSearchCV
        The model whose parameter settings are inspected.

    Returns
    -------
    bool
        True when runtimes can be measured (all ``n_jobs`` == 1).

    Raises
    ------
    ValueError
        If *model* is neither a BaseEstimator nor a BaseSearchCV.
    PyOpenMLError
        If ``n_jobs`` is part of the hyperparameter search space.
    """
    def check(param_dict, disallow_parameter=False):
        for param, value in param_dict.items():
            # n_jobs is the scikit-learn parameter for parallelizing jobs
            if param.split('__')[-1] == 'n_jobs':
                # 0 = illegal value (?), 1 = use one core, n = use n cores,
                # -1 = use all available cores -> this makes it hard to
                # measure runtime in a fair way
                if value != 1 or disallow_parameter:
                    return False
        return True

    if not (isinstance(model, sklearn.base.BaseEstimator) or
            isinstance(model, sklearn.model_selection._search.BaseSearchCV)):
        raise ValueError('model should be BaseEstimator or BaseSearchCV')

    # make sure that n_jobs is not in the parameter grid of the
    # optimization procedure
    if isinstance(model, sklearn.model_selection._search.BaseSearchCV):
        if isinstance(model, sklearn.model_selection.GridSearchCV):
            param_distributions = model.param_grid
        elif isinstance(model, sklearn.model_selection.RandomizedSearchCV):
            param_distributions = model.param_distributions
        else:
            # BUGFIX: the original left param_distributions as None here and
            # then crashed on None.items() inside check(). For an unknown
            # BaseSearchCV subclass we cannot inspect the search space, so
            # warn and treat it as empty instead.
            import warnings  # local import: file's top-level imports are not in view
            warnings.warn('Using a BaseSearchCV subclass other than '
                          'GridSearchCV or RandomizedSearchCV; its parameter '
                          'search space cannot be checked for n_jobs.')
            param_distributions = {}

        if not check(param_distributions, True):
            raise PyOpenMLError('openml-python should not be used to '
                                'optimize the n_jobs parameter.')

    # check the model's own parameters for n_jobs
    return check(model.get_params(), False)
547585

548586
def _deserialize_cross_validator(value, **kwargs):
549587
model_name = value['name']

openml/runs/functions.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from ..exceptions import PyOpenMLError
1212
from .. import config
13-
from ..flows import sklearn_to_flow, get_flow, flow_exists
13+
from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
1414
from ..setups import setup_exists
1515
from ..exceptions import OpenMLCacheException, OpenMLServerException
1616
from ..util import URLError, version_complies
@@ -160,6 +160,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
160160
user_defined_measures = defaultdict(lambda: defaultdict(dict))
161161

162162
rep_no = 0
163+
can_measure_runtime = version_complies(3, 3) and _check_n_jobs(model)
163164
# TODO use different iterator to only provide a single iterator (less
164165
# methods, less maintenance, less confusion)
165166
for rep in task.iterate_repeats():
@@ -174,11 +175,11 @@ def _run_task_get_arffcontent(model, task, class_labels):
174175

175176
try:
176177
# for measuring runtime. Only available since Python 3.3
177-
if version_complies(3, 3):
178+
if can_measure_runtime:
178179
modelfit_starttime = time.process_time()
179180
model_fold.fit(trainX, trainY)
180181

181-
if version_complies(3, 3):
182+
if can_measure_runtime:
182183
modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
183184
user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
184185
except AttributeError as e:
@@ -192,12 +193,12 @@ def _run_task_get_arffcontent(model, task, class_labels):
192193
else:
193194
model_classes = model_fold.classes_
194195

195-
if version_complies(3, 3):
196+
if can_measure_runtime:
196197
modelpredict_starttime = time.process_time()
197198

198199
ProbaY = model_fold.predict_proba(testX)
199200
PredY = model_fold.predict(testX)
200-
if version_complies(3, 3):
201+
if can_measure_runtime:
201202
modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
202203
user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
203204
user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration

openml/tasks/functions.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,11 @@ def _create_task_from_xml(xml):
235235
name = input_["@name"]
236236
inputs[name] = input_
237237

238+
evaluation_measures = None
239+
if 'evaluation_measures' in inputs:
240+
evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"]["oml:evaluation_measure"]
241+
242+
238243
# Convert some more parameters
239244
for parameter in \
240245
inputs["estimation_procedure"]["oml:estimation_procedure"][
@@ -251,5 +256,4 @@ def _create_task_from_xml(xml):
251256
"oml:type"],
252257
inputs["estimation_procedure"]["oml:estimation_procedure"][
253258
"oml:data_splits_url"], estimation_parameters,
254-
inputs["evaluation_measures"]["oml:evaluation_measures"][
255-
"oml:evaluation_measure"], None)
259+
evaluation_measures, None)

tests/test_flows/test_sklearn.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@
2525
import sklearn.tree
2626

2727
from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
28+
2829
from openml.flows.functions import assert_flows_equal
29-
from openml.flows.sklearn_converter import _format_external_version, _check_dependencies
30+
from openml.flows.sklearn_converter import _format_external_version, \
31+
_check_dependencies, _check_n_jobs
3032
from openml.exceptions import PyOpenMLError
3133

3234
this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -555,3 +557,36 @@ def test_illegal_parameter_names_featureunion(self):
555557
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
556558
]
557559
self.assertRaises(ValueError, sklearn.pipeline.FeatureUnion, transformer_list=transformer_list)
560+
561+
def test_paralizable_check(self):
    """Check _check_n_jobs on single-/multi-core models and search spaces."""
    # passes the check as long as the param distribution is legal
    bagging_singlecore = sklearn.ensemble.BaggingClassifier()
    # returns False as long as the param distribution is legal
    bagging_multicore = sklearn.ensemble.BaggingClassifier(n_jobs=5)
    # searching over n_jobs must raise an exception
    illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
    # searching over other parameters is fine
    legal_param_dist = {"base__max_depth": [2, 3, 4]}

    legal_models = [
        sklearn.ensemble.RandomForestClassifier(),
        sklearn.ensemble.RandomForestClassifier(n_jobs=5),
        sklearn.ensemble.RandomForestClassifier(n_jobs=-1),
        sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=1))]),
        sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=5))]),
        sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=-1))]),
        sklearn.model_selection.GridSearchCV(bagging_singlecore,
                                             legal_param_dist),
        sklearn.model_selection.GridSearchCV(bagging_multicore,
                                             legal_param_dist),
    ]
    illegal_models = [
        sklearn.model_selection.GridSearchCV(bagging_singlecore,
                                             illegal_param_dist),
        sklearn.model_selection.GridSearchCV(bagging_multicore,
                                             illegal_param_dist),
    ]

    expected = [True, False, False, True, False, False, True, False]

    for model, answer in zip(legal_models, expected):
        self.assertEqual(_check_n_jobs(model), answer)

    for model in illegal_models:
        self.assertRaises(PyOpenMLError, _check_n_jobs, model)

tests/test_openml/__init__.py

Whitespace-only changes.

tests/test_openml/test_openml.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import sys
2+
3+
if sys.version_info[0] >= 3:
4+
from unittest import mock
5+
else:
6+
import mock
7+
8+
import six
9+
10+
from openml.testing import TestBase
11+
import openml
12+
13+
14+
class TestInit(TestBase):
    """Tests for the top-level helpers of the openml package."""

    @mock.patch('openml.tasks.functions.get_task')
    @mock.patch('openml.datasets.functions.get_dataset')
    @mock.patch('openml.flows.functions.get_flow')
    @mock.patch('openml.runs.functions.get_run')
    def test_populate_cache(self, run_mock, flow_mock, dataset_mock, task_mock):
        openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4],
                              flow_ids=[5, 6], run_ids=[7, 8])

        # each getter must be called once per supplied id, with that id as
        # its sole positional argument
        expectations = [
            (task_mock, [(1,), (2,)]),
            (dataset_mock, [(3,), (4,)]),
            (flow_mock, [(5,), (6,)]),
            (run_mock, [(7,), (8,)]),
        ]
        for mocked, fixtures in expectations:
            self.assertEqual(mocked.call_count, 2)
            for argument, fixture in six.moves.zip(mocked.call_args_list,
                                                   fixtures):
                self.assertEqual(argument[0], fixture)
38+

0 commit comments

Comments
 (0)