make unit test stricter

mfeurer · mfeurer · commit 9d6bea1cf3c1 · 2017-05-16T16:24:02.000+02:00
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -127,6 +127,22 @@ def _list_flows(api_call):
     return flows
 
 
+def _check_flow_for_server_id(flow):
+    """Raise ValueError if the flow or one of its components has no flow_id."""
+
+    # Depth-first search over the flow and its nested components: a missing
+    # flow_id means that (sub)flow was not uploaded to the server yet
+    stack = list()
+    stack.append(flow)
+    while len(stack) > 0:
+        current = stack.pop()
+        if current.flow_id is None:
+            raise ValueError("Flow %s has no flow_id!" % current.name)
+        else:
+            for component in current.components.values():
+                stack.append(component)
+
 def assert_flows_equal(flow1, flow2):
     """Check equality of two flows.
 
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -160,36 +160,33 @@ def _create_description_xml(self):
         return description_xml
 
     @staticmethod
-    def _parse_parameters(flow):
+    def _parse_parameters(flow, model=None):
         """Extracts all parameter settings from the model inside a flow in
         OpenML format.
 
         Parameters
         ----------
-        flow
+        flow : OpenMLFlow
             openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
 
+        model : BaseEstimator, optional
+            If not given, the parameters are extracted from ``flow.model``.
+
         """
 
-        # Depth-first search to check if all components were uploaded to the
-        # server before parsing the parameters
-        stack = list()
-        stack.append(flow)
-        while len(stack) > 0:
-            current = stack.pop()
-            if current.flow_id is None:
-                raise ValueError("Flow %s has no flow_id!" % current.name)
-            else:
-                for component in current.components.values():
-                    stack.append(component)
+        if model is None:
+            model = flow.model
+
+        openml.flows.functions._check_flow_for_server_id(flow)
 
         def get_flow_dict(_flow):
             flow_map = {_flow.name: _flow.flow_id}
             for subflow in _flow.components:
                 flow_map.update(get_flow_dict(_flow.components[subflow]))
             return flow_map
 
-        def extract_parameters(_flow, _flow_dict, _main_call=False, main_id=None):
+        def extract_parameters(_flow, _flow_dict, component_model,
+                               _main_call=False, main_id=None):
             # _flow is openml flow object, _param dict maps from flow name to flow id
             # for the main call, the param dict can be overridden (useful for unit tests / sentinels)
             # this way, for flows without subflows we do not have to rely on _flow_dict
@@ -198,7 +195,8 @@ def extract_parameters(_flow, _flow_dict, _main_call=False, main_id=None):
                 _current = OrderedDict()
                 _current['oml:name'] = _param_name
 
-                _tmp = openml.flows.sklearn_to_flow(_flow.model.get_params()[_param_name])
+                _tmp = openml.flows.sklearn_to_flow(
+                    component_model.get_params()[_param_name])
 
                 # Try to filter out components which are handled further down!
                 if isinstance(_tmp, openml.flows.OpenMLFlow):
@@ -222,14 +220,18 @@ def extract_parameters(_flow, _flow_dict, _main_call=False, main_id=None):
                 _params.append(_current)
 
             for _identifier in _flow.components:
-                _params.extend(extract_parameters(_flow.components[_identifier], _flow_dict))
+                subcomponent_model = component_model.get_params()[_identifier]
+                _params.extend(extract_parameters(_flow.components[_identifier],
+                                                  _flow_dict, subcomponent_model))
             return _params
 
         flow_dict = get_flow_dict(flow)
-        parameters = extract_parameters(flow, flow_dict, True, flow.flow_id)
+        parameters = extract_parameters(flow, flow_dict, model,
+                                        True, flow.flow_id)
 
         return parameters
 
+
 ################################################################################
 # Functions which cannot be in runs/functions due to circular imports
 
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -2,22 +2,25 @@
 
 import openml
 import xmltodict
-import copy
 
 from .setup import OpenMLSetup, OpenMLParameter
+from openml.flows import sklearn_to_flow, flow_exists
 
-def setup_exists(flow):
+
+def setup_exists(flow, model=None):
     '''
-    Checks whether a flow / hyperparameter configuration already exists on the server
+    Checks whether a hyperparameter configuration already exists on the server.
 
     Parameter
     ---------
 
     flow : flow
-        the openml flow object (should be downloaded from server)
-    sklearn_model : BaseEstimator
-        The base estimator that was used to create the flow. Will
-         be used to extract parameter settings from.
+        The openml flow object.
+
+    model : BaseEstimator, optional
+        If given, the parameters are parsed from this model instead of the
+        model in the flow. If not given, parameters are parsed from
+        ``flow.model``.
 
     Returns
     -------
@@ -26,7 +29,18 @@ def setup_exists(flow):
     '''
 
     # sadly, this api call relies on a run object
-    openml_param_settings = openml.runs.OpenMLRun._parse_parameters(flow)
+    openml.flows.functions._check_flow_for_server_id(flow)
+
+    if model is None:
+        model = flow.model
+    else:
+        converted_flow = sklearn_to_flow(model)
+        exists = flow_exists(converted_flow.name,
+                             converted_flow.external_version)
+        if exists != flow.flow_id:
+            raise ValueError('This should not happen!')
+
+    openml_param_settings = openml.runs.OpenMLRun._parse_parameters(flow, model)
     description = xmltodict.unparse(_to_dict(flow.flow_id,
                                              openml_param_settings),
                                     pretty=True)
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
@@ -0,0 +1,49 @@
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
+
+from openml.testing import TestBase
+from openml.flows.sklearn_converter import sklearn_to_flow
+from openml import OpenMLRun
+
+
+class TestRun(TestBase):
+
+    def test_parse_parameters_flow_not_on_server(self):
+        # Parsing must fail if the flow or a nested component has no flow_id.
+        model = LogisticRegression()
+        flow = sklearn_to_flow(model)
+        self.assertRaisesRegexp(ValueError,
+                                'Flow sklearn.linear_model.logistic.LogisticRegression '
+                                'has no flow_id!',
+                                OpenMLRun._parse_parameters, flow)
+
+        model = AdaBoostClassifier(base_estimator=LogisticRegression())
+        flow = sklearn_to_flow(model)
+        flow.flow_id = 1
+        self.assertRaisesRegexp(ValueError,
+                                'Flow sklearn.linear_model.logistic.LogisticRegression '
+                                'has no flow_id!',
+                                OpenMLRun._parse_parameters, flow)
+
+    def test_parse_parameters(self):
+        # With flow ids set, every parsed parameter must name its component.
+        model = RandomizedSearchCV(
+            estimator=RandomForestClassifier(n_estimators=5),
+            param_distributions={"max_depth": [3, None],
+                                 "max_features": [1, 2, 3, 4],
+                                 "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+                                 "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                                 "bootstrap": [True, False],
+                                 "criterion": ["gini", "entropy"]},
+            cv=StratifiedKFold(n_splits=2, random_state=1),
+            n_iter=5)
+        flow = sklearn_to_flow(model)
+        flow.flow_id = 1
+        flow.components['estimator'].flow_id = 2
+        parameters = OpenMLRun._parse_parameters(flow)
+        for parameter in parameters:
+            self.assertIsNotNone(parameter['oml:component'], msg=parameter)
+            if parameter['oml:name'] == 'n_estimators':
+                self.assertEqual(parameter['oml:value'], '5')
+                self.assertEqual(parameter['oml:component'], 2)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -32,11 +32,6 @@
     StratifiedKFold
 from sklearn.pipeline import Pipeline
 
-if sys.version_info[0] >= 3:
-    from unittest import mock
-else:
-    import mock
-
 
 class TestRun(TestBase):
 
@@ -219,34 +214,50 @@ def test_run_and_upload(self):
         num_folds = 1 # because of holdout
         num_iterations = 5 # for base search classifiers
 
-        clfs = [LogisticRegression(),
-                Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
-                                ('dummy', DummyClassifier(strategy='prior'))]),
-                Pipeline(steps=[('Imputer', Imputer(strategy='median')),
-                                ('VarianceThreshold', VarianceThreshold()),
-                                ('Estimator', RandomizedSearchCV(
-                                    DecisionTreeClassifier(),
-                                    {'min_samples_split': [2 ** x for x in range(1, 7 + 1)],
-                                     'min_samples_leaf': [2 ** x for x in range(0, 6 + 1)]},
-                                    cv=3, n_iter=10))]),
-                GridSearchCV(BaggingClassifier(base_estimator=SVC()),
-                             {"base_estimator__C": [0.01, 0.1, 10],
-                              "base_estimator__gamma": [0.01, 0.1, 10]}),
-                RandomizedSearchCV(RandomForestClassifier(n_estimators=5),
-                                   {"max_depth": [3, None],
-                                    "max_features": [1, 2, 3, 4],
-                                    "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
-                                    "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                                    "bootstrap": [True, False],
-                                    "criterion": ["gini", "entropy"]},
-                                    cv=StratifiedKFold(n_splits=2,
-                                                       random_state=1),
-                                    n_iter=num_iterations)]
-
+        clfs = []
+        random_state_values = []
+
+        lr = LogisticRegression()
+        clfs.append(lr)
+        random_state_values.append('62501')
+
+        pipeline1 = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
+                                    ('dummy', DummyClassifier(strategy='prior'))])
+        clfs.append(pipeline1)
+        random_state_values.append('62501')
+
+        pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
+                                    ('VarianceThreshold', VarianceThreshold()),
+                                    ('Estimator', RandomizedSearchCV(
+                                        DecisionTreeClassifier(),
+                                        {'min_samples_split': [2 ** x for x in range(1, 7 + 1)],
+                                         'min_samples_leaf': [2 ** x for x in range(0, 6 + 1)]},
+                                        cv=3, n_iter=10))])
+        clfs.append(pipeline2)
+        random_state_values.append('62501')
+
+        gridsearch = GridSearchCV(BaggingClassifier(base_estimator=SVC()),
+                                  {"base_estimator__C": [0.01, 0.1, 10],
+                                   "base_estimator__gamma": [0.01, 0.1, 10]})
+        clfs.append(gridsearch)
+        random_state_values.append('62501')
+
+        randomsearch = RandomizedSearchCV(
+            RandomForestClassifier(n_estimators=5),
+            {"max_depth": [3, None],
+             "max_features": [1, 2, 3, 4],
+             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+             "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+             "bootstrap": [True, False],
+             "criterion": ["gini", "entropy"]},
+            cv=StratifiedKFold(n_splits=2, random_state=1),
+            n_iter=num_iterations)
+
+        clfs.append(randomsearch)
         # The random states for the RandomizedSearchCV is set after the
         # random state of the RandomForestClassifier is set, therefore,
         # it has a different value than the other examples before
-        random_state_values = ['62501'] * (len(clfs) - 1) + ['33003']
+        random_state_values.append('33003')
 
         for clf, rsv in zip(clfs, random_state_values):
             run = self._perform_run(task_id, num_test_instances, clf,
@@ -333,12 +344,11 @@ def test__run_exists(self):
         # and can just check their status on line
         clfs = [sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='mean')),
                                                 ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
-                                                ('Estimator', GaussianNB())]),
+                                                ('Estimator', DecisionTreeClassifier(max_depth=4))]),
                 sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='most_frequent')),
                                                  ('VarianceThreshold', VarianceThreshold(threshold=0.1)),
                                                  ('Estimator', DecisionTreeClassifier(max_depth=4))])]
 
-
         task = openml.tasks.get_task(115)
 
         for clf in clfs:
@@ -347,18 +357,18 @@ def test__run_exists(self):
                 # skip run if it was already performed.
                 run = openml.runs.run_model_on_task(task, clf, avoid_duplicate_runs=True)
                 run.publish()
-            except openml.exceptions.PyOpenMLError:
+            except openml.exceptions.PyOpenMLError as e:
                 # run already existed. Great.
                 pass
 
             flow = openml.flows.sklearn_to_flow(clf)
             flow_exists = openml.flows.flow_exists(flow.name, flow.external_version)
-            self.assertIsInstance(flow_exists, int)
+            self.assertGreater(flow_exists, 0)
             downloaded_flow = openml.flows.get_flow(flow_exists)
-            setup_exists = openml.setups.setup_exists(downloaded_flow)
-            self.assertIsInstance(setup_exists, int)
+            setup_exists = openml.setups.setup_exists(downloaded_flow, clf)
+            self.assertGreater(setup_exists, 0)
             run_ids = _run_exists(task.task_id, setup_exists)
-            self.assertGreater(len(run_ids), 0)
+            self.assertTrue(run_ids, msg=(run_ids, clf))
 
     def test__get_seeded_model(self):
         # randomized models that are initialized without seeds, can be seeded