work on comments from Jan

mfeurer · mfeurer · commit faf5b261407d · 2017-05-18T14:41:44.000+02:00
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -324,10 +324,15 @@ def _from_dict(cls, xml_dict):
         arguments['model'] = None
         flow = cls(**arguments)
 
-        if arguments['external_version'].startswith('sklearn'):
+        # Try to parse the flow into a model, because not everything that
+        # can be deserialized has to come from scikit-learn. If it can't be
+        # deserialized but comes from scikit-learn, that is worth an exception.
+        try:
             from .sklearn_converter import flow_to_sklearn
             model = flow_to_sklearn(flow)
-        else:
+        except Exception as e:
+            if arguments['external_version'].startswith('sklearn'):
+                raise e
             model = None
         flow.model = model
 
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -29,6 +29,8 @@
 
 def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
                       seed=None):
+    """See ``run_flow_on_task`` for documentation."""
+
     flow = sklearn_to_flow(model)
 
     return run_flow_on_task(task=task, flow=flow,
@@ -38,23 +40,29 @@ def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
 
 def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
                      seed=None):
-    """Performs a CV run on the dataset of the given task, using the split.
+    """Run the model provided by the flow on the dataset defined by task.
+
+    Takes the flow and repeat information into account. In case a flow is not
+    yet published, it is published after executing the run (requires
+    internet connection).
 
     Parameters
     ----------
     task : OpenMLTask
         Task to perform.
     model : sklearn model
-        a model which has a function fit(X,Y) and predict(X),
+        A model which has a function fit(X,Y) and predict(X),
         all supervised estimators of scikit learn follow this definition of a model [1]
         [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
     avoid_duplicate_runs : bool
-        if this flag is set to True, the run will throw an error if the
-        setup/task combination is already present on the server.
+        If this flag is set to True, the run will throw an error if the
+        setup/task combination is already present on the server. Works only
+        if the flow is already published on the server. This feature requires an
+        internet connection.
     flow_tags : list(str)
-        a list of tags that the flow should have at creation
+        A list of tags that the flow should have at creation.
     seed: int
-        the models that are not seeded will get this seed
+        Models that are not seeded will get this seed.
 
     Returns
     -------
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -190,6 +190,17 @@ def extract_parameters(_flow, _flow_dict, component_model,
             # _flow is openml flow object, _param dict maps from flow name to flow id
             # for the main call, the param dict can be overridden (useful for unit tests / sentinels)
             # this way, for flows without subflows we do not have to rely on _flow_dict
+            expected_parameters = set(_flow.parameters)
+            expected_components = set(_flow.components)
+            model_parameters = set([mp for mp in component_model.get_params()
+                                    if '__' not in mp])
+            if len((expected_parameters | expected_components) ^ model_parameters) != 0:
+                raise ValueError('Parameters of the model do not match the '
+                                 'parameters expected by the '
+                                 'flow:\nexpected flow parameters: '
+                                 '%s\nmodel parameters: %s' % (
+                    sorted(expected_parameters| expected_components), sorted(model_parameters)))
+
             _params = []
             for _param_name in _flow.parameters:
                 _current = OrderedDict()
@@ -198,7 +209,9 @@ def extract_parameters(_flow, _flow_dict, component_model,
                 _tmp = openml.flows.sklearn_to_flow(
                     component_model.get_params()[_param_name])
 
-                # Try to filter out components which are handled further down!
+                # Try to filter out components (a.k.a. subflows) which are
+                # handled further down in the code (by recursively calling
+                # this function)!
                 if isinstance(_tmp, openml.flows.OpenMLFlow):
                     continue
                 try:
@@ -210,7 +223,19 @@ def extract_parameters(_flow, _flow_dict, component_model,
                     # Object of type 'OpenMLFlow' is not JSON serializable
                     if 'OpenMLFlow' in e.args[0] and \
                             'is not JSON serializable' in e.args[0]:
+                        # Additional check that the parameter that could not
+                        # be parsed is actually a list/tuple which is used
+                        # inside a feature union or pipeline
+                        if not isinstance(_tmp, (list, tuple)):
+                            raise e
+                        for step_name, step in _tmp:
+                            if isinstance(step_name, openml.flows.OpenMLFlow):
+                                raise e
+                            elif not isinstance(step, openml.flows.OpenMLFlow):
+                                raise e
                         continue
+                    else:
+                        raise e
 
                 _current['oml:value'] = _tmp
                 if _main_call:
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -110,8 +110,14 @@ def _reconstruct_flow(_flow, _params):
         # dicts, mapping from flow id to param name to param value
         # (obtained by using the subfunction _to_dict_of_dicts)
         for _param in _flow.parameters:
+            # It can happen that no parameters of a flow are in a setup,
+            # then the flow_id is not in _params; usually happens for a
+            # sklearn.pipeline.Pipeline object, where the steps parameter is
+            # not in the setup
             if _flow.flow_id not in _params:
                 continue
+            # It is not guaranteed that a setup on OpenML has all parameter
+            # settings of a flow, thus a param need not be in _params!
             if _param not in _params[_flow.flow_id]:
                 continue
             _flow.parameters[_param] = _params[_flow.flow_id][_param]
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -229,16 +229,16 @@ def test_run_and_upload(self):
         num_iterations = 5 # for base search classifiers
 
         clfs = []
-        random_state_values = []
+        random_state_fixtures = []
 
         lr = LogisticRegression()
         clfs.append(lr)
-        random_state_values.append('62501')
+        random_state_fixtures.append('62501')
 
         pipeline1 = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
                                     ('dummy', DummyClassifier(strategy='prior'))])
         clfs.append(pipeline1)
-        random_state_values.append('62501')
+        random_state_fixtures.append('62501')
 
         pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                     ('VarianceThreshold', VarianceThreshold()),
@@ -248,13 +248,13 @@ def test_run_and_upload(self):
                                          'min_samples_leaf': [2 ** x for x in range(0, 6 + 1)]},
                                         cv=3, n_iter=10))])
         clfs.append(pipeline2)
-        random_state_values.append('62501')
+        random_state_fixtures.append('62501')
 
         gridsearch = GridSearchCV(BaggingClassifier(base_estimator=SVC()),
                                   {"base_estimator__C": [0.01, 0.1, 10],
                                    "base_estimator__gamma": [0.01, 0.1, 10]})
         clfs.append(gridsearch)
-        random_state_values.append('62501')
+        random_state_fixtures.append('62501')
 
         randomsearch = RandomizedSearchCV(
             RandomForestClassifier(n_estimators=5),
@@ -271,9 +271,9 @@ def test_run_and_upload(self):
         # The random states for the RandomizedSearchCV is set after the
         # random state of the RandomForestClassifier is set, therefore,
         # it has a different value than the other examples before
-        random_state_values.append('33003')
+        random_state_fixtures.append('33003')
 
-        for clf, rsv in zip(clfs, random_state_values):
+        for clf, rsv in zip(clfs, random_state_fixtures):
             run = self._perform_run(task_id, num_test_instances, clf,
                                     random_state_value=rsv)
             if isinstance(clf, BaseSearchCV):
@@ -311,7 +311,6 @@ def test_initialize_model_from_run(self):
 
         self.assertEquals(flowS.components['Imputer'].parameters['strategy'], '"median"')
         self.assertEquals(flowS.components['VarianceThreshold'].parameters['threshold'], '0.05')
-        pass
 
     def test_get_run_trace(self):
         # get_run_trace is already tested implicitly in test_run_and_publish