Skip to content

Commit 2b7e740

Browse files
Neeratyoy authored and mfeurer committed
To handle non-actionable steps in sklearn (#866)
* Initial changes to handle reproducible example from the issue
* Making tentative changes; need to test deserialization
* Fixing deserialization when empty steps in sklearn model
* Fixing flake issues, failing test cases
* Fixing test cases
* Dropping support for 'None' as sklearn estimator
* Adding test case for None estimator
1 parent d79a98c commit 2b7e740

2 files changed

Lines changed: 114 additions & 36 deletions

File tree

openml/extensions/sklearn/extension.py

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -696,10 +696,14 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
696696
# will be part of the name (in brackets)
697697
sub_components_names = ""
698698
for key in subcomponents:
699+
if isinstance(subcomponents[key], OpenMLFlow):
700+
name = subcomponents[key].name
701+
elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed
702+
name = subcomponents[key]
699703
if key in subcomponents_explicit:
700-
sub_components_names += "," + key + "=" + subcomponents[key].name
704+
sub_components_names += "," + key + "=" + name
701705
else:
702-
sub_components_names += "," + subcomponents[key].name
706+
sub_components_names += "," + name
703707

704708
if sub_components_names:
705709
# slice operation on string in order to get rid of leading comma
@@ -771,6 +775,9 @@ def _get_external_version_string(
771775
external_versions.add(openml_version)
772776
external_versions.add(sklearn_version)
773777
for visitee in sub_components.values():
778+
# 'drop', 'passthrough', None can be passed as estimators
779+
if isinstance(visitee, str):
780+
continue
774781
for external_version in visitee.external_version.split(','):
775782
external_versions.add(external_version)
776783
return ','.join(list(sorted(external_versions)))
@@ -783,9 +790,12 @@ def _check_multiple_occurence_of_component_in_flow(
783790
to_visit_stack = [] # type: List[OpenMLFlow]
784791
to_visit_stack.extend(sub_components.values())
785792
known_sub_components = set() # type: Set[str]
793+
786794
while len(to_visit_stack) > 0:
787795
visitee = to_visit_stack.pop()
788-
if visitee.name in known_sub_components:
796+
if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators
797+
known_sub_components.add(visitee)
798+
elif visitee.name in known_sub_components:
789799
raise ValueError('Found a second occurence of component %s when '
790800
'trying to serialize %s.' % (visitee.name, model))
791801
else:
@@ -822,7 +832,7 @@ def _extract_information_from_model(
822832
def flatten_all(list_):
823833
""" Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]). """
824834
for el in list_:
825-
if isinstance(el, (list, tuple)):
835+
if isinstance(el, (list, tuple)) and len(el) > 0:
826836
yield from flatten_all(el)
827837
else:
828838
yield el
@@ -852,17 +862,31 @@ def flatten_all(list_):
852862
parameter_value = list() # type: List
853863
reserved_keywords = set(model.get_params(deep=False).keys())
854864

855-
for sub_component_tuple in rval:
865+
for i, sub_component_tuple in enumerate(rval):
856866
identifier = sub_component_tuple[0]
857867
sub_component = sub_component_tuple[1]
858-
sub_component_type = type(sub_component_tuple)
868+
# sub_component_type = type(sub_component_tuple)
859869
if not 2 <= len(sub_component_tuple) <= 3:
860870
# length 2 is for {VotingClassifier.estimators,
861871
# Pipeline.steps, FeatureUnion.transformer_list}
862872
# length 3 is for ColumnTransformer
863873
msg = 'Length of tuple does not match assumptions'
864874
raise ValueError(msg)
865-
if not isinstance(sub_component, (OpenMLFlow, type(None))):
875+
876+
if isinstance(sub_component, str):
877+
if sub_component != 'drop' and sub_component != 'passthrough':
878+
msg = 'Second item of tuple does not match assumptions. ' \
879+
'If string, can be only \'drop\' or \'passthrough\' but' \
880+
'got %s' % sub_component
881+
raise ValueError(msg)
882+
else:
883+
pass
884+
elif isinstance(sub_component, type(None)):
885+
msg = 'Cannot serialize objects of None type. Please use a valid ' \
886+
'placeholder for None. Note that empty sklearn estimators can be '\
887+
'replaced with \'drop\' or \'passthrough\'.'
888+
raise ValueError(msg)
889+
elif not isinstance(sub_component, OpenMLFlow):
866890
msg = 'Second item of tuple does not match assumptions. ' \
867891
'Expected OpenMLFlow, got %s' % type(sub_component)
868892
raise TypeError(msg)
@@ -875,31 +899,18 @@ def flatten_all(list_):
875899
identifier)
876900
raise PyOpenMLError(msg)
877901

878-
if sub_component is None:
879-
# In a FeatureUnion it is legal to have a None step
880-
881-
pv = [identifier, None]
882-
if sub_component_type is tuple:
883-
parameter_value.append(tuple(pv))
884-
else:
885-
parameter_value.append(pv)
886-
887-
else:
888-
# Add the component to the list of components, add a
889-
# component reference as a placeholder to the list of
890-
# parameters, which will be replaced by the real component
891-
# when deserializing the parameter
892-
sub_components_explicit.add(identifier)
893-
sub_components[identifier] = sub_component
894-
component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]]
895-
component_reference['oml-python:serialized_object'] = 'component_reference'
896-
cr_value = OrderedDict() # type: Dict[str, Any]
897-
cr_value['key'] = identifier
898-
cr_value['step_name'] = identifier
899-
if len(sub_component_tuple) == 3:
900-
cr_value['argument_1'] = sub_component_tuple[2]
901-
component_reference['value'] = cr_value
902-
parameter_value.append(component_reference)
902+
# when deserializing the parameter
903+
sub_components_explicit.add(identifier)
904+
sub_components[identifier] = sub_component
905+
component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]]
906+
component_reference['oml-python:serialized_object'] = 'component_reference'
907+
cr_value = OrderedDict() # type: Dict[str, Any]
908+
cr_value['key'] = identifier
909+
cr_value['step_name'] = identifier
910+
if len(sub_component_tuple) == 3:
911+
cr_value['argument_1'] = sub_component_tuple[2]
912+
component_reference['value'] = cr_value
913+
parameter_value.append(component_reference)
903914

904915
# Here (and in the elif and else branch below) are the only
905916
# places where we encode a value as json to make sure that all

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
import sklearn.preprocessing
3131
import sklearn.tree
3232
import sklearn.cluster
33-
33+
from sklearn.pipeline import make_pipeline
34+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
3435

3536
import openml
3637
from openml.extensions.sklearn import SklearnExtension
@@ -609,6 +610,8 @@ def test_serialize_column_transformer_pipeline(self):
609610
serialization2 = self.extension.model_to_flow(new_model)
610611
assert_flows_equal(serialization, serialization2)
611612

613+
@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
614+
reason="Pipeline processing behaviour updated")
612615
def test_serialize_feature_union(self):
613616
ohe_params = {'sparse': False}
614617
if LooseVersion(sklearn.__version__) >= "0.20":
@@ -675,16 +678,17 @@ def test_serialize_feature_union(self):
675678
self.assertEqual(new_model_params, fu_params)
676679
new_model.fit(self.X, self.y)
677680

678-
fu.set_params(scaler=None)
681+
fu.set_params(scaler='drop')
679682
serialization = self.extension.model_to_flow(fu)
680683
self.assertEqual(serialization.name,
681684
'sklearn.pipeline.FeatureUnion('
682-
'ohe=sklearn.preprocessing.{}.OneHotEncoder)'
685+
'ohe=sklearn.preprocessing.{}.OneHotEncoder,'
686+
'scaler=drop)'
683687
.format(module_name_encoder))
684688
new_model = self.extension.flow_to_model(serialization)
685689
self.assertEqual(type(new_model), type(fu))
686690
self.assertIsNot(new_model, fu)
687-
self.assertIs(new_model.transformer_list[1][1], None)
691+
self.assertIs(new_model.transformer_list[1][1], 'drop')
688692

689693
def test_serialize_feature_union_switched_names(self):
690694
ohe_params = ({'categories': 'auto'}
@@ -1778,3 +1782,66 @@ def test_trim_flow_name(self):
17781782

17791783
self.assertEqual("weka.IsolationForest",
17801784
SklearnExtension.trim_flow_name("weka.IsolationForest"))
1785+
1786+
@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21",
1787+
reason="SimpleImputer, ColumnTransformer available only after 0.19 and "
1788+
"Pipeline till 0.20 doesn't support indexing and 'passthrough'")
1789+
def test_run_on_model_with_empty_steps(self):
1790+
from sklearn.compose import ColumnTransformer
1791+
# testing 'drop', 'passthrough', None as non-actionable sklearn estimators
1792+
dataset = openml.datasets.get_dataset(128)
1793+
task = openml.tasks.get_task(59)
1794+
1795+
X, y, categorical_ind, feature_names = dataset.get_data(
1796+
target=dataset.default_target_attribute, dataset_format='array')
1797+
categorical_ind = np.array(categorical_ind)
1798+
cat_idx, = np.where(categorical_ind)
1799+
cont_idx, = np.where(~categorical_ind)
1800+
1801+
clf = make_pipeline(
1802+
ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
1803+
OneHotEncoder()), cat_idx.tolist()),
1804+
('cont', make_pipeline(SimpleImputer(strategy='median'),
1805+
StandardScaler()), cont_idx.tolist())])
1806+
)
1807+
1808+
clf = sklearn.pipeline.Pipeline([
1809+
('dummystep', 'passthrough'), # adding 'passthrough' as an estimator
1810+
('prep', clf),
1811+
('classifier', sklearn.svm.SVC(gamma='auto'))
1812+
])
1813+
1814+
# adding 'drop' to a ColumnTransformer
1815+
if not categorical_ind.any():
1816+
clf[1][0].set_params(cat='drop')
1817+
if not (~categorical_ind).any():
1818+
clf[1][0].set_params(cont='drop')
1819+
1820+
# serializing model with non-actionable step
1821+
run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True)
1822+
1823+
self.assertEqual(len(flow.components), 3)
1824+
self.assertEqual(flow.components['dummystep'], 'passthrough')
1825+
self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow))
1826+
self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow))
1827+
self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'],
1828+
OpenMLFlow))
1829+
self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'],
1830+
'drop')
1831+
1832+
# de-serializing flow to a model with non-actionable step
1833+
model = self.extension.flow_to_model(flow)
1834+
model.fit(X, y)
1835+
self.assertEqual(type(model), type(clf))
1836+
self.assertNotEqual(model, clf)
1837+
self.assertEqual(len(model.named_steps), 3)
1838+
self.assertEqual(model.named_steps['dummystep'], 'passthrough')
1839+
1840+
def test_sklearn_serialization_with_none_step(self):
1841+
msg = 'Cannot serialize objects of None type. Please use a valid ' \
1842+
'placeholder for None. Note that empty sklearn estimators can be ' \
1843+
'replaced with \'drop\' or \'passthrough\'.'
1844+
clf = sklearn.pipeline.Pipeline([('dummystep', None),
1845+
('classifier', sklearn.svm.SVC(gamma='auto'))])
1846+
with self.assertRaisesRegex(ValueError, msg):
1847+
self.extension.model_to_flow(clf)

0 commit comments

Comments (0)