Skip to content

Commit 7e6a545

Browse files
committed
FIX/ENH naming problem, add more asserts to unit tests
1 parent 6bec1fa commit 7e6a545

4 files changed

Lines changed: 85 additions & 28 deletions

File tree

openml/flows/functions.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from ..util import URLError
66

77

8-
def get_flow(flow_id):
8+
def get_flow(flow_id, converter=None):
99
"""Download the OpenML flow for a given flow ID.
1010
1111
Parameters
@@ -28,4 +28,9 @@ def get_flow(flow_id):
2828

2929
flow_dict = xmltodict.parse(flow_xml)
3030
flow = OpenMLFlow._from_xml(flow_dict)
31+
32+
if converter is not None:
33+
model = converter.deserialize_object(flow)
34+
flow.model = model
35+
3136
return flow

openml/flows/sklearn_converter.py

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,20 @@ def _serialize_model(self, model):
189189
for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
190190
rval = self.serialize_object(v)
191191

192+
193+
# In case of pipelines, or when having a component (for example
194+
# in the AdaBoostClassifier or the RandomizedSearchCV), the
195+
# parameters of that component are also returned by get_params()
196+
# This check makes sure that we only add the parameters for the
197+
# current component, and not the ones of the child component.
198+
# Parameters of child components will be added to the correct flow
199+
# when deserializing the subflow
200+
model_parameters = signature(model.__init__)
201+
if k not in model_parameters.parameters:
202+
continue
203+
192204
if isinstance(rval, (list, tuple)):
205+
193206
# Steps in a pipeline or feature union
194207
parameter_value = list()
195208
for identifier, sub_component in rval:
@@ -221,18 +234,6 @@ def _serialize_model(self, model):
221234
parameters[k] = parameter_value
222235

223236
elif isinstance(rval, OpenMLFlow):
224-
# Since serialize_object can return a Flow, we need to check
225-
# whether that flow represents a hyperparameter value of the
226-
# current flow or of a subcomponent. We only add it to the
227-
# parameters in the first case. In the second case, it will
228-
# be added to the correct flow when deserializing the subflow
229-
# (which happens either in the body of the if statement above
230-
# or in this body when the component is the value of a
231-
# hyperparameter, as it could be for example in the
232-
# AdaBoostClassifier.
233-
model_parameters = signature(model.__init__)
234-
if k not in model_parameters.parameters:
235-
continue
236237

237238
# A subcomponent, for example the base model in
238239
# AdaBoostClassifier
@@ -244,14 +245,6 @@ def _serialize_model(self, model):
244245
parameters[k] = json.dumps(component_reference)
245246

246247
else:
247-
# In case of pipelines, or when having a component (for example
248-
# in the AdaBoostClassifier or the RandomizedSearchCV), the
249-
# parameters of that component are also returned by get_params()
250-
# This check makes sure that we only add the parameters for the
251-
# current component, and not the ones of the child component.
252-
model_parameters = signature(model.__init__)
253-
if k not in model_parameters.parameters:
254-
continue
255248

256249
# a regular hyperparameter
257250
if not (hasattr(rval, '__len__') and len(rval) == 0):
@@ -289,7 +282,7 @@ def _serialize_model(self, model):
289282

290283
def _deserialize_model(self, flow, **kwargs):
291284

292-
model_name = flow.name
285+
model_name = flow._get_name()
293286
# Remove everything after the first bracket, it is not necessary for
294287
# creating the current flow
295288
pos = model_name.find('(')
@@ -434,6 +427,7 @@ def serialize_cross_validator(self, o):
434427
warnings.filters.pop(0)
435428

436429
if not (hasattr(value, '__len__') and len(value) == 0):
430+
value = json.dumps(value)
437431
parameters[key] = value
438432
else:
439433
parameters[key] = None

tests/flows/test_flow.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import xmltodict
88

99
import scipy.stats
10+
import sklearn.datasets
1011
import sklearn.dummy
1112
import sklearn.ensemble
1213
import sklearn.model_selection
@@ -123,17 +124,22 @@ def test_publish_flow(self, name_mock):
123124

124125
@mock.patch.object(openml.OpenMLFlow, '_get_name', autospec=True)
125126
def test_sklearn_to_upload_to_flow(self, name_mock):
127+
iris = sklearn.datasets.load_iris()
128+
X = iris.data
129+
y = iris.target
130+
126131
# Create a unique prefix for the flow. Necessary because the flow is
127132
# identified by its name and external version online. Having a unique
128133
# name allows us to publish the same flow in each test run
129134
md5 = hashlib.md5()
130135
md5.update(str(time.time()).encode('utf-8'))
131136
sentinel = md5.hexdigest()[:10]
137+
sentinel = 'TEST%s' % sentinel
132138
def side_effect(self):
133139
if sentinel in self.name:
134140
return self.name
135141
else:
136-
return 'TEST%s%s' % (sentinel, self.name)
142+
return '%s%s' % (sentinel, self.name)
137143
name_mock.side_effect = side_effect
138144

139145
# Test a more complicated flow
@@ -142,18 +148,32 @@ def side_effect(self):
142148
base_estimator=sklearn.tree.DecisionTreeClassifier())
143149
model = sklearn.pipeline.Pipeline(steps=(
144150
('scaler', scaler), ('boosting', boosting)))
145-
parameter_grid = {'n_estimators': [1, 5, 10, 100],
146-
'learning_rate': scipy.stats.uniform(0.01, 0.99),
147-
'base_estimator__max_depth': scipy.stats.randint(1, 10)}
151+
parameter_grid = {'boosting__n_estimators': [1, 5, 10, 100],
152+
'boosting__learning_rate': scipy.stats.uniform(0.01, 0.99),
153+
'boosting__base_estimator__max_depth': scipy.stats.randint(1, 10)}
154+
cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
148155
rs = sklearn.model_selection.RandomizedSearchCV(
149-
estimator=model, param_distributions=parameter_grid)
156+
estimator=model, param_distributions=parameter_grid, cv=cv)
157+
rs.fit(X, y)
150158
flow = openml.flows.create_flow_from_model(rs, SklearnToFlowConverter())
151159

152160
flow.publish()
153161
self.assertIsInstance(flow.flow_id, int)
154162

155163
# Check whether we can load the flow again
156-
new_flow = openml.flows.get_flow(flow_id=flow.flow_id)
164+
# Remove the sentinel from the name again so that we can reinstantiate
165+
# the object again
166+
def side_effect(self):
167+
if sentinel in self.name:
168+
name = self.name.replace(sentinel, '')
169+
return name
170+
else:
171+
return self.name
172+
name_mock.side_effect = side_effect
173+
174+
name_mock.side_effect = side_effect
175+
new_flow = openml.flows.get_flow(flow_id=flow.flow_id,
176+
converter=SklearnToFlowConverter())
157177

158178
local_xml = flow._to_xml()
159179
server_xml = new_flow._to_xml()
@@ -175,4 +195,14 @@ def side_effect(self):
175195

176196
self.assertEqual(new_flow, flow)
177197
self.assertIsNot(new_flow, flow)
198+
new_flow.model.fit(X, y)
199+
200+
fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
201+
'sklearn.model_selection._split.StratifiedKFold,' \
202+
'sklearn.pipeline.Pipeline(' \
203+
'sklearn.preprocessing.data.StandardScaler,' \
204+
'sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
205+
'sklearn.tree.tree.DecisionTreeClassifier)))'
206+
207+
self.assertEqual(new_flow._get_name(), fixture_name)
178208

tests/flows/test_sklearn.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@ def test_serialize_feature_union(self):
163163
fu = sklearn.pipeline.FeatureUnion(transformer_list=(('ohe', ohe),
164164
('scaler', scaler)))
165165
serialization = self.converter.serialize_object(fu)
166+
self.assertEqual(serialization.name,
167+
'sklearn.pipeline.FeatureUnion('
168+
'sklearn.preprocessing.data.OneHotEncoder,'
169+
'sklearn.preprocessing.data.StandardScaler)')
166170
new_model = self.converter.deserialize_object(serialization)
167171

168172
self.assertEqual(type(new_model), type(fu))
@@ -193,6 +197,30 @@ def test_serialize_feature_union(self):
193197
self.assertEqual(new_model_params, fu_params)
194198
new_model.fit(self.X, self.y)
195199

200+
def test_serialize_complex_flow(self):
201+
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
202+
203+
boosting = sklearn.ensemble.AdaBoostClassifier(
204+
base_estimator=sklearn.tree.DecisionTreeClassifier())
205+
model = sklearn.pipeline.Pipeline(steps=(
206+
('scaler', scaler), ('boosting', boosting)))
207+
parameter_grid = {'n_estimators': [1, 5, 10, 100],
208+
'learning_rate': scipy.stats.uniform(0.01, 0.99),
209+
'base_estimator__max_depth': scipy.stats.randint(1,
210+
10)}
211+
cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
212+
rs = sklearn.model_selection.RandomizedSearchCV(
213+
estimator=model, param_distributions=parameter_grid, cv=cv)
214+
serialized = self.converter.serialize_object(rs)
215+
216+
fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
217+
'sklearn.model_selection._split.StratifiedKFold,' \
218+
'sklearn.pipeline.Pipeline(' \
219+
'sklearn.preprocessing.data.StandardScaler,' \
220+
'sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
221+
'sklearn.tree.tree.DecisionTreeClassifier)))'
222+
self.assertEqual(serialized.name, fixture_name)
223+
196224
def test_serialize_type(self):
197225
supported_types = [float, np.float, np.float32, np.float64,
198226
int, np.int, np.int32, np.int64]

0 commit comments

Comments (0)