2424import sklearn .pipeline
2525import sklearn .preprocessing
2626import sklearn .tree
27+ import sklearn .cluster
2728
2829import openml
2930from openml .flows import OpenMLFlow , sklearn_to_flow , flow_to_sklearn
@@ -100,6 +101,47 @@ def test_serialize_model(self, check_dependencies_mock):
100101
101102 self .assertEqual (check_dependencies_mock .call_count , 1 )
102103
104+
@mock.patch('openml.flows.sklearn_converter._check_dependencies')
def test_serialize_model_clustering(self, check_dependencies_mock):
    """Round-trip a default KMeans estimator through the flow converter.

    The estimator is serialized with ``sklearn_to_flow``, the resulting
    flow is compared field by field against hand-written fixtures, and
    the flow is then turned back into an estimator with
    ``flow_to_sklearn``. The rebuilt estimator must be a distinct object
    of the same type with identical parameters, and must be fittable.

    ``check_dependencies_mock`` stands in for
    ``openml.flows.sklearn_converter._check_dependencies`` and is
    expected to have been called exactly once by the end of the test.
    """
    kmeans = sklearn.cluster.KMeans()

    expected_name = 'sklearn.cluster.k_means_.KMeans'
    expected_description = 'Automatically created scikit-learn flow.'
    # NOTE(review): the whitespace following each "\n" is kept exactly as
    # found; confirm it matches the dependency string the converter emits.
    expected_dependencies = (
        'sklearn==%s\n numpy>=1.6.1\n scipy>=0.9' % sklearn.__version__
    )
    # Parameter values are compared in the string form stored on the flow,
    # and in this exact order.
    expected_parameters = OrderedDict([
        ('algorithm', '"auto"'),
        ('copy_x', 'true'),
        ('init', '"k-means++"'),
        ('max_iter', '300'),
        ('n_clusters', '8'),
        ('n_init', '10'),
        ('n_jobs', '1'),
        ('precompute_distances', '"auto"'),
        ('random_state', 'null'),
        ('tol', '0.0001'),
        ('verbose', '0'),
    ])

    flow = sklearn_to_flow(kmeans)

    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.class_name, expected_name)
    self.assertEqual(flow.description, expected_description)
    self.assertEqual(flow.parameters, expected_parameters)
    self.assertEqual(flow.dependencies, expected_dependencies)

    restored = flow_to_sklearn(flow)

    # Same class and same hyperparameters, but a freshly built object.
    self.assertEqual(type(restored), type(kmeans))
    self.assertIsNot(restored, kmeans)
    self.assertEqual(restored.get_params(), kmeans.get_params())

    # Clustering is fitted on the features only.
    restored.fit(self.X)

    self.assertEqual(check_dependencies_mock.call_count, 1)
143+
144+
103145 def test_serialize_model_with_subcomponent (self ):
104146 model = sklearn .ensemble .AdaBoostClassifier (
105147 n_estimators = 100 , base_estimator = sklearn .tree .DecisionTreeClassifier ())
@@ -202,6 +244,64 @@ def test_serialize_pipeline(self):
202244 self .assertEqual (new_model_params , fu_params )
203245 new_model .fit (self .X , self .y )
204246
def test_serialize_pipeline_clustering(self):
    """Round-trip a StandardScaler + KMeans pipeline through the converter.

    The pipeline is serialized with ``sklearn_to_flow``; the flow's name,
    description, ``steps`` parameter and sub-components are checked. The
    flow is then rebuilt with ``flow_to_sklearn`` and the result must be
    a new pipeline with new step objects, the same step names and the
    same top-level parameters, and it must be fittable.
    """
    step_names = ('scaler', 'clusterer')
    standardizer = sklearn.preprocessing.StandardScaler(with_mean=False)
    kmeans = sklearn.cluster.KMeans()
    pipeline = sklearn.pipeline.Pipeline(
        steps=(('scaler', standardizer), ('clusterer', kmeans)))

    expected_name = ('sklearn.pipeline.Pipeline('
                     'scaler=sklearn.preprocessing.data.StandardScaler,'
                     'clusterer=sklearn.cluster.k_means_.KMeans)')
    expected_description = 'Automatically created scikit-learn flow.'

    flow = sklearn_to_flow(pipeline)

    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.description, expected_description)

    # The pipeline flow carries a single parameter, 'steps', whose entries
    # only reference the sub-flows by key instead of embedding them.
    self.assertEqual(len(flow.parameters), 1)
    expected_steps = [
        {'oml-python:serialized_object': 'component_reference',
         'value': {'key': step_name, 'step_name': step_name}}
        for step_name in step_names
    ]
    # Decode the JSON before comparing so that dictionary key ordering in
    # the serialized text cannot cause a spurious mismatch.
    self.assertEqual(json.loads(flow.parameters['steps']), expected_steps)

    # Each pipeline step must have been serialized as its own flow.
    self.assertEqual(len(flow.components), 2)
    for step_name in step_names:
        self.assertIsInstance(flow.components[step_name], OpenMLFlow)

    restored = flow_to_sklearn(flow)

    self.assertEqual(type(restored), type(pipeline))
    self.assertIsNot(restored, pipeline)

    self.assertEqual([name for name, _ in restored.steps],
                     [name for name, _ in pipeline.steps])
    # The step estimators must be rebuilt, not shared with the original.
    self.assertIsNot(restored.steps[0][1], pipeline.steps[0][1])
    self.assertIsNot(restored.steps[1][1], pipeline.steps[1][1])

    # Entries holding estimator objects are removed before comparing; the
    # objects were already checked above to be distinct instances.
    object_valued_keys = ('scaler', 'clusterer', 'steps')
    restored_params = restored.get_params()
    for key in object_valued_keys:
        del restored_params[key]
    original_params = pipeline.get_params()
    for key in object_valued_keys:
        del original_params[key]

    self.assertEqual(restored_params, original_params)
    restored.fit(self.X, self.y)
304+
205305 def test_serialize_feature_union (self ):
206306 ohe = sklearn .preprocessing .OneHotEncoder (sparse = False )
207307 scaler = sklearn .preprocessing .StandardScaler ()
@@ -597,4 +697,4 @@ def test_paralizable_check(self):
597697 self .assertTrue (_check_n_jobs (legal_models [i ]) == answers [i ])
598698
599699 for i in range (len (illegal_models )):
600- self .assertRaises (PyOpenMLError , _check_n_jobs , illegal_models [i ])
700+ self .assertRaises (PyOpenMLError , _check_n_jobs , illegal_models [i ])
0 commit comments