Skip to content

Commit bd3a12b

Browse files
authored
Merge pull request #348 from openml/clust
clustering unit test
2 parents 9664a0f + da5bb80 commit bd3a12b

1 file changed

Lines changed: 101 additions & 1 deletion

File tree

tests/test_flows/test_sklearn.py

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import sklearn.pipeline
2525
import sklearn.preprocessing
2626
import sklearn.tree
27+
import sklearn.cluster
2728

2829
import openml
2930
from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
@@ -100,6 +101,47 @@ def test_serialize_model(self, check_dependencies_mock):
100101

101102
self.assertEqual(check_dependencies_mock.call_count, 1)
102103

104+
105+
@mock.patch('openml.flows.sklearn_converter._check_dependencies')
def test_serialize_model_clustering(self, check_dependencies_mock):
    # Round-trip a default KMeans estimator through sklearn_to_flow and
    # flow_to_sklearn, checking the flow's metadata and the rebuilt
    # estimator. _check_dependencies is replaced by a mock for the
    # duration of the test; its call count is verified at the end.
    model = sklearn.cluster.KMeans()

    # Expected flow metadata.
    expected_name = 'sklearn.cluster.k_means_.KMeans'
    expected_description = 'Automatically created scikit-learn flow.'
    expected_dependencies = (
        'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' % sklearn.__version__
    )
    # Parameter values are compared in their serialized string form.
    expected_parameters = OrderedDict([
        ('algorithm', '"auto"'),
        ('copy_x', 'true'),
        ('init', '"k-means++"'),
        ('max_iter', '300'),
        ('n_clusters', '8'),
        ('n_init', '10'),
        ('n_jobs', '1'),
        ('precompute_distances', '"auto"'),
        ('random_state', 'null'),
        ('tol', '0.0001'),
        ('verbose', '0'),
    ])

    flow = sklearn_to_flow(model)

    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.class_name, expected_name)
    self.assertEqual(flow.description, expected_description)
    self.assertEqual(flow.parameters, expected_parameters)
    self.assertEqual(flow.dependencies, expected_dependencies)

    rebuilt = flow_to_sklearn(flow)

    # Same class, but a distinct object from the one that was serialized.
    self.assertEqual(type(rebuilt), type(model))
    self.assertIsNot(rebuilt, model)

    self.assertEqual(rebuilt.get_params(), model.get_params())
    # The rebuilt estimator must still be fittable; no labels are passed.
    rebuilt.fit(self.X)

    # Exactly one dependency check is expected over the whole round trip.
    self.assertEqual(check_dependencies_mock.call_count, 1)
143+
144+
103145
def test_serialize_model_with_subcomponent(self):
104146
model = sklearn.ensemble.AdaBoostClassifier(
105147
n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier())
@@ -202,6 +244,64 @@ def test_serialize_pipeline(self):
202244
self.assertEqual(new_model_params, fu_params)
203245
new_model.fit(self.X, self.y)
204246

247+
def test_serialize_pipeline_clustering(self):
    # Round-trip a two-step Pipeline (StandardScaler -> KMeans) through
    # sklearn_to_flow and flow_to_sklearn, checking the flow's name,
    # parameters and sub-components as well as the rebuilt pipeline.
    scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    km = sklearn.cluster.KMeans()
    pipeline_steps = (('scaler', scaler), ('clusterer', km))
    model = sklearn.pipeline.Pipeline(steps=pipeline_steps)

    expected_name = (
        'sklearn.pipeline.Pipeline('
        'scaler=sklearn.preprocessing.data.StandardScaler,'
        'clusterer=sklearn.cluster.k_means_.KMeans)'
    )
    expected_description = 'Automatically created scikit-learn flow.'

    flow = sklearn_to_flow(model)

    self.assertEqual(flow.name, expected_name)
    self.assertEqual(flow.description, expected_description)

    # The pipeline's only parameter is 'steps'; each entry is a reference
    # to a sub-component by key, not the sub-flow itself.
    self.assertEqual(len(flow.parameters), 1)
    # The serialized value is a JSON string; decoding it before comparing
    # keeps the check independent of dictionary key ordering.
    expected_steps = [
        {
            'oml-python:serialized_object': 'component_reference',
            'value': {'key': 'scaler', 'step_name': 'scaler'},
        },
        {
            'oml-python:serialized_object': 'component_reference',
            'value': {'key': 'clusterer', 'step_name': 'clusterer'},
        },
    ]
    self.assertEqual(json.loads(flow.parameters['steps']), expected_steps)

    # Both steps must be present as flows in their own right.
    self.assertEqual(len(flow.components), 2)
    for component_key in ('scaler', 'clusterer'):
        self.assertIsInstance(flow.components[component_key], OpenMLFlow)

    rebuilt = flow_to_sklearn(flow)

    # Same class, but a distinct object from the one that was serialized.
    self.assertEqual(type(rebuilt), type(model))
    self.assertIsNot(rebuilt, model)

    # Step names survive the round trip; the step estimators are new objects.
    self.assertEqual([name for name, _ in rebuilt.steps],
                     [name for name, _ in model.steps])
    self.assertIsNot(rebuilt.steps[0][1], model.steps[0][1])
    self.assertIsNot(rebuilt.steps[1][1], model.steps[1][1])

    # Remove the entries holding the step objects before comparing the
    # remaining parameters; those objects were just asserted to be
    # different instances.
    object_valued_keys = ('scaler', 'clusterer', 'steps')
    rebuilt_params = rebuilt.get_params()
    for key in object_valued_keys:
        del rebuilt_params[key]
    original_params = model.get_params()
    for key in object_valued_keys:
        del original_params[key]

    self.assertEqual(rebuilt_params, original_params)
    # The rebuilt pipeline must still be fittable.
    rebuilt.fit(self.X, self.y)
304+
205305
def test_serialize_feature_union(self):
206306
ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
207307
scaler = sklearn.preprocessing.StandardScaler()
@@ -597,4 +697,4 @@ def test_paralizable_check(self):
597697
self.assertTrue(_check_n_jobs(legal_models[i]) == answers[i])
598698

599699
for i in range(len(illegal_models)):
600-
self.assertRaises(PyOpenMLError, _check_n_jobs, illegal_models[i])
700+
self.assertRaises(PyOpenMLError, _check_n_jobs, illegal_models[i])

0 commit comments

Comments
 (0)