Skip to content

Commit dfff969

Browse files
committed
ADD propagate external version of subflows to parent flow
1 parent f2c7a42 commit dfff969

5 files changed

Lines changed: 58 additions & 8 deletions

File tree

openml/flows/sklearn_converter.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@ def _serialize_model(model):
257257

258258
# Create a flow name, which contains all components in brackets, for
259259
# example RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)),StandardScaler,AdaBoostClassifier(DecisionTreeClassifier))
260-
# TODO the name above is apparently wrong, I need to test and check this
261260
class_name = model.__module__ + "." + model.__class__.__name__
262261

263262
# will be part of the name (in brackets)
@@ -274,7 +273,24 @@ def _serialize_model(model):
274273
else:
275274
name = class_name
276275

277-
external_version = _get_external_version_info()
276+
# Get the external versions of all sub-components
277+
model_package_name = model.__module__.split('.')[0]
278+
module = importlib.import_module(model_package_name)
279+
model_package_version_number = module.__version__
280+
external_version = '%s==%s' % (model_package_name, model_package_version_number)
281+
282+
external_versions = set()
283+
external_versions.add(external_version)
284+
to_visit_stack = []
285+
to_visit_stack.extend(sub_components.values())
286+
while len(to_visit_stack) > 0:
287+
visitee = to_visit_stack.pop()
288+
for external_version in visitee.external_version.split(','):
289+
external_versions.add(external_version)
290+
to_visit_stack.extend(visitee.components.values())
291+
external_versions = list(sorted(external_versions))
292+
external_version = ','.join(external_versions)
293+
278294
flow = OpenMLFlow(name=name,
279295
class_name=class_name,
280296
description='Automatically created sub-component.',
@@ -470,7 +486,3 @@ def _deserialize_cross_validator(value, **kwargs):
470486
for parameter in parameters:
471487
parameters[parameter] = flow_to_sklearn(parameters[parameter])
472488
return model_class(**parameters)
473-
474-
475-
def _get_external_version_info():
476-
return 'sklearn_' + sklearn.__version__

tests/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Dummy to allow mock classes in the test files to have a version number for
2+
# their parent module
3+
__version__ = '0.1'
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Version of the dummy package, stored as a string like the version
# attributes of real packages (a float would render the same through '%s'
# but cannot express versions such as '1.1.0').
__version__ = '1.0'
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
class DummyRegressor(object):
    """Minimal estimator stub offering the scikit-learn estimator methods.

    The stub has no hyperparameters and learns nothing; it only provides
    ``fit``, ``predict``, ``get_params`` and ``set_params`` so that it can be
    placed inside scikit-learn meta-estimators in tests.
    """

    def fit(self, X, y):
        """Ignore the training data and return the estimator itself."""
        return self

    def predict(self, X):
        """Return the first column of ``X`` as the prediction."""
        return X[:, 0]

    def get_params(self, deep=False):
        """Return an empty dict because the stub has no hyperparameters."""
        return {}

    def set_params(self, params=None, **kwargs):
        """Accept and ignore any parameters, then return the estimator.

        ``params`` is kept as an optional positional argument for backward
        compatibility; keyword arguments are accepted because scikit-learn
        calls ``set_params(**params)``, which also covers the call without
        any arguments.
        """
        return self

tests/flows/test_sklearn.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from collections import OrderedDict
22
import json
3+
import os
4+
import sys
35
import unittest
46

57
import numpy as np
@@ -61,6 +63,7 @@ def test_serialize_model(self):
6163
serialization = sklearn_to_flow(model)
6264

6365
self.assertEqual(serialization.name, fixture_name)
66+
self.assertEqual(serialization.class_name, fixture_name)
6467
self.assertEqual(serialization.description, fixture_description)
6568
self.assertEqual(serialization.parameters, fixture_parameters)
6669

@@ -78,16 +81,21 @@ def test_serialize_model_with_subcomponent(self):
7881

7982
fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
8083
'(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
84+
fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
8185
fixture_description = 'Automatically created sub-component.'
86+
fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
8287

8388
serialization = sklearn_to_flow(model)
8489

8590
self.assertEqual(serialization.name, fixture_name)
91+
self.assertEqual(serialization.class_name, fixture_class_name)
8692
self.assertEqual(serialization.description, fixture_description)
8793
self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
8894
self.assertIsInstance(serialization.parameters['base_estimator'], str)
8995
self.assertEqual(serialization.parameters['learning_rate'], '1.0')
9096
self.assertEqual(serialization.parameters['n_estimators'], '100')
97+
self.assertEqual(serialization.components['base_estimator'].class_name,
98+
fixture_subcomponent_class_name)
9199

92100
new_model = flow_to_sklearn(serialization)
93101

@@ -403,7 +411,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
403411
"\('pca2', PCA\(copy=True, iterated_power='auto', " \
404412
"n_components=None, random_state=None,\n" \
405413
" svd_solver='auto', tol=0.0, whiten=False\)\)\)\)."
406-
#self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, pipeline)
414+
self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, pipeline)
407415

408416
fu = sklearn.pipeline.FeatureUnion((('pca1', pca), ('pca2', pca2)))
409417
fixture = "Found a second occurence of component sklearn.decomposition.pca.PCA when trying to serialize " \
@@ -416,7 +424,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
416424
" n_components=None, random_state=None,\n" \
417425
" svd_solver='auto', tol=0.0, whiten=False\)\)\),\n" \
418426
" transformer_weights=None\)."
419-
#self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, fu)
427+
self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, fu)
420428

421429
fs = sklearn.feature_selection.SelectKBest()
422430
fu2 = sklearn.pipeline.FeatureUnion((('pca1', pca), ('fs', fs)))
@@ -435,3 +443,17 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
435443
" n_components=None, random_state=None,\n" \
436444
" svd_solver='auto', tol=0.0, whiten=False\)\)\)\)."
437445
self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, pipeline2)
446+
447+
def test_subflow_version_change(self):
    """Check that a sub-component's package version reaches the parent flow.

    A pipeline containing a scikit-learn PCA and a ``dummy_learn`` regressor
    is serialized twice; between the two runs the ``dummy_learn`` version is
    changed and the parent flow's ``external_version`` must follow it.
    """
    this_directory = os.path.dirname(os.path.abspath(__file__))
    # Make the dummy package next to this test file importable, and undo
    # the path change afterwards so other tests see an unmodified sys.path.
    sys.path.append(this_directory)
    self.addCleanup(sys.path.remove, this_directory)
    import dummy_learn
    import dummy_learn.dummy_forest

    # Restore the dummy package's version after the test; the module object
    # stays cached in sys.modules and would otherwise keep the new value.
    self.addCleanup(setattr, dummy_learn, '__version__',
                    dummy_learn.__version__)

    # Build the expectation from the installed scikit-learn instead of
    # pinning one specific release.
    sklearn_version = 'sklearn==%s' % sklearn.__version__

    pca = sklearn.decomposition.PCA()
    dummy = dummy_learn.dummy_forest.DummyRegressor()
    pipeline = sklearn.pipeline.Pipeline((('pca', pca), ('dummy', dummy)))

    flow = sklearn_to_flow(pipeline)
    self.assertEqual(flow.external_version,
                     'dummy_learn==1.0,' + sklearn_version)

    dummy_learn.__version__ = '1.1.0'
    flow = sklearn_to_flow(pipeline)
    self.assertEqual(flow.external_version,
                     'dummy_learn==1.1.0,' + sklearn_version)

0 commit comments

Comments
 (0)