Skip to content

Commit 929fec1

Browse files
committed
extended the sklearn_flow conversion method with the notion of parent model. This way, we can check whether a user-defined keyword (e.g., the steps in a pipeline) does not shadow a model parameter
1 parent 565cc23 commit 929fec1

2 files changed

Lines changed: 45 additions & 8 deletions

File tree

openml/flows/sklearn_converter.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
# Necessary to have signature available in python 2.7
2020
from sklearn.utils.fixes import signature
2121

22-
from .flow import OpenMLFlow
22+
from openml.flows import OpenMLFlow
23+
from openml.exceptions import PyOpenMLError
2324

2425

2526
if sys.version_info >= (3, 5):
@@ -32,37 +33,52 @@
3233
'^(?P<name>[\w\-]+)((?P<operation>==|>=|>)(?P<version>(\d+\.)?(\d+\.)?(\d+)))?$')
3334

3435

35-
def sklearn_to_flow(o):
36+
def sklearn_to_flow(o, parent_model=None):
37+
# TODO: assert that only on first recursion lvl `parent_model` can be None
3638

3739
if _is_estimator(o):
40+
# is the main model or a submodel
3841
rval = _serialize_model(o)
3942
elif isinstance(o, (list, tuple)):
40-
rval = [sklearn_to_flow(element) for element in o]
43+
# TODO: explain what type of parameter is here
44+
rval = [sklearn_to_flow(element, parent_model) for element in o]
4145
if isinstance(o, tuple):
46+
assert(parent_model is not None)
47+
reserved_keywords = set(parent_model.get_params(deep=False).keys())
48+
if o[0] in reserved_keywords:
49+
parent_model_name = parent_model.__module__ + "." + \
50+
parent_model.__class__.__name__
51+
raise PyOpenMLError('Found element shadowing official ' + \
52+
'parameter for %s: %s' %(parent_model_name, o[0]))
4253
rval = tuple(rval)
4354
elif isinstance(o, (bool, int, float, six.string_types)) or o is None:
55+
# base parameter values
4456
rval = o
4557
elif isinstance(o, dict):
58+
# TODO: explain what type of parameter is here
4659
rval = OrderedDict()
4760
for key, value in o.items():
4861
if not isinstance(key, six.string_types):
4962
raise TypeError('Can only use string as keys, you passed '
5063
'type %s for value %s.' %
5164
(type(key), str(key)))
52-
key = sklearn_to_flow(key)
53-
value = sklearn_to_flow(value)
65+
key = sklearn_to_flow(key, parent_model)
66+
value = sklearn_to_flow(value, parent_model)
5467
rval[key] = value
5568
rval = rval
5669
elif isinstance(o, type):
70+
# TODO: explain what type of parameter is here
5771
rval = serialize_type(o)
5872
elif isinstance(o, scipy.stats.distributions.rv_frozen):
5973
rval = serialize_rv_frozen(o)
6074
# This only works for user-defined functions (and not even partial).
6175
# I think this is exactly what we want here as there shouldn't be any
6276
# built-in or functool.partials in a pipeline
6377
elif inspect.isfunction(o):
78+
# TODO: explain what type of parameter is here
6479
rval = serialize_function(o)
6580
elif _is_cross_validator(o):
81+
# TODO: explain what type of parameter is here
6682
rval = _serialize_cross_validator(o)
6783
else:
6884
raise TypeError(o, type(o))
@@ -256,7 +272,7 @@ def _extract_information_from_model(model):
256272

257273
model_parameters = model.get_params(deep=False)
258274
for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
259-
rval = sklearn_to_flow(v)
275+
rval = sklearn_to_flow(v, model)
260276

261277
if (isinstance(rval, (list, tuple)) and len(rval) > 0 and
262278
isinstance(rval[0], (list, tuple)) and
@@ -310,7 +326,7 @@ def _extract_information_from_model(model):
310326
component_reference[
311327
'oml-python:serialized_object'] = 'component_reference'
312328
component_reference['value'] = OrderedDict(key=k, step_name=None)
313-
component_reference = sklearn_to_flow(component_reference)
329+
component_reference = sklearn_to_flow(component_reference, model)
314330
parameters[k] = json.dumps(component_reference)
315331

316332
else:

tests/test_flows/test_sklearn.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
import sklearn.preprocessing
2525
import sklearn.tree
2626

27-
import openml
2827
from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
2928
from openml.flows.sklearn_converter import _format_external_version, _check_dependencies
29+
from openml.exceptions import PyOpenMLError
3030

3131
this_directory = os.path.dirname(os.path.abspath(__file__))
3232
sys.path.append(this_directory)
@@ -524,3 +524,24 @@ def test_check_dependencies(self):
524524
dependencies = ['sklearn==0.1', 'sklearn>=99.99.99', 'sklearn>99.99.99']
525525
for dependency in dependencies:
526526
self.assertRaises(ValueError, _check_dependencies, dependency)
527+
528+
def test_illegal_parameter_names(self):
529+
# illegal name: estimators
530+
clf1 = sklearn.ensemble.VotingClassifier(
531+
estimators=[('estimators', sklearn.ensemble.RandomForestClassifier()),
532+
('whatevs', sklearn.ensemble.ExtraTreesClassifier())])
533+
534+
535+
cases = [clf1]
536+
537+
for case in cases:
538+
self.assertRaises(PyOpenMLError, sklearn_to_flow, case)
539+
540+
def test_illegal_parameter_names_sklearn(self):
541+
# illegal name: steps
542+
steps = [
543+
('Imputer', sklearn.preprocessing.Imputer(strategy='median')),
544+
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
545+
('steps', sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier))
546+
]
547+
self.assertRaises(ValueError, sklearn.pipeline.Pipeline, {'steps': steps })

0 commit comments

Comments
 (0)