Skip to content

Commit f4df535

Browse files
authored
Merge pull request #222 from openml/issue209
Issue209
2 parents a9ca0b8 + 076ef50 commit f4df535

4 files changed

Lines changed: 61 additions & 10 deletions

File tree

openml/flows/sklearn_converter.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
# Necessary to have signature available in python 2.7
2020
from sklearn.utils.fixes import signature
2121

22-
from .flow import OpenMLFlow
22+
from openml.flows import OpenMLFlow
23+
from openml.exceptions import PyOpenMLError
2324

2425

2526
if sys.version_info >= (3, 5):
@@ -32,37 +33,45 @@
3233
'^(?P<name>[\w\-]+)((?P<operation>==|>=|>)(?P<version>(\d+\.)?(\d+\.)?(\d+)))?$')
3334

3435

35-
def sklearn_to_flow(o):
36+
def sklearn_to_flow(o, parent_model=None):
37+
# TODO: assert that only on first recursion lvl `parent_model` can be None
3638

3739
if _is_estimator(o):
40+
# is the main model or a submodel
3841
rval = _serialize_model(o)
3942
elif isinstance(o, (list, tuple)):
40-
rval = [sklearn_to_flow(element) for element in o]
43+
# TODO: explain what type of parameter is here
44+
rval = [sklearn_to_flow(element, parent_model) for element in o]
4145
if isinstance(o, tuple):
4246
rval = tuple(rval)
4347
elif isinstance(o, (bool, int, float, six.string_types)) or o is None:
48+
# base parameter values
4449
rval = o
4550
elif isinstance(o, dict):
51+
# TODO: explain what type of parameter is here
4652
rval = OrderedDict()
4753
for key, value in o.items():
4854
if not isinstance(key, six.string_types):
4955
raise TypeError('Can only use string as keys, you passed '
5056
'type %s for value %s.' %
5157
(type(key), str(key)))
52-
key = sklearn_to_flow(key)
53-
value = sklearn_to_flow(value)
58+
key = sklearn_to_flow(key, parent_model)
59+
value = sklearn_to_flow(value, parent_model)
5460
rval[key] = value
5561
rval = rval
5662
elif isinstance(o, type):
63+
# TODO: explain what type of parameter is here
5764
rval = serialize_type(o)
5865
elif isinstance(o, scipy.stats.distributions.rv_frozen):
5966
rval = serialize_rv_frozen(o)
6067
# This only works for user-defined functions (and not even partial).
6168
# I think this is exactly what we want here as there shouldn't be any
6269
# built-in or functool.partials in a pipeline
6370
elif inspect.isfunction(o):
71+
# TODO: explain what type of parameter is here
6472
rval = serialize_function(o)
6573
elif _is_cross_validator(o):
74+
# TODO: explain what type of parameter is here
6675
rval = _serialize_cross_validator(o)
6776
else:
6877
raise TypeError(o, type(o))
@@ -256,18 +265,26 @@ def _extract_information_from_model(model):
256265

257266
model_parameters = model.get_params(deep=False)
258267
for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
259-
rval = sklearn_to_flow(v)
268+
rval = sklearn_to_flow(v, model)
260269

261270
if (isinstance(rval, (list, tuple)) and len(rval) > 0 and
262271
isinstance(rval[0], (list, tuple)) and
263272
[type(rval[0]) == type(rval[i]) for i in range(len(rval))]):
264273

265-
# Steps in a pipeline or feature union
274+
# Steps in a pipeline or feature union, or base classifiers in voting classifier
266275
parameter_value = list()
276+
reserved_keywords = set(model.get_params(deep=False).keys())
277+
267278
for sub_component_tuple in rval:
268279
identifier, sub_component = sub_component_tuple
269280
sub_component_type = type(sub_component_tuple)
270281

282+
if identifier in reserved_keywords:
283+
parent_model_name = model.__module__ + "." + \
284+
model.__class__.__name__
285+
raise PyOpenMLError('Found element shadowing official ' + \
286+
'parameter for %s: %s' % (parent_model_name, identifier))
287+
271288
if sub_component is None:
272289
# In a FeatureUnion it is legal to have a None step
273290

@@ -310,7 +327,7 @@ def _extract_information_from_model(model):
310327
component_reference[
311328
'oml-python:serialized_object'] = 'component_reference'
312329
component_reference['value'] = OrderedDict(key=k, step_name=None)
313-
component_reference = sklearn_to_flow(component_reference)
330+
component_reference = sklearn_to_flow(component_reference, model)
314331
parameters[k] = json.dumps(component_reference)
315332

316333
else:

openml/runs/run.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ def get_flow_dict(_flow):
231231
elif flow.name.startswith("sklearn.pipeline.FeatureUnion"):
232232
# tolerate
233233
pass
234+
elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"):
235+
# tolerate
236+
pass
234237
else:
235238
raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name))
236239

tests/test_flows/test_sklearn.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
import sklearn.preprocessing
2525
import sklearn.tree
2626

27-
import openml
2827
from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
2928
from openml.flows.sklearn_converter import _format_external_version, _check_dependencies
29+
from openml.exceptions import PyOpenMLError
3030

3131
this_directory = os.path.dirname(os.path.abspath(__file__))
3232
sys.path.append(this_directory)
@@ -524,3 +524,34 @@ def test_check_dependencies(self):
524524
dependencies = ['sklearn==0.1', 'sklearn>=99.99.99', 'sklearn>99.99.99']
525525
for dependency in dependencies:
526526
self.assertRaises(ValueError, _check_dependencies, dependency)
527+
528+
def test_illegal_parameter_names(self):
529+
# illegal name: estimators
530+
clf1 = sklearn.ensemble.VotingClassifier(
531+
estimators=[('estimators', sklearn.ensemble.RandomForestClassifier()),
532+
('whatevs', sklearn.ensemble.ExtraTreesClassifier())])
533+
clf2 = sklearn.ensemble.VotingClassifier(
534+
estimators=[('whatevs', sklearn.ensemble.RandomForestClassifier()),
535+
('estimators', sklearn.ensemble.ExtraTreesClassifier())])
536+
cases = [clf1, clf2]
537+
538+
for case in cases:
539+
self.assertRaises(PyOpenMLError, sklearn_to_flow, case)
540+
541+
def test_illegal_parameter_names_pipeline(self):
542+
# illegal name: steps
543+
steps = [
544+
('Imputer', sklearn.preprocessing.Imputer(strategy='median')),
545+
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
546+
('steps', sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier))
547+
]
548+
self.assertRaises(ValueError, sklearn.pipeline.Pipeline, steps=steps)
549+
550+
551+
def test_illegal_parameter_names_featureunion(self):
552+
# illegal name: transformer_list
553+
transformer_list = [
554+
('transformer_list', sklearn.preprocessing.Imputer(strategy='median')),
555+
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
556+
]
557+
self.assertRaises(ValueError, sklearn.pipeline.FeatureUnion, transformer_list=transformer_list)

tests/test_runs/test_run_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def test_get_runs_list_by_uploader(self):
205205
uploader_ids = [29]
206206

207207
runs = openml.runs.list_runs(uploader=uploader_ids)
208-
self.assertGreaterEqual(len(runs), 3)
208+
self.assertGreaterEqual(len(runs), 2)
209209
for rid in runs:
210210
self.assertIn(runs[rid]['uploader'], uploader_ids)
211211
self._check_run(runs[rid])

0 commit comments

Comments
 (0)