Skip to content

Commit faf5b26

Browse files
committed
work on comments from Jan
1 parent 7064899 commit faf5b26

5 files changed

Lines changed: 60 additions & 17 deletions

File tree

openml/flows/flow.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,10 +324,15 @@ def _from_dict(cls, xml_dict):
324324
arguments['model'] = None
325325
flow = cls(**arguments)
326326

327-
if arguments['external_version'].startswith('sklearn'):
327+
# try to parse to a model because not everything that can be
328+
# deserialized has to come from scikit-learn. If it can't be
329+
# deserialized, but comes from scikit-learn, this is worth an exception
330+
try:
328331
from .sklearn_converter import flow_to_sklearn
329332
model = flow_to_sklearn(flow)
330-
else:
333+
except Exception as e:
334+
if arguments['external_version'].startswith('sklearn'):
335+
raise e
331336
model = None
332337
flow.model = model
333338

openml/runs/functions.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
3131
seed=None):
32+
"""See ``run_flow_on_task`` for documentation."""
33+
3234
flow = sklearn_to_flow(model)
3335

3436
return run_flow_on_task(task=task, flow=flow,
@@ -38,23 +40,29 @@ def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
3840

3941
def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
4042
seed=None):
41-
"""Performs a CV run on the dataset of the given task, using the split.
43+
"""Run the model provided by the flow on the dataset defined by task.
44+
45+
Takes the flow and repeat information into account. In case a flow is not
46+
yet published, it is published after executing the run (requires
47+
internet connection).
4248
4349
Parameters
4450
----------
4551
task : OpenMLTask
4652
Task to perform.
4753
model : sklearn model
48-
a model which has a function fit(X,Y) and predict(X),
54+
A model which has a function fit(X,Y) and predict(X),
4955
all supervised estimators of scikit learn follow this definition of a model [1]
5056
[1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
5157
avoid_duplicate_runs : bool
52-
if this flag is set to True, the run will throw an error if the
53-
setup/task combination is already present on the server.
58+
If this flag is set to True, the run will throw an error if the
59+
setup/task combination is already present on the server. Works only
60+
if the flow is already published on the server. This feature requires an
61+
internet connection.
5462
flow_tags : list(str)
55-
a list of tags that the flow should have at creation
63+
A list of tags that the flow should have at creation.
5664
seed: int
57-
the models that are not seeded will get this seed
65+
Models that are not seeded will get this seed.
5866
5967
Returns
6068
-------

openml/runs/run.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,17 @@ def extract_parameters(_flow, _flow_dict, component_model,
190190
# _flow is openml flow object, _param dict maps from flow name to flow id
191191
# for the main call, the param dict can be overridden (useful for unit tests / sentinels)
192192
# this way, for flows without subflows we do not have to rely on _flow_dict
193+
expected_parameters = set(_flow.parameters)
194+
expected_components = set(_flow.components)
195+
model_parameters = set([mp for mp in component_model.get_params()
196+
if '__' not in mp])
197+
if len((expected_parameters | expected_components) ^ model_parameters) != 0:
198+
raise ValueError('Parameters of the model do not match the '
199+
'parameters expected by the '
200+
'flow:\nexpected flow parameters: '
201+
'%s\nmodel parameters: %s' % (
202+
sorted(expected_parameters| expected_components), sorted(model_parameters)))
203+
193204
_params = []
194205
for _param_name in _flow.parameters:
195206
_current = OrderedDict()
@@ -198,7 +209,9 @@ def extract_parameters(_flow, _flow_dict, component_model,
198209
_tmp = openml.flows.sklearn_to_flow(
199210
component_model.get_params()[_param_name])
200211

201-
# Try to filter out components which are handled further down!
212+
# Try to filter out components (a.k.a. subflows) which are
213+
# handled further down in the code (by recursively calling
214+
# this function)!
202215
if isinstance(_tmp, openml.flows.OpenMLFlow):
203216
continue
204217
try:
@@ -210,7 +223,19 @@ def extract_parameters(_flow, _flow_dict, component_model,
210223
# Object of type 'OpenMLFlow' is not JSON serializable
211224
if 'OpenMLFlow' in e.args[0] and \
212225
'is not JSON serializable' in e.args[0]:
226+
# Additional check that the parameter that could not
227+
# be parsed is actually a list/tuple which is used
228+
# inside a feature union or pipeline
229+
if not isinstance(_tmp, (list, tuple)):
230+
raise e
231+
for step_name, step in _tmp:
232+
if isinstance(step_name, openml.flows.OpenMLFlow):
233+
raise e
234+
elif not isinstance(step, openml.flows.OpenMLFlow):
235+
raise e
213236
continue
237+
else:
238+
raise e
214239

215240
_current['oml:value'] = _tmp
216241
if _main_call:

openml/setups/functions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,14 @@ def _reconstruct_flow(_flow, _params):
110110
# dicts, mapping from flow id to param name to param value
111111
# (obtained by using the subfunction _to_dict_of_dicts)
112112
for _param in _flow.parameters:
113+
# It can happen that no parameters of a flow are in a setup,
114+
# then the flow_id is not in _params; usually happens for a
115+
# sklearn.pipeline.Pipeline object, where the steps parameter is
116+
# not in the setup
113117
if _flow.flow_id not in _params:
114118
continue
119+
# It is not guaranteed that a setup on OpenML has all parameter
120+
# settings of a flow, thus a param need not be in _params!
115121
if _param not in _params[_flow.flow_id]:
116122
continue
117123
_flow.parameters[_param] = _params[_flow.flow_id][_param]

tests/test_runs/test_run_functions.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -229,16 +229,16 @@ def test_run_and_upload(self):
229229
num_iterations = 5 # for base search classifiers
230230

231231
clfs = []
232-
random_state_values = []
232+
random_state_fixtures = []
233233

234234
lr = LogisticRegression()
235235
clfs.append(lr)
236-
random_state_values.append('62501')
236+
random_state_fixtures.append('62501')
237237

238238
pipeline1 = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
239239
('dummy', DummyClassifier(strategy='prior'))])
240240
clfs.append(pipeline1)
241-
random_state_values.append('62501')
241+
random_state_fixtures.append('62501')
242242

243243
pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
244244
('VarianceThreshold', VarianceThreshold()),
@@ -248,13 +248,13 @@ def test_run_and_upload(self):
248248
'min_samples_leaf': [2 ** x for x in range(0, 6 + 1)]},
249249
cv=3, n_iter=10))])
250250
clfs.append(pipeline2)
251-
random_state_values.append('62501')
251+
random_state_fixtures.append('62501')
252252

253253
gridsearch = GridSearchCV(BaggingClassifier(base_estimator=SVC()),
254254
{"base_estimator__C": [0.01, 0.1, 10],
255255
"base_estimator__gamma": [0.01, 0.1, 10]})
256256
clfs.append(gridsearch)
257-
random_state_values.append('62501')
257+
random_state_fixtures.append('62501')
258258

259259
randomsearch = RandomizedSearchCV(
260260
RandomForestClassifier(n_estimators=5),
@@ -271,9 +271,9 @@ def test_run_and_upload(self):
271271
# The random state for the RandomizedSearchCV is set after the
272272
# random state of the RandomForestClassifier is set, therefore,
273273
# it has a different value than the other examples before
274-
random_state_values.append('33003')
274+
random_state_fixtures.append('33003')
275275

276-
for clf, rsv in zip(clfs, random_state_values):
276+
for clf, rsv in zip(clfs, random_state_fixtures):
277277
run = self._perform_run(task_id, num_test_instances, clf,
278278
random_state_value=rsv)
279279
if isinstance(clf, BaseSearchCV):
@@ -311,7 +311,6 @@ def test_initialize_model_from_run(self):
311311

312312
self.assertEquals(flowS.components['Imputer'].parameters['strategy'], '"median"')
313313
self.assertEquals(flowS.components['VarianceThreshold'].parameters['threshold'], '0.05')
314-
pass
315314

316315
def test_get_run_trace(self):
317316
# get_run_trace is already tested implicitly in test_run_and_publish

0 commit comments

Comments
 (0)