Skip to content

Commit 9d6bea1

Browse files
committed
make unit test stricter
1 parent 8fadddc commit 9d6bea1

5 files changed

Lines changed: 153 additions & 62 deletions

File tree

openml/flows/functions.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,22 @@ def _list_flows(api_call):
127127
return flows
128128

129129

130+
def _check_flow_for_server_id(flow):
131+
"""Check if the given flow and it's components have a flow_id."""
132+
133+
# Depth-first search to check if all components were uploaded to the
134+
# server before parsing the parameters
135+
stack = list()
136+
stack.append(flow)
137+
while len(stack) > 0:
138+
current = stack.pop()
139+
if current.flow_id is None:
140+
raise ValueError("Flow %s has no flow_id!" % current.name)
141+
else:
142+
for component in current.components.values():
143+
stack.append(component)
144+
145+
130146
def assert_flows_equal(flow1, flow2):
131147
"""Check equality of two flows.
132148

openml/runs/run.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -160,36 +160,33 @@ def _create_description_xml(self):
160160
return description_xml
161161

162162
@staticmethod
163-
def _parse_parameters(flow):
163+
def _parse_parameters(flow, model=None):
164164
"""Extracts all parameter settings from the model inside a flow in
165165
OpenML format.
166166
167167
Parameters
168168
----------
169-
flow
169+
flow : OpenMLFlow
170170
openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
171171
172+
model : BaseEstimator, optional
173+
If not given, the parameters are extracted from ``flow.model``.
174+
172175
"""
173176

174-
# Depth-first search to check if all components were uploaded to the
175-
# server before parsing the parameters
176-
stack = list()
177-
stack.append(flow)
178-
while len(stack) > 0:
179-
current = stack.pop()
180-
if current.flow_id is None:
181-
raise ValueError("Flow %s has no flow_id!" % current.name)
182-
else:
183-
for component in current.components.values():
184-
stack.append(component)
177+
if model is None:
178+
model = flow.model
179+
180+
openml.flows.functions._check_flow_for_server_id(flow)
185181

186182
def get_flow_dict(_flow):
187183
flow_map = {_flow.name: _flow.flow_id}
188184
for subflow in _flow.components:
189185
flow_map.update(get_flow_dict(_flow.components[subflow]))
190186
return flow_map
191187

192-
def extract_parameters(_flow, _flow_dict, _main_call=False, main_id=None):
188+
def extract_parameters(_flow, _flow_dict, component_model,
189+
_main_call=False, main_id=None):
193190
# _flow is openml flow object, _param dict maps from flow name to flow id
194191
# for the main call, the param dict can be overridden (useful for unit tests / sentinels)
195192
# this way, for flows without subflows we do not have to rely on _flow_dict
@@ -198,7 +195,8 @@ def extract_parameters(_flow, _flow_dict, _main_call=False, main_id=None):
198195
_current = OrderedDict()
199196
_current['oml:name'] = _param_name
200197

201-
_tmp = openml.flows.sklearn_to_flow(_flow.model.get_params()[_param_name])
198+
_tmp = openml.flows.sklearn_to_flow(
199+
component_model.get_params()[_param_name])
202200

203201
# Try to filter out components which are handled further down!
204202
if isinstance(_tmp, openml.flows.OpenMLFlow):
@@ -222,14 +220,18 @@ def extract_parameters(_flow, _flow_dict, _main_call=False, main_id=None):
222220
_params.append(_current)
223221

224222
for _identifier in _flow.components:
225-
_params.extend(extract_parameters(_flow.components[_identifier], _flow_dict))
223+
subcomponent_model = component_model.get_params()[_identifier]
224+
_params.extend(extract_parameters(_flow.components[_identifier],
225+
_flow_dict, subcomponent_model))
226226
return _params
227227

228228
flow_dict = get_flow_dict(flow)
229-
parameters = extract_parameters(flow, flow_dict, True, flow.flow_id)
229+
parameters = extract_parameters(flow, flow_dict, model,
230+
True, flow.flow_id)
230231

231232
return parameters
232233

234+
233235
################################################################################
234236
# Functions which cannot be in runs/functions due to circular imports
235237

openml/setups/functions.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,25 @@
22

33
import openml
44
import xmltodict
5-
import copy
65

76
from .setup import OpenMLSetup, OpenMLParameter
7+
from openml.flows import sklearn_to_flow, flow_exists
88

9-
def setup_exists(flow):
9+
10+
def setup_exists(flow, model=None):
1011
'''
11-
Checks whether a flow / hyperparameter configuration already exists on the server
12+
Checks whether a hyperparameter configuration already exists on the server.
1213
1314
Parameter
1415
---------
1516
1617
flow : flow
17-
the openml flow object (should be downloaded from server)
18-
sklearn_model : BaseEstimator
19-
The base estimator that was used to create the flow. Will
20-
be used to extract parameter settings from.
18+
The openml flow object.
19+
20+
sklearn_model : BaseEstimator, optional
21+
If given, the parameters are parsed from this model instead of the
22+
model in the flow. If not given, parameters are parsed from
23+
``flow.model``.
2124
2225
Returns
2326
-------
@@ -26,7 +29,18 @@ def setup_exists(flow):
2629
'''
2730

2831
# sadly, this api call relies on a run object
29-
openml_param_settings = openml.runs.OpenMLRun._parse_parameters(flow)
32+
openml.flows.functions._check_flow_for_server_id(flow)
33+
34+
if model is None:
35+
model = flow.model
36+
else:
37+
converted_flow = sklearn_to_flow(model)
38+
exists = flow_exists(converted_flow.name,
39+
converted_flow.external_version)
40+
if exists != flow.flow_id:
41+
raise ValueError('This should not happen!')
42+
43+
openml_param_settings = openml.runs.OpenMLRun._parse_parameters(flow, model)
3044
description = xmltodict.unparse(_to_dict(flow.flow_id,
3145
openml_param_settings),
3246
pretty=True)

tests/test_runs/test_run.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
2+
from sklearn.linear_model import LogisticRegression
3+
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
4+
5+
from openml.testing import TestBase
6+
from openml.flows.sklearn_converter import sklearn_to_flow
7+
from openml import OpenMLRun
8+
9+
10+
class TestRun(TestBase):
11+
12+
def test_parse_parameters_flow_not_on_server(self):
13+
14+
model = LogisticRegression()
15+
flow = sklearn_to_flow(model)
16+
self.assertRaisesRegexp(ValueError,
17+
'Flow sklearn.linear_model.logistic.LogisticRegression '
18+
'has no flow_id!',
19+
OpenMLRun._parse_parameters, flow)
20+
21+
model = AdaBoostClassifier(base_estimator=LogisticRegression())
22+
flow = sklearn_to_flow(model)
23+
flow.flow_id = 1
24+
self.assertRaisesRegexp(ValueError,
25+
'Flow sklearn.linear_model.logistic.LogisticRegression '
26+
'has no flow_id!',
27+
OpenMLRun._parse_parameters, flow)
28+
29+
def test_parse_parameters(self):
30+
31+
model = RandomizedSearchCV(
32+
estimator=RandomForestClassifier(n_estimators=5),
33+
param_distributions={"max_depth": [3, None],
34+
"max_features": [1, 2, 3, 4],
35+
"min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
36+
"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
37+
"bootstrap": [True, False],
38+
"criterion": ["gini", "entropy"]},
39+
cv=StratifiedKFold(n_splits=2, random_state=1),
40+
n_iter=5)
41+
flow = sklearn_to_flow(model)
42+
flow.flow_id = 1
43+
flow.components['estimator'].flow_id = 2
44+
parameters = OpenMLRun._parse_parameters(flow)
45+
for parameter in parameters:
46+
self.assertIsNotNone(parameter['oml:component'], msg=parameter)
47+
if parameter['oml:name'] == 'n_estimators':
48+
self.assertEqual(parameter['oml:value'], '5')
49+
self.assertEqual(parameter['oml:component'], 2)

tests/test_runs/test_run_functions.py

Lines changed: 47 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,6 @@
3232
StratifiedKFold
3333
from sklearn.pipeline import Pipeline
3434

35-
if sys.version_info[0] >= 3:
36-
from unittest import mock
37-
else:
38-
import mock
39-
4035

4136
class TestRun(TestBase):
4237

@@ -219,34 +214,50 @@ def test_run_and_upload(self):
219214
num_folds = 1 # because of holdout
220215
num_iterations = 5 # for base search classifiers
221216

222-
clfs = [LogisticRegression(),
223-
Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
224-
('dummy', DummyClassifier(strategy='prior'))]),
225-
Pipeline(steps=[('Imputer', Imputer(strategy='median')),
226-
('VarianceThreshold', VarianceThreshold()),
227-
('Estimator', RandomizedSearchCV(
228-
DecisionTreeClassifier(),
229-
{'min_samples_split': [2 ** x for x in range(1, 7 + 1)],
230-
'min_samples_leaf': [2 ** x for x in range(0, 6 + 1)]},
231-
cv=3, n_iter=10))]),
232-
GridSearchCV(BaggingClassifier(base_estimator=SVC()),
233-
{"base_estimator__C": [0.01, 0.1, 10],
234-
"base_estimator__gamma": [0.01, 0.1, 10]}),
235-
RandomizedSearchCV(RandomForestClassifier(n_estimators=5),
236-
{"max_depth": [3, None],
237-
"max_features": [1, 2, 3, 4],
238-
"min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
239-
"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
240-
"bootstrap": [True, False],
241-
"criterion": ["gini", "entropy"]},
242-
cv=StratifiedKFold(n_splits=2,
243-
random_state=1),
244-
n_iter=num_iterations)]
245-
217+
clfs = []
218+
random_state_values = []
219+
220+
lr = LogisticRegression()
221+
clfs.append(lr)
222+
random_state_values.append('62501')
223+
224+
pipeline1 = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
225+
('dummy', DummyClassifier(strategy='prior'))])
226+
clfs.append(pipeline1)
227+
random_state_values.append('62501')
228+
229+
pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
230+
('VarianceThreshold', VarianceThreshold()),
231+
('Estimator', RandomizedSearchCV(
232+
DecisionTreeClassifier(),
233+
{'min_samples_split': [2 ** x for x in range(1, 7 + 1)],
234+
'min_samples_leaf': [2 ** x for x in range(0, 6 + 1)]},
235+
cv=3, n_iter=10))])
236+
clfs.append(pipeline2)
237+
random_state_values.append('62501')
238+
239+
gridsearch = GridSearchCV(BaggingClassifier(base_estimator=SVC()),
240+
{"base_estimator__C": [0.01, 0.1, 10],
241+
"base_estimator__gamma": [0.01, 0.1, 10]})
242+
clfs.append(gridsearch)
243+
random_state_values.append('62501')
244+
245+
randomsearch = RandomizedSearchCV(
246+
RandomForestClassifier(n_estimators=5),
247+
{"max_depth": [3, None],
248+
"max_features": [1, 2, 3, 4],
249+
"min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
250+
"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
251+
"bootstrap": [True, False],
252+
"criterion": ["gini", "entropy"]},
253+
cv=StratifiedKFold(n_splits=2, random_state=1),
254+
n_iter=num_iterations)
255+
256+
clfs.append(randomsearch)
246257
# The random states for the RandomizedSearchCV is set after the
247258
# random state of the RandomForestClassifier is set, therefore,
248259
# it has a different value than the other examples before
249-
random_state_values = ['62501'] * (len(clfs) - 1) + ['33003']
260+
random_state_values.append('33003')
250261

251262
for clf, rsv in zip(clfs, random_state_values):
252263
run = self._perform_run(task_id, num_test_instances, clf,
@@ -333,12 +344,11 @@ def test__run_exists(self):
333344
# and can just check their status on line
334345
clfs = [sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='mean')),
335346
('VarianceThreshold', VarianceThreshold(threshold=0.05)),
336-
('Estimator', GaussianNB())]),
347+
('Estimator', DecisionTreeClassifier(max_depth=4))]),
337348
sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='most_frequent')),
338349
('VarianceThreshold', VarianceThreshold(threshold=0.1)),
339350
('Estimator', DecisionTreeClassifier(max_depth=4))])]
340351

341-
342352
task = openml.tasks.get_task(115)
343353

344354
for clf in clfs:
@@ -347,18 +357,18 @@ def test__run_exists(self):
347357
# skip run if it was already performed.
348358
run = openml.runs.run_model_on_task(task, clf, avoid_duplicate_runs=True)
349359
run.publish()
350-
except openml.exceptions.PyOpenMLError:
360+
except openml.exceptions.PyOpenMLError as e:
351361
# run already existed. Great.
352362
pass
353363

354364
flow = openml.flows.sklearn_to_flow(clf)
355365
flow_exists = openml.flows.flow_exists(flow.name, flow.external_version)
356-
self.assertIsInstance(flow_exists, int)
366+
self.assertGreater(flow_exists, 0)
357367
downloaded_flow = openml.flows.get_flow(flow_exists)
358-
setup_exists = openml.setups.setup_exists(downloaded_flow)
359-
self.assertIsInstance(setup_exists, int)
368+
setup_exists = openml.setups.setup_exists(downloaded_flow, clf)
369+
self.assertGreater(setup_exists, 0)
360370
run_ids = _run_exists(task.task_id, setup_exists)
361-
self.assertGreater(len(run_ids), 0)
371+
self.assertTrue(run_ids, msg=(run_ids, clf))
362372

363373
def test__get_seeded_model(self):
364374
# randomized models that are initialized without seeds, can be seeded

0 commit comments

Comments
 (0)