Skip to content

Commit 7e6a545

Browse files
committed
FIX/ENH naming problem, add more asserts to unit tests
1 parent 6bec1fa commit 7e6a545

4 files changed

Lines changed: 85 additions & 28 deletions

File tree

openml/flows/functions.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from ..util import URLError
66

77

8-
def get_flow(flow_id):
8+
def get_flow(flow_id, converter=None):
99
"""Download the OpenML flow for a given flow ID.
1010
1111
Parameters
@@ -28,4 +28,9 @@ def get_flow(flow_id):
2828

2929
flow_dict = xmltodict.parse(flow_xml)
3030
flow = OpenMLFlow._from_xml(flow_dict)
31+
32+
if converter is not None:
33+
model = converter.deserialize_object(flow)
34+
flow.model = model
35+
3136
return flow

openml/flows/sklearn_converter.py

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,20 @@ def _serialize_model(self, model):
189189
for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
190190
rval = self.serialize_object(v)
191191

192+
193+
# In case of pipelines, or when having a component (for example
194+
# in the AdaBoostClassifier or the RandomizedSearchCV), the
195+
# parameters of that component are also returned by get_params()
196+
# This check makes sure that we only add the parameters for the
197+
# current component, and not the ones of the child component.
198+
# Parameters of child components will be added to the correct flow
199+
# when deserializing the subflow
200+
model_parameters = signature(model.__init__)
201+
if k not in model_parameters.parameters:
202+
continue
203+
192204
if isinstance(rval, (list, tuple)):
205+
193206
# Steps in a pipeline or feature union
194207
parameter_value = list()
195208
for identifier, sub_component in rval:
@@ -221,18 +234,6 @@ def _serialize_model(self, model):
221234
parameters[k] = parameter_value
222235

223236
elif isinstance(rval, OpenMLFlow):
224-
# Since serialize_object can return a Flow, we need to check
225-
# whether that flow represents a hyperparameter value of the
226-
# current flow or of a subcomponent. We only add it to the
227-
# parameters in the first case. In the second case, it will
228-
# be added to the correct flow when deserializing the subflow
229-
# (which happens either in the body of the if statement above
230-
# or in this body when the component is the value of a
231-
# hyperparameter, as it could be for example in the
232-
# AdaBoostClassifier.
233-
model_parameters = signature(model.__init__)
234-
if k not in model_parameters.parameters:
235-
continue
236237

237238
# A subcomponent, for example the base model in
238239
# AdaBoostClassifier
@@ -244,14 +245,6 @@ def _serialize_model(self, model):
244245
parameters[k] = json.dumps(component_reference)
245246

246247
else:
247-
# In case of pipelines, or when having a component (for example
248-
# in the AdaBoostClassifier or the RandomizedSearchCV), the
249-
# parameters of that component are also returned by get_params()
250-
# This check makes sure that we only add the parameters for the
251-
# current component, and not the ones of the child component.
252-
model_parameters = signature(model.__init__)
253-
if k not in model_parameters.parameters:
254-
continue
255248

256249
# a regular hyperparameter
257250
if not (hasattr(rval, '__len__') and len(rval) == 0):
@@ -289,7 +282,7 @@ def _serialize_model(self, model):
289282

290283
def _deserialize_model(self, flow, **kwargs):
291284

292-
model_name = flow.name
285+
model_name = flow._get_name()
293286
# Remove everything after the first bracket, it is not necessary for
294287
# creating the current flow
295288
pos = model_name.find('(')
@@ -434,6 +427,7 @@ def serialize_cross_validator(self, o):
434427
warnings.filters.pop(0)
435428

436429
if not (hasattr(value, '__len__') and len(value) == 0):
430+
value = json.dumps(value)
437431
parameters[key] = value
438432
else:
439433
parameters[key] = None

tests/flows/test_flow.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import xmltodict
88

99
import scipy.stats
10+
import sklearn.datasets
1011
import sklearn.dummy
1112
import sklearn.ensemble
1213
import sklearn.model_selection
@@ -123,17 +124,22 @@ def test_publish_flow(self, name_mock):
123124

124125
@mock.patch.object(openml.OpenMLFlow, '_get_name', autospec=True)
125126
def test_sklearn_to_upload_to_flow(self, name_mock):
127+
iris = sklearn.datasets.load_iris()
128+
X = iris.data
129+
y = iris.target
130+
126131
# Create a unique prefix for the flow. Necessary because the flow is
127132
# identified by its name and external version online. Having a unique
128133
# name allows us to publish the same flow in each test run
129134
md5 = hashlib.md5()
130135
md5.update(str(time.time()).encode('utf-8'))
131136
sentinel = md5.hexdigest()[:10]
137+
sentinel = 'TEST%s' % sentinel
132138
def side_effect(self):
133139
if sentinel in self.name:
134140
return self.name
135141
else:
136-
return 'TEST%s%s' % (sentinel, self.name)
142+
return '%s%s' % (sentinel, self.name)
137143
name_mock.side_effect = side_effect
138144

139145
# Test a more complicated flow
@@ -142,18 +148,32 @@ def side_effect(self):
142148
base_estimator=sklearn.tree.DecisionTreeClassifier())
143149
model = sklearn.pipeline.Pipeline(steps=(
144150
('scaler', scaler), ('boosting', boosting)))
145-
parameter_grid = {'n_estimators': [1, 5, 10, 100],
146-
'learning_rate': scipy.stats.uniform(0.01, 0.99),
147-
'base_estimator__max_depth': scipy.stats.randint(1, 10)}
151+
parameter_grid = {'boosting__n_estimators': [1, 5, 10, 100],
152+
'boosting__learning_rate': scipy.stats.uniform(0.01, 0.99),
153+
'boosting__base_estimator__max_depth': scipy.stats.randint(1, 10)}
154+
cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
148155
rs = sklearn.model_selection.RandomizedSearchCV(
149-
estimator=model, param_distributions=parameter_grid)
156+
estimator=model, param_distributions=parameter_grid, cv=cv)
157+
rs.fit(X, y)
150158
flow = openml.flows.create_flow_from_model(rs, SklearnToFlowConverter())
151159

152160
flow.publish()
153161
self.assertIsInstance(flow.flow_id, int)
154162

155163
# Check whether we can load the flow again
156-
new_flow = openml.flows.get_flow(flow_id=flow.flow_id)
164+
# Remove the sentinel from the name again so that we can reinstantiate
165+
# the object again
166+
def side_effect(self):
167+
if sentinel in self.name:
168+
name = self.name.replace(sentinel, '')
169+
return name
170+
else:
171+
return self.name
172+
name_mock.side_effect = side_effect
173+
174+
name_mock.side_effect = side_effect
175+
new_flow = openml.flows.get_flow(flow_id=flow.flow_id,
176+
converter=SklearnToFlowConverter())
157177

158178
local_xml = flow._to_xml()
159179
server_xml = new_flow._to_xml()
@@ -175,4 +195,14 @@ def side_effect(self):
175195

176196
self.assertEqual(new_flow, flow)
177197
self.assertIsNot(new_flow, flow)
198+
new_flow.model.fit(X, y)
199+
200+
fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
201+
'sklearn.model_selection._split.StratifiedKFold,' \
202+
'sklearn.pipeline.Pipeline(' \
203+
'sklearn.preprocessing.data.StandardScaler,' \
204+
'sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
205+
'sklearn.tree.tree.DecisionTreeClassifier)))'
206+
207+
self.assertEqual(new_flow._get_name(), fixture_name)
178208

tests/flows/test_sklearn.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@ def test_serialize_feature_union(self):
163163
fu = sklearn.pipeline.FeatureUnion(transformer_list=(('ohe', ohe),
164164
('scaler', scaler)))
165165
serialization = self.converter.serialize_object(fu)
166+
self.assertEqual(serialization.name,
167+
'sklearn.pipeline.FeatureUnion('
168+
'sklearn.preprocessing.data.OneHotEncoder,'
169+
'sklearn.preprocessing.data.StandardScaler)')
166170
new_model = self.converter.deserialize_object(serialization)
167171

168172
self.assertEqual(type(new_model), type(fu))
@@ -193,6 +197,30 @@ def test_serialize_feature_union(self):
193197
self.assertEqual(new_model_params, fu_params)
194198
new_model.fit(self.X, self.y)
195199

200+
def test_serialize_complex_flow(self):
201+
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
202+
203+
boosting = sklearn.ensemble.AdaBoostClassifier(
204+
base_estimator=sklearn.tree.DecisionTreeClassifier())
205+
model = sklearn.pipeline.Pipeline(steps=(
206+
('scaler', scaler), ('boosting', boosting)))
207+
parameter_grid = {'n_estimators': [1, 5, 10, 100],
208+
'learning_rate': scipy.stats.uniform(0.01, 0.99),
209+
'base_estimator__max_depth': scipy.stats.randint(1,
210+
10)}
211+
cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
212+
rs = sklearn.model_selection.RandomizedSearchCV(
213+
estimator=model, param_distributions=parameter_grid, cv=cv)
214+
serialized = self.converter.serialize_object(rs)
215+
216+
fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
217+
'sklearn.model_selection._split.StratifiedKFold,' \
218+
'sklearn.pipeline.Pipeline(' \
219+
'sklearn.preprocessing.data.StandardScaler,' \
220+
'sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
221+
'sklearn.tree.tree.DecisionTreeClassifier)))'
222+
self.assertEqual(serialized.name, fixture_name)
223+
196224
def test_serialize_type(self):
197225
supported_types = [float, np.float, np.float32, np.float64,
198226
int, np.int, np.int32, np.int64]

0 commit comments

Comments (0)