Skip to content

Commit d09fe4f

Browse files
committed
MAINT add docstrings and comments
1 parent bd0175a commit d09fe4f

2 files changed

Lines changed: 233 additions & 82 deletions

File tree

openml/flows/flow.py

Lines changed: 139 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,56 @@
1111
class OpenMLFlow(object):
1212
"""OpenML Flow. Stores machine learning models.
1313
14+
Flows should not be generated manually, but by the function
15+
:meth:`openml.flows.create_flow_from_model`. Using this helper function
16+
ensures that all relevant fields are filled in.
17+
1418
Parameters
1519
----------
16-
model : scikit-learn compatible model
17-
The model the flow consists of. The model needs to have fit and predict methods.
18-
description : string
20+
name : str
21+
Name of the flow. Is used together with the attribute `external_version`
22+
as a unique identifier of the flow.
23+
description : str
1924
Description of the flow (free text).
20-
contributor : string
21-
FIXME
22-
tag : string
23-
FIXME
25+
model : object
26+
ML model which is described by this flow.
27+
components : OrderedDict
28+
Mapping from component identifier to an OpenMLFlow object.
29+
parameters : OrderedDict
30+
Mapping from parameter name to the parameter default value. The
31+
parameter default value must be of type `str`, so that the respective
32+
toolbox plugin can take care of casting the parameter default value to
33+
the correct type.
34+
parameters_meta_info : OrderedDict
35+
Mapping from parameter name to `dict`. Stores additional information for
36+
each parameter. Required keys are `data_type` and `description`.
37+
external_version : str
38+
Version number of the software the flow is implemented in. Is used
39+
together with the attribute `name` as a unique identifier of the flow.
40+
uploader : str
41+
OpenML user ID of the uploader. Filled in by the server.
42+
tags : list
43+
List of tags. Created on the server by other API calls.
44+
binary_url : str
45+
??? - don't use - implemented because it is used by flows on the server
46+
binary_format : str
47+
??? - don't use - implemented because it is used by flows on the server
48+
binary_md5 : str
49+
??? - don't use - implemented because it is used by flows on the server
50+
version : str
51+
OpenML version of the flow.
52+
upload_date : str
53+
Date the flow was uploaded. Filled in by the server.
54+
language : str
55+
Natural language the flow is described in (not the programming
56+
language).
57+
dependencies : str
58+
A list of dependencies necessary to run the flow.
2459
flow_id : int, optional
2560
Flow ID. Assigned by the server (fixme shouldn't be here?)
26-
uploader : string, optional
27-
User uploading the model (fixme shouldn't be here?). Assigned by the server.
28-
29-
3061
"""
62+
# TODO @Jan can you find better descriptions for binary_url, binary_md5,
63+
# binary_format and version?
3164
def __init__(self, name, description=None, model=None, components=None,
3265
parameters=None, parameters_meta_info=None,
3366
external_version=None, uploader=None, tags=None,
@@ -41,29 +74,34 @@ def __init__(self, name, description=None, model=None, components=None,
4174
if components is None:
4275
components = OrderedDict()
4376
elif not isinstance(components, OrderedDict):
44-
raise TypeError('components must be of type OrderedDict, but is %s.' %
45-
type(components))
77+
raise TypeError('components must be of type OrderedDict, '
78+
'but is %s.' % type(components))
4679
self.components = components
80+
4781
if parameters is None:
4882
parameters = OrderedDict()
4983
elif not isinstance(parameters, OrderedDict):
50-
raise TypeError('parameters must be of type OrderedDict, but is %s.' %
51-
type(parameters))
84+
raise TypeError('parameters must be of type OrderedDict, '
85+
'but is %s.' % type(parameters))
86+
5287
if parameters_meta_info is None:
5388
parameters_meta_info = OrderedDict()
5489
elif not isinstance(parameters_meta_info, OrderedDict):
55-
raise TypeError('parameters_meta_info must be of type OrderedDict, but is %s.' %
56-
type(parameters_meta_info))
90+
raise TypeError('parameters_meta_info must be of type OrderedDict, '
91+
'but is %s.' % type(parameters_meta_info))
92+
5793
keys_parameters = set(parameters.keys())
5894
keys_parameters_meta_info = set(parameters_meta_info.keys())
5995
if len(keys_parameters.difference(keys_parameters_meta_info)) > 0:
6096
raise ValueError('Parameter %s only in parameters, but not in'
6197
'parameters_meta_info.' %
62-
str(keys_parameters.difference(keys_parameters_meta_info)))
98+
str(keys_parameters.difference(
99+
keys_parameters_meta_info)))
63100
if len(keys_parameters_meta_info.difference(keys_parameters)) > 0:
64-
raise ValueError('Parameter %s only in parameters_meta_info, but not in'
65-
'parameters.' %
66-
str(keys_parameters_meta_info.difference(keys_parameters)))
101+
raise ValueError('Parameter %s only in parameters_meta_info, '
102+
'but not in parameters.' %
103+
str(keys_parameters_meta_info.difference(
104+
keys_parameters)))
67105

68106
self.parameters = parameters
69107
self.parameters_meta_info = parameters_meta_info
@@ -88,7 +126,7 @@ def _to_xml(self):
88126
89127
Returns
90128
-------
91-
flow_xml : string
129+
str
92130
Flow represented as XML string.
93131
"""
94132
flow_dict = self.__to_dict()
@@ -99,6 +137,17 @@ def _to_xml(self):
99137
return flow_xml
100138

101139
def __to_dict(self):
140+
""" Helper function used by _to_xml and __to_dict.
141+
142+
Creates a dictionary representation of self which can be serialized
143+
to xml by the function _to_xml.
144+
145+
Returns
146+
-------
147+
OrderedDict
148+
Flow represented as OrderedDict.
149+
150+
"""
102151
flow_dict = OrderedDict()
103152
flow_dict['oml:flow'] = OrderedDict()
104153
flow_dict['oml:flow']['@xmlns:oml'] = 'http://openml.org/openml'
@@ -122,17 +171,21 @@ def __to_dict(self):
122171
for key in self.parameters:
123172
param_dict = OrderedDict()
124173
param_dict['oml:name'] = key
174+
125175
if self.parameters_meta_info[key]['data_type'] is not None:
126-
param_dict['oml:data_type'] = self.parameters_meta_info[key].get('data_type')
176+
param_dict['oml:data_type'] = self.parameters_meta_info[key].\
177+
get('data_type')
178+
127179
param_dict['oml:default_value'] = self.parameters[key]
128180
if self.parameters_meta_info[key]['description'] is not None:
129-
param_dict['oml:description'] = self.parameters_meta_info[key].get('description')
181+
param_dict['oml:description'] = self.parameters_meta_info[key].\
182+
get('description')
130183

131-
for key, value in param_dict.items():
132-
if key is not None and not isinstance(key, six.string_types):
184+
for key_, value in param_dict.items():
185+
if key_ is not None and not isinstance(key_, six.string_types):
133186
raise ValueError('Parameter name %s cannot be serialized '
134187
'because it is of type %s. Only strings '
135-
'can be serialized.' % (key, type(key)))
188+
'can be serialized.' % (key_, type(key_)))
136189
if value is not None and not isinstance(value, six.string_types):
137190
raise ValueError('Parameter value %s cannot be serialized '
138191
'because it is of type %s. Only strings '
@@ -146,15 +199,16 @@ def __to_dict(self):
146199
for key in self.components:
147200
component_dict = OrderedDict()
148201
component_dict['oml:identifier'] = key
149-
component_dict['oml:flow'] = self.components[key].__to_dict()['oml:flow']
202+
component_dict['oml:flow'] = \
203+
self.components[key].__to_dict()['oml:flow']
150204

151-
for key in component_dict:
152-
# We can only check the key here, because the value is a flow.
153-
# The flow itself has to be valid by recursion
154-
if key is not None and not isinstance(key, six.string_types):
205+
for key_ in component_dict:
206+
# We only need to check if the key is a string, because the
207+
# value is a flow. The flow itself is valid by recursion
208+
if key_ is not None and not isinstance(key_, six.string_types):
155209
raise ValueError('Parameter name %s cannot be serialized '
156210
'because it is of type %s. Only strings '
157-
'can be serialized.' % (key, type(key)))
211+
'can be serialized.' % (key_, type(key_)))
158212

159213
components.append(component_dict)
160214

@@ -173,6 +227,18 @@ def __to_dict(self):
173227

174228
@classmethod
175229
def _from_xml(cls, xml_dict):
230+
"""Create a flow from an xml description.
231+
232+
Parameters
233+
----------
234+
xml_dict : dict
235+
Dictionary representation of the flow as created by __to_dict()
236+
237+
Returns
238+
-------
239+
OpenMLFlow
240+
241+
"""
176242
dic = xml_dict["oml:flow"]
177243
flow_id = int(dic['oml:id']) if 'oml:id' in dic else None
178244
uploader = dic.get('oml:uploader')
@@ -237,7 +303,11 @@ def _from_xml(cls, xml_dict):
237303
flow_id=flow_id)
238304

239305
def __eq__(self, other):
240-
"""Override the default Equals behavior"""
306+
"""Check equality.
307+
308+
Two flows are equal if all their keys which are not set by the server
309+
are equal, as well as all their parameters and components.
310+
"""
241311
if isinstance(other, self.__class__):
242312
this_dict = self.__dict__.copy()
243313
this_parameters = this_dict['parameters']
@@ -253,7 +323,8 @@ def __eq__(self, other):
253323
del other_dict['components']
254324
del other_dict['model']
255325

256-
# Name is actually not generated by the server, but it will be tested further down with a getter (allows mocking)
326+
# Name is actually not generated by the server, but it will be
327+
# tested further down with a getter (allows mocking in the tests)
257328
generated_by_the_server = ['name', 'flow_id', 'uploader', 'version',
258329
'upload_date', 'source_url',
259330
'binary_url', 'source_format',
@@ -267,14 +338,18 @@ def __eq__(self, other):
267338
equal = this_dict == other_dict
268339
equal_name = self._get_name() == other._get_name()
269340

270-
parameters_equal = this_parameters.keys() == other_parameters.keys() and \
271-
all([this_parameter == other_parameter
272-
for this_parameter, other_parameter in
273-
zip(this_parameters.values(), other_parameters.values())])
274-
components_equal = this_components.keys() == other_components.keys() and \
275-
all([this_component == other_component
276-
for this_component, other_component in
277-
zip(this_components.values(), other_components.values())])
341+
parameters_equal = \
342+
this_parameters.keys() == other_parameters.keys() and \
343+
all([this_parameter == other_parameter
344+
for this_parameter, other_parameter in
345+
zip(this_parameters.values(),
346+
other_parameters.values())])
347+
components_equal = \
348+
this_components.keys() == other_components.keys() and \
349+
all([this_component == other_component
350+
for this_component, other_component in
351+
zip(this_components.values(),
352+
other_components.values())])
278353

279354
return parameters_equal and components_equal and equal and equal_name
280355
return NotImplemented
@@ -333,6 +408,27 @@ def _get_name(self):
333408

334409

335410
def create_flow_from_model(model, converter, description=None):
411+
"""Use a converter to create an OpenMLFlow from model.
412+
413+
Allows configuring how a model (for example a scikit-learn estimator) is
414+
transformed into an OpenMLFlow.
415+
416+
Parameters
417+
----------
418+
model : object
419+
ML model. Must match the converter.
420+
converter : object
421+
Class that implements a method `flow = serialize_object(model)`.
422+
Abstract interface to come soon.
423+
description : str, optional
424+
Provide a description of the flow, overwriting the default description
425+
generated by the converter.
426+
427+
Returns
428+
-------
429+
OpenMLFlow
430+
431+
"""
336432
flow = converter.serialize_object(model)
337433
if not isinstance(flow, OpenMLFlow):
338434
raise ValueError('Converter %s did return %s, not OpenMLFlow!' %

0 commit comments

Comments
 (0)