Skip to content

Commit 89a3dd4

Browse files
authored
Merge pull request #241 from openml/fix169
Fix169
2 parents 65e8758 + 85472cc commit 89a3dd4

9 files changed

Lines changed: 302 additions & 57 deletions

File tree

openml/runs/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .run import OpenMLRun
2-
from .functions import (run_task, get_run, list_runs, get_runs)
2+
from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run)
33

44
# Public API of openml.runs; every name imported above is re-exported here.
__all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'get_runs',
           'initialize_model_from_run']

openml/runs/functions.py

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66
import warnings
77
import sklearn
88
import time
9-
from sklearn.model_selection._search import BaseSearchCV
109

1110
from ..exceptions import PyOpenMLError
1211
from .. import config
12+
1313
from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
14-
from ..setups import setup_exists
14+
from ..setups import setup_exists, initialize_model
15+
1516
from ..exceptions import OpenMLCacheException, OpenMLServerException
1617
from ..util import URLError, version_complies
17-
from ..tasks.functions import _create_task_from_xml
1818
from .._api_calls import _perform_api_call
1919
from .run import OpenMLRun, _get_version_information
2020

@@ -24,7 +24,7 @@
2424

2525

2626

27-
def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
27+
def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None):
2828
"""Performs a CV run on the dataset of the given task, using the split.
2929
3030
Parameters
@@ -35,8 +35,13 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
3535
a model which has a function fit(X,Y) and predict(X),
3636
all supervised estimators of scikit learn follow this definition of a model [1]
3737
[1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
38+
avoid_duplicate_runs : bool
39+
if this flag is set to True, the run will throw an error if the
40+
setup/task combination is already present on the server.
3841
flow_tags : list(str)
3942
a list of tags that the flow should have at creation
43+
seed: int
44+
the models that are not seeded will get this seed
4045
4146
Returns
4247
-------
@@ -48,6 +53,7 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
4853
# TODO move this into its own module. While it somehow belongs here, it
4954
# adds quite a lot of functionality which is better suited in other places!
5055
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
56+
model = _get_seeded_model(model, seed)
5157
flow = sklearn_to_flow(model)
5258

5359
# returns flow id if the flow exists on the server, False otherwise
@@ -88,6 +94,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
8894

8995
return run
9096

97+
def initialize_model_from_run(run_id):
    '''
    Initialize a model based on a run_id (i.e., using the exact same
    parameter settings as the setup that the run refers to).

    Parameters
    ----------
    run_id : int
        The OpenML run_id

    Returns
    -------
    model : sklearn model
        the scikit-learn model with all parameters initialized
    '''
    # the run only stores a reference to its setup; the setup holds the
    # actual parameter values
    setup_id = get_run(run_id).setup_id
    return initialize_model(setup_id)
114+
91115
def _run_exists(task_id, setup_id):
92116
'''
93117
Checks whether a task/setup combination is already present on the server.
@@ -111,6 +135,49 @@ def _run_exists(task_id, setup_id):
111135
assert(exception.code == 512)
112136
return False
113137

138+
def _get_seeded_model(model, seed=None):
    '''Sets all the non-seeded components of a model with a seed.

    Every parameter whose name contains ``random_state`` is inspected.
    Parameters that are ``None`` receive a seed drawn from a RandomState
    initialized with ``seed``; parameters that already hold an integer
    (Python ``int`` or NumPy integer) keep their value. Any other value,
    in particular a ``RandomState`` instance, is rejected.

    The model is modified in place (through ``set_params``) and the same
    object is returned. Nothing is modified when an exception is raised,
    because all parameters are validated before ``set_params`` is called.

    Parameters
    ----------
    model : sklearn model
        The model to be seeded; must offer ``get_params()`` and
        ``set_params(**params)``.
    seed : int
        The seed to initialize the RandomState with. Unseeded subcomponents
        will be seeded with a random number from the RandomState.

    Returns
    -------
    model : sklearn model
        the given model, where all (sub)components have a seed

    Raises
    ------
    ValueError
        If a random_state parameter holds a RandomState object or any
        other value that is neither None nor an integer.
    '''
    rs = np.random.RandomState(seed)
    model_params = model.get_params()
    random_states = {}
    # sorted iteration makes the order of the draws independent of the
    # order in which get_params() happens to list the parameters
    for param_name in sorted(model_params):
        if 'random_state' not in param_name:
            continue
        current_value = model_params[param_name]
        # important to draw the value at this point (and not in the if
        # statement): this way we guarantee that if a different set of
        # subflows is seeded, the same number of the random generator is used
        new_value = rs.randint(0, 2**16)
        if current_value is None:
            random_states[param_name] = new_value
        elif isinstance(current_value, (int, np.integer)):
            # already seeded with an integer (NumPy integers included):
            # keep the existing seed
            pass
        elif isinstance(current_value, np.random.RandomState):
            raise ValueError('Models initialized with a RandomState object are not supported. Please seed with an integer. ')
        else:
            raise ValueError('Models should be seeded with int or None (this should never happen). ')
    model.set_params(**random_states)
    return model
180+
114181

115182

116183
def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,

openml/runs/run.py

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def _create_description_xml(self):
165165
return description_xml
166166

167167
@staticmethod
168-
def _parse_parameters(model, flow):
168+
def _parse_parameters(model, server_flow):
169169
"""Extracts all parameter settings from a model in OpenML format.
170170
171171
Parameters
@@ -176,50 +176,38 @@ def _parse_parameters(model, flow):
176176
openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
177177
178178
"""
179-
if flow.flow_id is None:
179+
if server_flow.flow_id is None:
180180
raise ValueError("The flow parameter needs to be downloaded from server")
181181

182-
python_param_settings = model.get_params()
183-
openml_param_settings = []
184-
185182
def get_flow_dict(_flow):
186183
flow_map = {_flow.name: _flow.flow_id}
187184
for subflow in _flow.components:
188185
flow_map.update(get_flow_dict(_flow.components[subflow]))
189186
return flow_map
190187

191-
flow_dict = get_flow_dict(flow)
192-
193-
for param in python_param_settings:
194-
if "__" in param:
195-
# parameter of subflow. will be handled later
196-
continue
197-
if isinstance(python_param_settings[param], BaseEstimator):
198-
# extract parameters of the subflow individually
199-
subflow = flow.components[param]
200-
openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow)
201-
202-
# add parameter setting (in some cases also the subflow. Just because we can)
203-
if param in flow.parameters.keys():
204-
param_dict = OrderedDict()
205-
param_dict['oml:name'] = param
206-
param_dict['oml:value'] = str(python_param_settings[param])
207-
param_dict['oml:component'] = flow_dict[flow.name]
208-
openml_param_settings.append(param_dict)
209-
else:
210-
if flow.name.startswith("sklearn.pipeline.Pipeline"):
211-
# tolerate
212-
pass
213-
elif flow.name.startswith("sklearn.pipeline.FeatureUnion"):
214-
# tolerate
215-
pass
216-
elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"):
217-
# tolerate
218-
pass
188+
def extract_parameters(_flow, _param_dict, _main_call=False, main_id=None):
189+
# _flow is openml flow object, _param dict maps from flow name to flow id
190+
# for the main call, the param dict can be overridden (useful for unit tests / sentinels)
191+
# this way, for flows without subflows we do not have to rely on _param_dict
192+
_params = []
193+
for _param_name in _flow.parameters:
194+
_current = OrderedDict()
195+
_current['oml:name'] = _param_name
196+
_current['oml:value'] = _flow.parameters[_param_name]
197+
if _main_call:
198+
_current['oml:component'] = main_id
219199
else:
220-
raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name))
200+
_current['oml:component'] = _param_dict[_flow.name]
201+
_params.append(_current)
202+
for _identifier in _flow.components:
203+
_params.extend(extract_parameters(_flow.components[_identifier], _param_dict))
204+
return _params
205+
206+
flow_dict = get_flow_dict(server_flow)
207+
local_flow = openml.flows.sklearn_to_flow(model)
221208

222-
return openml_param_settings
209+
parameters = extract_parameters(local_flow, flow_dict, True, server_flow.flow_id)
210+
return parameters
223211

224212
################################################################################
225213
# Functions which cannot be in runs/functions due to circular imports

openml/setups/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .functions import setup_exists
1+
from .functions import get_setup, setup_exists, initialize_model
22

3-
__all__ = ['setup_exists']
3+
# Names re-exported as the public API of openml.setups.
__all__ = [
    'get_setup',
    'setup_exists',
    'initialize_model',
]

openml/setups/functions.py

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import openml
22
import xmltodict
3+
import copy
34

45
from collections import OrderedDict
6+
from .setup import OpenMLSetup, OpenMLParameter
57

68
def setup_exists(downloaded_flow, sklearn_model):
79
'''
@@ -34,14 +36,118 @@ def setup_exists(downloaded_flow, sklearn_model):
3436
if setup_id > 0:
3537
return setup_id
3638
else:
37-
return False;
39+
return False
40+
41+
42+
def get_setup(setup_id):
    '''
    Downloads the setup (configuration) description from OpenML
    and returns it as a structured object

    Parameters
    ----------
    setup_id : int
        The OpenML setup_id

    Returns
    -------
    OpenMLSetup
        an initialized openml setup object
    '''
    endpoint = '/setup/%d' % setup_id
    setup_xml = openml._api_calls._perform_api_call(endpoint)
    # the server answers with XML; parse it into nested dicts first
    return _create_setup_from_xml(xmltodict.parse(setup_xml))
60+
61+
62+
def initialize_model(setup_id):
    '''
    Initialize a model based on a setup_id (i.e., using the exact
    same parameter settings)

    The setup and its flow are fetched with ``get_setup`` and
    ``openml.flows.get_flow``; the flow's parameter values are then
    overwritten with the values stored in the setup and the result is
    converted with ``openml.flows.flow_to_sklearn``.

    Parameters
    ----------
    setup_id : int
        The OpenML setup_id

    Returns
    -------
    model : sklearn model
        the scikit-learn model with all parameters initialized

    Raises
    ------
    KeyError
        If the flow (or one of its subflows) has a parameter for which
        the setup holds no value.
    '''
    def _to_dict_of_dicts(_params):
        # this subfunction transforms the parameters of an openml setup
        # object into a dict of dicts, structured: flow_id maps to dict
        # of parameter_names mapping to parameter_value
        _res = {}
        if _params is None:
            # a setup without any parameters carries None instead of a
            # dict; treat it as "no parameter values"
            return _res
        for _param in _params.values():
            _res.setdefault(_param.flow_id, {})[_param.parameter_name] = _param.value
        return _res

    def _reconstruct_flow(_flow, _params):
        # sets the values of flow parameters (and subflows) to
        # the specific values from a setup. _params is a dict of
        # dicts, mapping from flow id to param name to param value
        # (obtained by using the subfunction _to_dict_of_dicts).
        # The flow object is modified in place and returned.
        for _param in _flow.parameters:
            _flow.parameters[_param] = _params[_flow.flow_id][_param]
        for _identifier in _flow.components:
            _flow.components[_identifier] = _reconstruct_flow(_flow.components[_identifier], _params)
        return _flow

    setup = get_setup(setup_id)
    parameters = _to_dict_of_dicts(setup.parameters)
    flow = openml.flows.get_flow(setup.flow_id)

    # now we 'abuse' the parameter object by passing in the
    # parameters obtained from the setup
    flow = _reconstruct_flow(flow, parameters)

    return openml.flows.flow_to_sklearn(flow)
38111

39112

40113
def _to_dict(flow_id, openml_parameter_settings):
    # Wraps a flow id and its parameter settings in a nested ordered
    # dict. For convenience, this function (ab)uses the run object,
    # i.e. the content is placed under an 'oml:run' root.
    run_description = OrderedDict([
        ('@xmlns:oml', 'http://openml.org/openml'),
        ('oml:flow_id', flow_id),
        ('oml:parameter_setting', openml_parameter_settings),
    ])
    return OrderedDict([('oml:run', run_description)])
122+
123+
def _create_setup_from_xml(result_dict):
    '''
    Turns an API xml result (already parsed into nested dicts) into an
    OpenMLSetup object

    The parameters are passed to OpenMLSetup as a dict mapping the integer
    'oml:id' of each parameter to its OpenMLParameter, or as None when the
    result holds no 'oml:parameter' entry.
    '''
    setup_xml = result_dict['oml:setup_parameters']
    flow_id = int(setup_xml['oml:flow_id'])

    if 'oml:parameter' not in setup_xml:
        return OpenMLSetup(flow_id, None)

    # basically all others
    xml_parameters = setup_xml['oml:parameter']
    if isinstance(xml_parameters, dict):
        # a single parameter arrives as a bare dict rather than a list
        xml_parameters = [xml_parameters]
    elif not isinstance(xml_parameters, list):
        raise ValueError('Expected None, list or dict, received someting else: %s' %str(type(xml_parameters)))

    parameters = {}
    for xml_parameter in xml_parameters:
        param_id = int(xml_parameter['oml:id'])
        parameters[param_id] = _create_setup_parameter_from_xml(xml_parameter)

    return OpenMLSetup(flow_id, parameters)
145+
146+
def _create_setup_parameter_from_xml(result_dict):
    # Builds an OpenMLParameter from one parsed 'oml:parameter' entry.
    # The two ids are converted to int, all other fields are passed on
    # exactly as they appear in the dict.
    param_id = int(result_dict['oml:id'])
    flow_id = int(result_dict['oml:flow_id'])
    return OpenMLParameter(
        param_id,
        flow_id,
        result_dict['oml:full_name'],
        result_dict['oml:parameter_name'],
        result_dict['oml:data_type'],
        result_dict['oml:default_value'],
        result_dict['oml:value'],
    )

0 commit comments

Comments
 (0)