Skip to content

Commit 4b15e84

Browse files
authored
Merge pull request #223 from openml/issue210
Issue210
2 parents f4df535 + c23b113 commit 4b15e84

3 files changed

Lines changed: 71 additions & 52 deletions

File tree

openml/runs/functions.py

Lines changed: 58 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
import xmltodict
55
import numpy as np
66
import warnings
7-
import openml
8-
from sklearn.model_selection._search import BaseSearchCV
7+
import sklearn
98

109
from ..exceptions import PyOpenMLError
1110
from .. import config
@@ -59,7 +58,6 @@ def run_task(task, model, avoid_duplicate_runs=True):
5958
raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))
6059

6160
dataset = task.get_dataset()
62-
X, Y = dataset.get_data(target=task.target_name)
6361

6462
class_labels = task.class_labels
6563
if class_labels is None:
@@ -68,19 +66,19 @@ def run_task(task, model, avoid_duplicate_runs=True):
6866

6967
# execute the run
7068
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
71-
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
69+
run.data_content, run.trace_content, run.trace_attributes = _run_task_get_arffcontent(model, task, class_labels)
70+
7271

7372
if flow_id == False:
74-
# means the flow did not exists.
75-
# As we could run it, publish it now
73+
# means the flow did not exist. As we could run it, publish it now
7674
flow = flow.publish()
7775
else:
7876
# flow already existed, download it from server
7977
# TODO (necessary? is this a postcondition of this function)
8078
flow = get_flow(flow_id)
8179

8280
run.flow_id = flow.flow_id
83-
config.logger.info('Executed Task %d with Flow id: %d' %(task.task_id, run.flow_id))
81+
config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
8482

8583
return run
8684

@@ -160,22 +158,27 @@ def _run_task_get_arffcontent(model, task, class_labels):
160158
for rep in task.iterate_repeats():
161159
fold_no = 0
162160
for fold in rep:
161+
model_fold = sklearn.base.clone(model, safe=True)
163162
train_indices, test_indices = fold
164163
trainX = X[train_indices]
165164
trainY = Y[train_indices]
166165
testX = X[test_indices]
167166
testY = Y[test_indices]
168167

169-
model.fit(trainX, trainY)
168+
try:
169+
model_fold.fit(trainX, trainY)
170170

171-
if isinstance(model, BaseSearchCV):
172-
_add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no)
173-
model_classes = model.best_estimator_.classes_
174-
else:
175-
model_classes = model.classes_
171+
if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
172+
arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
173+
model_classes = model_fold.best_estimator_.classes_
174+
else:
175+
model_classes = model_fold.classes_
176+
except AttributeError as e:
177+
# typically happens when training a regressor on classification task
178+
raise PyOpenMLError(str(e))
176179

177-
ProbaY = model.predict_proba(testX)
178-
PredY = model.predict(testX)
180+
ProbaY = model_fold.predict_proba(testX)
181+
PredY = model_fold.predict(testX)
179182
if ProbaY.shape[1] != len(class_labels):
180183
warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
181184

@@ -186,13 +189,18 @@ def _run_task_get_arffcontent(model, task, class_labels):
186189
fold_no = fold_no + 1
187190
rep_no = rep_no + 1
188191

189-
if not isinstance(model, BaseSearchCV):
192+
if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
193+
# arff_tracecontent is already set
194+
arff_trace_attributes = _extract_arfftrace_attributes(model_fold)
195+
else:
190196
arff_tracecontent = None
197+
arff_trace_attributes = None
191198

192-
return arff_datacontent, arff_tracecontent
199+
return arff_datacontent, arff_tracecontent, arff_trace_attributes
193200

194201

195-
def _add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no):
202+
def _extract_arfftrace(model, rep_no, fold_no):
203+
arff_tracecontent = []
196204
for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
197205
# we use the string values for True and False, as it is defined in this way by the OpenML server
198206
selected = 'false'
@@ -204,6 +212,30 @@ def _add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no):
204212
if key.startswith("param_"):
205213
arff_line.append(str(model.cv_results_[key][itt_no]))
206214
arff_tracecontent.append(arff_line)
215+
return arff_tracecontent
216+
217+
def _extract_arfftrace_attributes(model):
218+
# attributes that will be in trace arff, regardless of the model
219+
trace_attributes = [('repeat', 'NUMERIC'),
220+
('fold', 'NUMERIC'),
221+
('iteration', 'NUMERIC'),
222+
('evaluation', 'NUMERIC'),
223+
('selected', ['true', 'false'])]
224+
225+
# model dependent attributes for trace arff
226+
for key in model.cv_results_:
227+
if key.startswith("param_"):
228+
if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
229+
type = ['True', 'False']
230+
elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
231+
type = 'NUMERIC'
232+
else:
233+
values = list(set(model.cv_results_[key])) # unique values
234+
type = [str(i) for i in values]
235+
236+
attribute = ("parameter_" + key[6:], type)
237+
trace_attributes.append(attribute)
238+
return trace_attributes
207239

208240

209241
def get_runs(run_ids):
@@ -306,9 +338,16 @@ def _create_run_from_xml(xml):
306338
dataset_id = int(run['oml:input_data']['oml:dataset']['oml:did'])
307339

308340
predictions_url = None
309-
for file_dict in run['oml:output_data']['oml:file']:
341+
if isinstance(run['oml:output_data']['oml:file'], dict):
342+
# only one result.. probably due to an upload error
343+
file_dict = run['oml:output_data']['oml:file']
310344
if file_dict['oml:name'] == 'predictions':
311345
predictions_url = file_dict['oml:url']
346+
else:
347+
# multiple files, the normal case
348+
for file_dict in run['oml:output_data']['oml:file']:
349+
if file_dict['oml:name'] == 'predictions':
350+
predictions_url = file_dict['oml:url']
312351
if predictions_url is None:
313352
raise ValueError('No URL to download predictions for run %d in run '
314353
'description XML' % run_id)

openml/runs/run.py

Lines changed: 9 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import arff
66
import xmltodict
77
from sklearn.base import BaseEstimator
8-
from sklearn.model_selection._search import BaseSearchCV
98

109
import openml
1110
from ..tasks import get_task
@@ -23,8 +22,8 @@ class OpenMLRun(object):
2322
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
2423
files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
2524
evaluations=None, detailed_evaluations=None,
26-
data_content=None, trace_content=None, model=None, task_type=None,
27-
task_evaluation_measure=None, flow_name=None,
25+
data_content=None, trace_attributes=None, trace_content=None,
26+
model=None, task_type=None, task_evaluation_measure=None, flow_name=None,
2827
parameter_settings=None, predictions_url=None, task=None,
2928
flow=None, run_id=None):
3029
self.uploader = uploader
@@ -42,6 +41,7 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
4241
self.evaluations = evaluations
4342
self.detailed_evaluations = detailed_evaluations
4443
self.data_content = data_content
44+
self.trace_attributes = trace_attributes
4545
self.trace_content = trace_content
4646
self.error_message = None
4747
self.task = task
@@ -80,7 +80,7 @@ def _generate_arff_dict(self):
8080
arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
8181
return arff_dict
8282

83-
def _generate_trace_arff_dict(self, model):
83+
def _generate_trace_arff_dict(self):
8484
"""Generates the arff dictionary for uploading predictions to the server.
8585
8686
Assumes that the run has been executed.
@@ -91,32 +91,13 @@ def _generate_trace_arff_dict(self, model):
9191
Dictionary representation of the ARFF file that will be uploaded.
9292
Contains information about the optimization trace.
9393
"""
94-
if self.trace_content is None:
94+
if self.trace_content is None or len(self.trace_content) == 0:
9595
raise ValueError('No trace content avaiable.')
96-
if not isinstance(model, BaseSearchCV):
97-
raise PyOpenMLError('Cannot generate trace on provided classifier. (This should never happen.)')
96+
if len(self.trace_attributes) != len(self.trace_content[0]):
97+
raise ValueError('Trace_attributes and trace_content not compatible')
9898

9999
arff_dict = {}
100-
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
101-
('fold', 'NUMERIC'),
102-
('iteration', 'NUMERIC'),
103-
('evaluation', 'NUMERIC'),
104-
('selected', ['true', 'false'])]
105-
for key in model.cv_results_:
106-
if key.startswith("param_"):
107-
type = 'STRING'
108-
if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
109-
type = ['True', 'False']
110-
elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
111-
type = 'NUMERIC'
112-
else:
113-
values = list(set(model.cv_results_[key])) # unique values
114-
type = [str(i) for i in values]
115-
print(key + ": " + str(type))
116-
117-
attribute = ("parameter_" + key[6:], type)
118-
arff_dict['attributes'].append(attribute)
119-
100+
arff_dict['attributes'] = self.trace_attributes
120101
arff_dict['data'] = self.trace_content
121102
arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'
122103

@@ -145,7 +126,7 @@ def publish(self):
145126
file_elements['predictions'] = ("predictions.arff", predictions)
146127

147128
if self.trace_content is not None:
148-
trace_arff = arff.dumps(self._generate_trace_arff_dict(self.model))
129+
trace_arff = arff.dumps(self._generate_trace_arff_dict())
149130
file_elements['trace'] = ("trace.arff", trace_arff)
150131

151132
return_value = _perform_api_call("/run/", file_elements=file_elements)

tests/test_runs/test_run_functions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,8 @@ def test_run_regression_on_classif_task(self):
4141

4242
clf = LinearRegression()
4343
task = openml.tasks.get_task(task_id)
44-
self.assertRaisesRegexp(AttributeError,
45-
"'LinearRegression' object has no attribute 'classes_'",
46-
openml.runs.run_task, task=task, model=clf)
44+
self.assertRaises(openml.exceptions.PyOpenMLError, openml.runs.run_task,
45+
task=task, model=clf, avoid_duplicate_runs=False)
4746

4847
@mock.patch('openml.flows.sklearn_to_flow')
4948
def test_check_erronous_sklearn_flow_fails(self, sklearn_to_flow_mock):
@@ -124,7 +123,7 @@ def test__run_task_get_arffcontent(self):
124123
clf, task, class_labels)
125124

126125
clf = SGDClassifier(loss='log', random_state=1)
127-
arff_datacontent, arff_tracecontent = openml.runs.functions._run_task_get_arffcontent(
126+
arff_datacontent, arff_tracecontent, _ = openml.runs.functions._run_task_get_arffcontent(
128127
clf, task, class_labels)
129128
# predictions
130129
self.assertIsInstance(arff_datacontent, list)
@@ -291,7 +290,7 @@ def test_run_on_dataset_with_missing_labels(self):
291290
model = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
292291
('Estimator', DecisionTreeClassifier())])
293292

294-
data_content, _ = _run_task_get_arffcontent(model, task, class_labels)
293+
data_content, _, _ = _run_task_get_arffcontent(model, task, class_labels)
295294
# 2 folds, 5 repeats; keep in mind that this task comes from the test
296295
# server, the task on the live server is different
297296
self.assertEqual(len(data_content), 4490)

0 commit comments

Comments
 (0)