Skip to content

Commit a5cb405

Browse files
authored
Merge pull request #227 from openml/issue214
Issue214
2 parents 8f8ec4e + 456f919 commit a5cb405

4 files changed

Lines changed: 52 additions & 17 deletions

File tree

openml/runs/functions.py

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@
1616
from ..util import URLError, version_complies
1717
from ..tasks.functions import _create_task_from_xml
1818
from .._api_calls import _perform_api_call
19-
from .run import OpenMLRun
19+
from .run import OpenMLRun, _get_version_information
2020

2121

2222
# _get_version_information, _to_dict and _create_setup_string are in run.py to avoid
2323
# circular imports
2424

2525

2626

27-
def run_task(task, model, avoid_duplicate_runs=True):
27+
def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
2828
"""Performs a CV run on the dataset of the given task, using the split.
2929
3030
Parameters
@@ -35,13 +35,16 @@ def run_task(task, model, avoid_duplicate_runs=True):
3535
a model which has a function fit(X,Y) and predict(X),
3636
all supervised estimators of scikit learn follow this definition of a model [1]
3737
[1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
38-
38+
flow_tags : list(str)
39+
a list of tags that the flow should have at creation
3940
4041
Returns
4142
-------
4243
run : OpenMLRun
4344
Result of the run.
4445
"""
46+
if flow_tags is not None and not isinstance(flow_tags, list):
47+
raise ValueError("flow_tags should be list")
4548
# TODO move this into its own module. While it somehow belongs here, it
4649
# adds quite a lot of functionality which is better suited in other places!
4750
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
@@ -66,8 +69,10 @@ def run_task(task, model, avoid_duplicate_runs=True):
6669
raise ValueError('The task has no class labels. This method currently '
6770
'only works for tasks with class labels.')
6871

72+
run_environment = _get_version_information()
73+
tags = ['openml-python', run_environment[1]]
6974
# execute the run
70-
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
75+
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model, tags=tags)
7176
run.data_content, run.trace_content, run.trace_attributes = _run_task_get_arffcontent(model, task, class_labels)
7277

7378
if flow_id == False:
@@ -176,18 +181,20 @@ def _run_task_get_arffcontent(model, task, class_labels):
176181
if version_complies(3, 3):
177182
modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
178183
user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
179-
180-
if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
181-
arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
182-
model_classes = model_fold.best_estimator_.classes_
183-
else:
184-
model_classes = model_fold.classes_
185184
except AttributeError as e:
186185
# typically happens when training a regressor on classification task
187186
raise PyOpenMLError(str(e))
187+
188+
# extract trace
189+
if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
190+
arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
191+
model_classes = model_fold.best_estimator_.classes_
192+
else:
193+
model_classes = model_fold.classes_
188194

189195
if version_complies(3, 3):
190196
modelpredict_starttime = time.process_time()
197+
191198
ProbaY = model_fold.predict_proba(testX)
192199
PredY = model_fold.predict(testX)
193200
if version_complies(3, 3):
@@ -215,6 +222,12 @@ def _run_task_get_arffcontent(model, task, class_labels):
215222

216223

217224
def _extract_arfftrace(model, rep_no, fold_no):
225+
if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
226+
raise ValueError('model should be instance of'\
227+
' sklearn.model_selection._search.BaseSearchCV')
228+
if not hasattr(model, 'cv_results_'):
229+
raise ValueError('model should contain `cv_results_`')
230+
218231
arff_tracecontent = []
219232
for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
220233
# we use the string values for True and False, as it is defined in this way by the OpenML server
@@ -230,6 +243,12 @@ def _extract_arfftrace(model, rep_no, fold_no):
230243
return arff_tracecontent
231244

232245
def _extract_arfftrace_attributes(model):
246+
if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
247+
raise ValueError('model should be instance of'\
248+
' sklearn.model_selection._search.BaseSearchCV')
249+
if not hasattr(model, 'cv_results_'):
250+
raise ValueError('model should contain `cv_results_`')
251+
233252
# attributes that will be in trace arff, regardless of the model
234253
trace_attributes = [('repeat', 'NUMERIC'),
235254
('fold', 'NUMERIC'),
@@ -391,6 +410,15 @@ def _create_run_from_xml(xml):
391410
evaluation_flows[key] = flow_id
392411

393412
evaluation_flows[key] = flow_id
413+
tags = None
414+
if 'oml:tag' in run:
415+
if isinstance(run['oml:tag'], str):
416+
tags = [run['oml:tag']]
417+
elif isinstance(run['oml:tag'], list):
418+
tags = run['oml:tag']
419+
else:
420+
raise ValueError('Received not string and non list as tag item')
421+
394422

395423
return OpenMLRun(run_id=run_id, uploader=uploader,
396424
uploader_name=uploader_name, task_id=task_id,
@@ -401,7 +429,7 @@ def _create_run_from_xml(xml):
401429
parameter_settings=parameters,
402430
dataset_id=dataset_id, predictions_url=predictions_url,
403431
evaluations=evaluations,
404-
detailed_evaluations=detailed_evaluations)
432+
detailed_evaluations=detailed_evaluations, tags=tags)
405433

406434

407435
def _get_cached_run(run_id):

openml/runs/run.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
4848
self.flow = flow
4949
self.run_id = run_id
5050
self.model = model
51+
self.tags = tags
5152

5253
def _generate_arff_dict(self):
5354
"""Generates the arff dictionary for uploading predictions to the server.
@@ -142,7 +143,6 @@ def _create_description_xml(self):
142143
xml_string : string
143144
XML description of run.
144145
"""
145-
run_environment = _get_version_information()
146146

147147
# TODO: don't we have flow object in data structure? Use this one
148148
downloaded_flow = openml.flows.get_flow(self.flow_id)
@@ -155,13 +155,12 @@ def _create_description_xml(self):
155155
# ' ', '_').replace('/', '-').replace(':', '.')
156156
# tags = run_environment + [well_formatted_time] + ['run_task'] + \
157157
# [self.model.__module__ + "." + self.model.__class__.__name__]
158-
tags = ['openml-python', run_environment[1]]
159158
description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
160159
setup_string=_create_setup_string(self.model),
161160
parameter_settings=openml_param_settings,
162161
error_message=self.error_message,
163162
detailed_evaluations=self.detailed_evaluations,
164-
tags=tags)
163+
tags=self.tags)
165164
description_xml = xmltodict.unparse(description, pretty=True)
166165
return description_xml
167166

tests/test_flows/test_flow.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ def test_sklearn_to_upload_to_flow(self):
226226
estimator=model, param_distributions=parameter_grid, cv=cv)
227227
rs.fit(X, y)
228228
flow = openml.flows.sklearn_to_flow(rs)
229+
flow.tags.extend(['openml-python', 'unittest'])
229230

230231
# Add the sentinel to all name strings in all subflows. Adds it to
231232
# name to make it easier in the web gui to see that the flow is only
@@ -281,5 +282,6 @@ def test_sklearn_to_upload_to_flow(self):
281282
% sentinel
282283

283284
self.assertEqual(new_flow.name, fixture_name)
284-
285+
self.assertTrue('openml-python' in new_flow.tags)
286+
self.assertTrue('unittest' in new_flow.tags)
285287
new_flow.model.fit(X, y)

tests/test_runs/test_run_functions.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_run_regression_on_classif_task(self):
4141

4242
clf = LinearRegression()
4343
task = openml.tasks.get_task(task_id)
44-
self.assertRaises(openml.exceptions.PyOpenMLError, openml.runs.run_task,
44+
self.assertRaises(AttributeError, openml.runs.run_task,
4545
task=task, model=clf, avoid_duplicate_runs=False)
4646

4747
@mock.patch('openml.flows.sklearn_to_flow')
@@ -60,7 +60,10 @@ def test_run_diabetes(self):
6060
num_instances = 768
6161

6262
clf = LogisticRegression()
63-
self._perform_run(task_id,num_instances, clf)
63+
res = self._perform_run(task_id,num_instances, clf)
64+
65+
downloaded = openml.runs.get_run(res.run_id)
66+
assert('openml-python' in downloaded.tags)
6467

6568
def test_run_optimize_randomforest_iris(self):
6669
task_id = 115
@@ -80,6 +83,7 @@ def test_run_optimize_randomforest_iris(self):
8083
n_iter=num_iterations)
8184

8285
run = self._perform_run(task_id, num_instances, random_search)
86+
print(run.trace_content)
8387
self.assertEqual(len(run.trace_content), num_iterations * num_folds)
8488

8589
def test_run_optimize_bagging_iris(self):
@@ -166,6 +170,8 @@ def test_get_run(self):
166170
(8, 0.56759),
167171
(9, 0.64621)]:
168172
self.assertEqual(run.detailed_evaluations['f_measure'][0][i], value)
173+
assert('weka' in run.tags)
174+
assert('stacking' in run.tags)
169175

170176
def _check_run(self, run):
171177
self.assertIsInstance(run, dict)

0 commit comments

Comments
 (0)