Skip to content

Commit c9dfcde

Browse files
committed
added more metrics to local run evaluations
1 parent 059bad7 commit c9dfcde

2 files changed

Lines changed: 38 additions & 9 deletions

File tree

openml/runs/functions.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import sklearn.pipeline
1111
import six
1212
import xmltodict
13+
import sklearn.metrics
1314

1415
import openml
1516
import openml.utils
@@ -113,7 +114,6 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
113114
else:
114115
run.fold_evaluations = fold_evaluations
115116

116-
117117
config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
118118

119119
return run
@@ -427,6 +427,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
427427

428428
ProbaY = model_fold.predict_proba(testX)
429429
PredY = model_fold.predict(testX)
430+
431+
# add client-side calculated metrics. These might be used on the server as a consistency check
432+
def _calculate_local_measure(sklearn_fn, openml_name):
433+
user_defined_measures_fold[openml_name][rep_no][fold_no] = \
434+
sklearn_fn(testY, PredY)
435+
user_defined_measures_sample[openml_name][rep_no][fold_no][sample_no] = \
436+
sklearn_fn(testY, PredY)
437+
438+
_calculate_local_measure(sklearn.metrics.accuracy_score, 'predictive_accuracy')
439+
430440
if can_measure_runtime:
431441
modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
432442
user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
@@ -457,6 +467,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
457467
user_defined_measures_sample
458468

459469

470+
460471
def _extract_arfftrace(model, rep_no, fold_no):
461472
if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
462473
raise ValueError('model should be instance of'\

tests/test_runs/test_run_functions.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,22 +166,32 @@ def _check_fold_evaluations(self, fold_evaluations, num_repeats, num_folds, max_
166166
condition outside of this function. )
167167
default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic
168168
'''
169-
timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
169+
170+
# a dict mapping from openml measure to a tuple with the minimum and maximum allowed values
171+
check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
172+
'usercpu_time_millis_training': (0, max_time_allowed), # should take at least one millisecond (?)
173+
'usercpu_time_millis': (0, max_time_allowed),
174+
'predictive_accuracy': (0, 1)}
170175

171176
self.assertIsInstance(fold_evaluations, dict)
172177
if sys.version_info[:2] >= (3, 3):
173-
self.assertEquals(set(fold_evaluations.keys()), timing_measures)
174-
for measure in timing_measures:
178+
# this only holds if we are allowed to record time (otherwise some are missing)
179+
self.assertEquals(set(fold_evaluations.keys()), set(check_measures.keys()))
180+
181+
for measure in check_measures.keys():
182+
if measure in fold_evaluations:
175183
num_rep_entrees = len(fold_evaluations[measure])
176184
self.assertEquals(num_rep_entrees, num_repeats)
185+
min_val = check_measures[measure][0]
186+
max_val = check_measures[measure][1]
177187
for rep in range(num_rep_entrees):
178188
num_fold_entrees = len(fold_evaluations[measure][rep])
179189
self.assertEquals(num_fold_entrees, num_folds)
180190
for fold in range(num_fold_entrees):
181191
evaluation = fold_evaluations[measure][rep][fold]
182192
self.assertIsInstance(evaluation, float)
183-
self.assertGreater(evaluation, 0) # should take at least one millisecond (?)
184-
self.assertLess(evaluation, max_time_allowed)
193+
self.assertGreaterEqual(evaluation, min_val)
194+
self.assertLessEqual(evaluation, max_val)
185195

186196

187197
def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
@@ -193,12 +203,20 @@ def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds,
193203
condition outside of this function. )
194204
default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic
195205
'''
196-
timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
206+
207+
# a dict mapping from openml measure to a tuple with the minimum and maximum allowed values
208+
check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
209+
'usercpu_time_millis_training': (0, max_time_allowed), # should take at least one millisecond (?)
210+
'usercpu_time_millis': (0, max_time_allowed),
211+
'predictive_accuracy': (0, 1)}
197212

198213
self.assertIsInstance(sample_evaluations, dict)
199214
if sys.version_info[:2] >= (3, 3):
200-
self.assertEquals(set(sample_evaluations.keys()), timing_measures)
201-
for measure in timing_measures:
215+
# this only holds if we are allowed to record time (otherwise some are missing)
216+
self.assertEquals(set(sample_evaluations.keys()), set(check_measures.keys()))
217+
218+
for measure in check_measures.keys():
219+
if measure in sample_evaluations:
202220
num_rep_entrees = len(sample_evaluations[measure])
203221
self.assertEquals(num_rep_entrees, num_repeats)
204222
for rep in range(num_rep_entrees):

0 commit comments

Comments (0)