Commit 1c285a8

Added the possibility to obtain scikit-learn scores from the predictions ARFF.

1 parent c9dfcde commit 1c285a8

2 files changed: 123 additions & 2 deletions
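For context, the new OpenMLRun.get_metric_score method computes one score per repeat/fold combination from the locally stored predictions. A minimal usage sketch, modeled on the test added below (task 7, the pipeline, and the metric are just examples; Imputer assumes the older sklearn version this commit targets):

    import openml
    import sklearn.metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Imputer

    # execute a run locally so the predictions are available in memory
    clf = Pipeline(steps=[('imputer', Imputer(strategy='median')),
                          ('estimator', RandomForestClassifier())])
    task = openml.tasks.get_task(7)
    run = openml.runs.run_model_on_task(task, clf)

    # one float per repeat/fold combination (e.g. 10 for a 10-fold CV task)
    scores = run.get_metric_score(sklearn.metrics.accuracy_score)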

openml/runs/run.py (79 additions & 2 deletions)
@@ -1,4 +1,4 @@
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 import json
 import sys
 import time
@@ -8,7 +8,7 @@
 
 import openml
 from ..tasks import get_task
-from .._api_calls import _perform_api_call
+from .._api_calls import _perform_api_call, _file_id_to_url, _read_url_files
 from ..exceptions import PyOpenMLError
 
 class OpenMLRun(object):
@@ -106,6 +106,83 @@ def _generate_trace_arff_dict(self):
 
         return arff_dict
 
+    def get_metric_score(self, sklearn_fn, kwargs={}):
+        '''Calculates metric scores based on the predicted values. Assumes
+        the run has been executed locally (and contains run_data).
+        Furthermore, it assumes that the 'correct' field has been set
+        (which is automatically the case for local runs).
+
+        Parameters
+        ----------
+        sklearn_fn : function
+            a function pointer to a sklearn function that
+            accepts y_true, y_pred and **kwargs
+        kwargs : dict
+            keyword arguments passed on to sklearn_fn
+
+        Returns
+        -------
+        scores : list
+            a list of floats, of length num_folds * num_repeats
+        '''
+        if self.data_content is not None:
+            predictions_arff = self._generate_arff_dict()
+        elif 'predictions' in self.output_files:
+            raise ValueError('Not implemented yet: this function can '
+                             'currently only be used on locally executed '
+                             'runs (contributor needed!)')
+        else:
+            raise ValueError('Run should have been executed locally.')
+
+        def _attribute_list_to_dict(attribute_list):
+            # convenience function: maps each attribute name to its
+            # column index in the ARFF data
+            res = dict()
+            for idx in range(len(attribute_list)):
+                res[attribute_list[idx][0]] = idx
+            return res
+        attribute_dict = _attribute_list_to_dict(predictions_arff['attributes'])
+
+        # might throw a KeyError!
+        predicted_idx = attribute_dict['prediction']
+        correct_idx = attribute_dict['correct']
+        repeat_idx = attribute_dict['repeat']
+        fold_idx = attribute_dict['fold']
+        sample_idx = attribute_dict['sample']  # TODO: this one might be zero
+
+        if predictions_arff['attributes'][predicted_idx][1] != \
+                predictions_arff['attributes'][correct_idx][1]:
+            pred = predictions_arff['attributes'][predicted_idx][1]
+            corr = predictions_arff['attributes'][correct_idx][1]
+            raise ValueError('Predicted and correct do not have equal values: '
+                             '%s vs. %s' % (str(pred), str(corr)))
+
+        # TODO: these could be cached
+        values_predict = {}
+        values_correct = {}
+        for line in predictions_arff['data']:
+            rep = line[repeat_idx]
+            fold = line[fold_idx]
+            samp = line[sample_idx]
+
+            # map the class labels to their indices in the attribute value
+            # list (TODO: can be sped up by preprocessing, but OK for now)
+            prediction = predictions_arff['attributes'][predicted_idx][1].index(line[predicted_idx])
+            correct = predictions_arff['attributes'][correct_idx][1].index(line[correct_idx])
+            if rep not in values_predict:
+                values_predict[rep] = dict()
+                values_correct[rep] = dict()
+            if fold not in values_predict[rep]:
+                values_predict[rep][fold] = dict()
+                values_correct[rep][fold] = dict()
+            if samp not in values_predict[rep][fold]:
+                values_predict[rep][fold][samp] = []
+                values_correct[rep][fold][samp] = []
+
+            values_predict[rep][fold][samp].append(prediction)
+            values_correct[rep][fold][samp].append(correct)
+
+        scores = []
+        for rep in values_predict.keys():
+            for fold in values_predict[rep].keys():
+                # evaluate on the last sample of each fold only (for
+                # non-learning-curve tasks there is just one sample)
+                last_sample = len(values_predict[rep][fold]) - 1
+                y_pred = values_predict[rep][fold][last_sample]
+                y_true = values_correct[rep][fold][last_sample]
+                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
+        return scores
+
     def publish(self):
         """Publish a run to the OpenML server.
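As an aside, get_metric_score works on the predictions ARFF dict, whose attributes are (name, values/type) pairs; the _attribute_list_to_dict helper maps each attribute name to its column index in the data rows. A small illustration of the assumed layout (abridged; a real predictions file carries additional columns such as the row id and per-class confidences):

    predictions_arff = {
        'attributes': [('repeat', 'NUMERIC'),
                       ('fold', 'NUMERIC'),
                       ('sample', 'NUMERIC'),
                       ('prediction', ['A', 'B']),
                       ('correct', ['A', 'B'])],
        'data': [[0, 0, 0, 'A', 'B'],   # repeat 0, fold 0: predicted A, was B
                 [0, 1, 0, 'B', 'B']],  # repeat 0, fold 1: predicted B, was B
    }
    # _attribute_list_to_dict(predictions_arff['attributes'])
    # -> {'repeat': 0, 'fold': 1, 'sample': 2, 'prediction': 3, 'correct': 4}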

tests/test_runs/test_run_functions.py (44 additions & 0 deletions)
@@ -327,6 +327,16 @@ def test_run_and_upload(self):
         for clf, rsv in zip(clfs, random_state_fixtures):
             run = self._perform_run(task_id, num_test_instances, clf,
                                     random_state_value=rsv)
+
+            # obtain the accuracy scores using get_metric_score:
+            accuracy_scores = run.get_metric_score(sklearn.metrics.accuracy_score)
+            # compare them with the scores in the user-defined measures
+            accuracy_scores_provided = []
+            for rep in run.fold_evaluations['predictive_accuracy'].keys():
+                for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                    accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+            self.assertEqual(sum(accuracy_scores_provided), sum(accuracy_scores))
+
             if isinstance(clf, BaseSearchCV):
                 if isinstance(clf, GridSearchCV):
                     grid_iterations = 1
@@ -403,6 +413,40 @@ def test_initialize_cv_from_run(self):
         self.assertEquals(modelS.cv.random_state, 62501)
         self.assertEqual(modelR.cv.random_state, 62501)
 
+    def test_get_run_metric_score(self):
+
+        # construct a scikit-learn classifier
+        clf = Pipeline(steps=[('imputer', Imputer(strategy='median')),
+                              ('estimator', RandomForestClassifier())])
+
+        # download the task
+        task = openml.tasks.get_task(7)
+
+        # invoke the OpenML run
+        run = openml.runs.run_model_on_task(task, clf)
+
+        # compare with the scores in the user-defined measures
+        accuracy_scores_provided = []
+        for rep in run.fold_evaluations['predictive_accuracy'].keys():
+            for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+        accuracy_scores = run.get_metric_score(sklearn.metrics.accuracy_score)
+        self.assertEqual(sum(accuracy_scores_provided), sum(accuracy_scores))
+
+        # also check whether we can obtain some other scores
+        # TODO: how to do AUC?
+        tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
+                 (sklearn.metrics.auc, {}),
+                 (sklearn.metrics.average_precision_score, {}),
+                 (sklearn.metrics.jaccard_similarity_score, {}),
+                 (sklearn.metrics.precision_score, {'average': 'macro'}),
+                 (sklearn.metrics.brier_score_loss, {})]
+        for fn, fn_kwargs in tests:
+            alt_scores = run.get_metric_score(fn, fn_kwargs)
+            self.assertEqual(len(alt_scores), 10)
+            for idx in range(len(alt_scores)):
+                self.assertGreaterEqual(alt_scores[idx], 0)
+                self.assertLessEqual(alt_scores[idx], 1)
+
     def test_initialize_model_from_run(self):
         clf = sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                                ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
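Beyond the test suite, the same (function, kwargs) pairing can score a single run with several sklearn metrics at once; a short sketch, reusing the run from the first example above:

    metrics = [(sklearn.metrics.accuracy_score, {}),
               (sklearn.metrics.precision_score, {'average': 'macro'})]
    for fn, fn_kwargs in metrics:
        per_fold = run.get_metric_score(fn, fn_kwargs)
        # average the per-fold scores over all repeat/fold combinations
        print(fn.__name__, sum(per_fold) / len(per_fold))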
