Commit 1c285a8

Added the possibility to obtain scikit-learn scores from the predictions ARFF.

1 parent c9dfcde commit 1c285a8

2 files changed: 123 additions & 2 deletions
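For context, the new OpenMLRun.get_metric_score method computes one score per repeat/fold combination from the locally stored predictions. A minimal usage sketch, modeled on the test added below (task 7, the pipeline, and the metric are just examples; Imputer assumes the older sklearn version this commit targets):

    import openml
    import sklearn.metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Imputer

    # execute a run locally so the predictions are available in memory
    clf = Pipeline(steps=[('imputer', Imputer(strategy='median')),
                          ('estimator', RandomForestClassifier())])
    task = openml.tasks.get_task(7)
    run = openml.runs.run_model_on_task(task, clf)

    # one float per repeat/fold combination (e.g. 10 for a 10-fold CV task)
    scores = run.get_metric_score(sklearn.metrics.accuracy_score)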

openml/runs/run.py (79 additions & 2 deletions)
@@ -1,4 +1,4 @@
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 import json
 import sys
 import time
@@ -8,7 +8,7 @@
 
 import openml
 from ..tasks import get_task
-from .._api_calls import _perform_api_call
+from .._api_calls import _perform_api_call, _file_id_to_url, _read_url_files
 from ..exceptions import PyOpenMLError
 
 class OpenMLRun(object):
@@ -106,6 +106,83 @@ def _generate_trace_arff_dict(self):
 
         return arff_dict
 
+    def get_metric_score(self, sklearn_fn, kwargs={}):
+        '''Calculates metric scores based on the predicted values. Assumes
+        the run has been executed locally (and contains run_data).
+        Furthermore, it assumes that the 'correct' field has been set
+        (which is automatically the case for local runs).
+
+        Parameters
+        ----------
+        sklearn_fn : function
+            a function pointer to a sklearn function that
+            accepts y_true, y_pred and **kwargs
+        kwargs : dict
+            keyword arguments passed on to sklearn_fn
+
+        Returns
+        -------
+        scores : list
+            a list of floats, of length num_folds * num_repeats
+        '''
+        if self.data_content is not None:
+            predictions_arff = self._generate_arff_dict()
+        elif 'predictions' in self.output_files:
+            raise ValueError('Not implemented yet: this function can '
+                             'currently only be used on locally executed '
+                             'runs (contributor needed!)')
+        else:
+            raise ValueError('Run should have been executed locally.')
+
+        def _attribute_list_to_dict(attribute_list):
+            # convenience function: maps each attribute name to its
+            # column index in the ARFF data
+            res = dict()
+            for idx in range(len(attribute_list)):
+                res[attribute_list[idx][0]] = idx
+            return res
+        attribute_dict = _attribute_list_to_dict(predictions_arff['attributes'])
+
+        # might throw a KeyError!
+        predicted_idx = attribute_dict['prediction']
+        correct_idx = attribute_dict['correct']
+        repeat_idx = attribute_dict['repeat']
+        fold_idx = attribute_dict['fold']
+        sample_idx = attribute_dict['sample']  # TODO: this one might be zero
+
+        if predictions_arff['attributes'][predicted_idx][1] != \
+                predictions_arff['attributes'][correct_idx][1]:
+            pred = predictions_arff['attributes'][predicted_idx][1]
+            corr = predictions_arff['attributes'][correct_idx][1]
+            raise ValueError('Predicted and correct do not have equal values: '
+                             '%s vs. %s' % (str(pred), str(corr)))
+
+        # TODO: these could be cached
+        values_predict = {}
+        values_correct = {}
+        for line in predictions_arff['data']:
+            rep = line[repeat_idx]
+            fold = line[fold_idx]
+            samp = line[sample_idx]
+
+            # map the class labels to their indices in the attribute value
+            # list (TODO: can be sped up by preprocessing, but OK for now)
+            prediction = predictions_arff['attributes'][predicted_idx][1].index(line[predicted_idx])
+            correct = predictions_arff['attributes'][correct_idx][1].index(line[correct_idx])
+            if rep not in values_predict:
+                values_predict[rep] = dict()
+                values_correct[rep] = dict()
+            if fold not in values_predict[rep]:
+                values_predict[rep][fold] = dict()
+                values_correct[rep][fold] = dict()
+            if samp not in values_predict[rep][fold]:
+                values_predict[rep][fold][samp] = []
+                values_correct[rep][fold][samp] = []
+
+            values_predict[rep][fold][samp].append(prediction)
+            values_correct[rep][fold][samp].append(correct)
+
+        scores = []
+        for rep in values_predict.keys():
+            for fold in values_predict[rep].keys():
+                # evaluate on the last sample of each fold only (for
+                # non-learning-curve tasks there is just one sample)
+                last_sample = len(values_predict[rep][fold]) - 1
+                y_pred = values_predict[rep][fold][last_sample]
+                y_true = values_correct[rep][fold][last_sample]
+                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
+        return scores
+
     def publish(self):
         """Publish a run to the OpenML server.
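As an aside, get_metric_score works on the predictions ARFF dict, whose attributes are (name, values/type) pairs; the _attribute_list_to_dict helper maps each attribute name to its column index in the data rows. A small illustration of the assumed layout (abridged; a real predictions file carries additional columns such as the row id and per-class confidences):

    predictions_arff = {
        'attributes': [('repeat', 'NUMERIC'),
                       ('fold', 'NUMERIC'),
                       ('sample', 'NUMERIC'),
                       ('prediction', ['A', 'B']),
                       ('correct', ['A', 'B'])],
        'data': [[0, 0, 0, 'A', 'B'],   # repeat 0, fold 0: predicted A, was B
                 [0, 1, 0, 'B', 'B']],  # repeat 0, fold 1: predicted B, was B
    }
    # _attribute_list_to_dict(predictions_arff['attributes'])
    # -> {'repeat': 0, 'fold': 1, 'sample': 2, 'prediction': 3, 'correct': 4}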

tests/test_runs/test_run_functions.py (44 additions & 0 deletions)
@@ -327,6 +327,16 @@ def test_run_and_upload(self):
         for clf, rsv in zip(clfs, random_state_fixtures):
             run = self._perform_run(task_id, num_test_instances, clf,
                                     random_state_value=rsv)
+
+            # obtain the accuracy scores using get_metric_score:
+            accuracy_scores = run.get_metric_score(sklearn.metrics.accuracy_score)
+            # compare them with the scores in the user-defined measures
+            accuracy_scores_provided = []
+            for rep in run.fold_evaluations['predictive_accuracy'].keys():
+                for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                    accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+            self.assertEqual(sum(accuracy_scores_provided), sum(accuracy_scores))
+
             if isinstance(clf, BaseSearchCV):
                 if isinstance(clf, GridSearchCV):
                     grid_iterations = 1
@@ -403,6 +413,40 @@ def test_initialize_cv_from_run(self):
         self.assertEquals(modelS.cv.random_state, 62501)
         self.assertEqual(modelR.cv.random_state, 62501)
 
+    def test_get_run_metric_score(self):
+
+        # construct a scikit-learn classifier
+        clf = Pipeline(steps=[('imputer', Imputer(strategy='median')),
+                              ('estimator', RandomForestClassifier())])
+
+        # download the task
+        task = openml.tasks.get_task(7)
+
+        # invoke the OpenML run
+        run = openml.runs.run_model_on_task(task, clf)
+
+        # compare with the scores in the user-defined measures
+        accuracy_scores_provided = []
+        for rep in run.fold_evaluations['predictive_accuracy'].keys():
+            for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+        accuracy_scores = run.get_metric_score(sklearn.metrics.accuracy_score)
+        self.assertEqual(sum(accuracy_scores_provided), sum(accuracy_scores))
+
+        # also check whether we can obtain some other scores
+        # TODO: how to do AUC?
+        tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
+                 (sklearn.metrics.auc, {}),
+                 (sklearn.metrics.average_precision_score, {}),
+                 (sklearn.metrics.jaccard_similarity_score, {}),
+                 (sklearn.metrics.precision_score, {'average': 'macro'}),
+                 (sklearn.metrics.brier_score_loss, {})]
+        for fn, fn_kwargs in tests:
+            alt_scores = run.get_metric_score(fn, fn_kwargs)
+            self.assertEqual(len(alt_scores), 10)
+            for idx in range(len(alt_scores)):
+                self.assertGreaterEqual(alt_scores[idx], 0)
+                self.assertLessEqual(alt_scores[idx], 1)
+
     def test_initialize_model_from_run(self):
         clf = sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                                ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
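Beyond the test suite, the same (function, kwargs) pairing can score a single run with several sklearn metrics at once; a short sketch, reusing the run from the first example above:

    metrics = [(sklearn.metrics.accuracy_score, {}),
               (sklearn.metrics.precision_score, {'average': 'macro'})]
    for fn, fn_kwargs in metrics:
        per_fold = run.get_metric_score(fn, fn_kwargs)
        # average the per-fold scores over all repeat/fold combinations
        print(fn.__name__, sum(per_fold) / len(per_fold))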
