Skip to content

Commit c9dfcde

Browse files
committed
added more metrics to local run evaluations
1 parent 059bad7 commit c9dfcde

2 files changed

Lines changed: 38 additions & 9 deletions

File tree

openml/runs/functions.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import sklearn.pipeline
1111
import six
1212
import xmltodict
13+
import sklearn.metrics
1314

1415
import openml
1516
import openml.utils
@@ -113,7 +114,6 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
113114
else:
114115
run.fold_evaluations = fold_evaluations
115116

116-
117117
config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
118118

119119
return run
@@ -427,6 +427,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
427427

428428
ProbaY = model_fold.predict_proba(testX)
429429
PredY = model_fold.predict(testX)
430+
431+
# add client-side calculated metrics. These might be used on the server as a consistency check
432+
def _calculate_local_measure(sklearn_fn, openml_name):
433+
user_defined_measures_fold[openml_name][rep_no][fold_no] = \
434+
sklearn_fn(testY, PredY)
435+
user_defined_measures_sample[openml_name][rep_no][fold_no][sample_no] = \
436+
sklearn_fn(testY, PredY)
437+
438+
_calculate_local_measure(sklearn.metrics.accuracy_score, 'predictive_accuracy')
439+
430440
if can_measure_runtime:
431441
modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
432442
user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
@@ -457,6 +467,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
457467
user_defined_measures_sample
458468

459469

470+
460471
def _extract_arfftrace(model, rep_no, fold_no):
461472
if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
462473
raise ValueError('model should be instance of'\

tests/test_runs/test_run_functions.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,22 +166,32 @@ def _check_fold_evaluations(self, fold_evaluations, num_repeats, num_folds, max_
166166
condition outside of this function. )
167167
default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic
168168
'''
169-
timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
169+
170+
# a dict mapping from openml measure to a tuple with the minimum and maximum allowed values
171+
check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
172+
'usercpu_time_millis_training': (0, max_time_allowed), # should take at least one millisecond (?)
173+
'usercpu_time_millis': (0, max_time_allowed),
174+
'predictive_accuracy': (0, 1)}
170175

171176
self.assertIsInstance(fold_evaluations, dict)
172177
if sys.version_info[:2] >= (3, 3):
173-
self.assertEquals(set(fold_evaluations.keys()), timing_measures)
174-
for measure in timing_measures:
178+
# this only holds if we are allowed to record time (otherwise some are missing)
179+
self.assertEquals(set(fold_evaluations.keys()), set(check_measures.keys()))
180+
181+
for measure in check_measures.keys():
182+
if measure in fold_evaluations:
175183
num_rep_entrees = len(fold_evaluations[measure])
176184
self.assertEquals(num_rep_entrees, num_repeats)
185+
min_val = check_measures[measure][0]
186+
max_val = check_measures[measure][1]
177187
for rep in range(num_rep_entrees):
178188
num_fold_entrees = len(fold_evaluations[measure][rep])
179189
self.assertEquals(num_fold_entrees, num_folds)
180190
for fold in range(num_fold_entrees):
181191
evaluation = fold_evaluations[measure][rep][fold]
182192
self.assertIsInstance(evaluation, float)
183-
self.assertGreater(evaluation, 0) # should take at least one millisecond (?)
184-
self.assertLess(evaluation, max_time_allowed)
193+
self.assertGreaterEqual(evaluation, min_val)
194+
self.assertLessEqual(evaluation, max_val)
185195

186196

187197
def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
@@ -193,12 +203,20 @@ def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds,
193203
condition outside of this function. )
194204
default max_time_allowed (per fold, in milli seconds) = 1 minute, quite pessimistic
195205
'''
196-
timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
206+
207+
# a dict mapping from openml measure to a tuple with the minimum and maximum allowed values
208+
check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
209+
'usercpu_time_millis_training': (0, max_time_allowed), # should take at least one millisecond (?)
210+
'usercpu_time_millis': (0, max_time_allowed),
211+
'predictive_accuracy': (0, 1)}
197212

198213
self.assertIsInstance(sample_evaluations, dict)
199214
if sys.version_info[:2] >= (3, 3):
200-
self.assertEquals(set(sample_evaluations.keys()), timing_measures)
201-
for measure in timing_measures:
215+
# this only holds if we are allowed to record time (otherwise some are missing)
216+
self.assertEquals(set(sample_evaluations.keys()), set(check_measures.keys()))
217+
218+
for measure in check_measures.keys():
219+
if measure in sample_evaluations:
202220
num_rep_entrees = len(sample_evaluations[measure])
203221
self.assertEquals(num_rep_entrees, num_repeats)
204222
for rep in range(num_rep_entrees):

0 commit comments

Comments (0)