Skip to content

Commit edbad39

Browse files
committed
updated to work with unit tests (+ small bugfixes)
1 parent b8fdd17 commit edbad39

5 files changed

Lines changed: 54 additions & 63 deletions

File tree

openml/runs/functions.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ def _create_run_from_xml(xml):
594594

595595
files = dict()
596596
evaluations = dict()
597-
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
597+
fold_evaluations = defaultdict(lambda: defaultdict(dict))
598+
sample_evaluations = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
598599
if 'oml:output_data' not in run:
599600
raise ValueError('Run does not contain output_data (OpenML server error?)')
600601
else:
@@ -621,11 +622,18 @@ def _create_run_from_xml(xml):
621622
else:
622623
raise ValueError('Could not find keys "value" or "array_data" '
623624
'in %s' % str(evaluation_dict.keys()))
624-
625-
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
625+
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict and '@sample' in evaluation_dict:
626+
repeat = int(evaluation_dict['@repeat'])
627+
fold = int(evaluation_dict['@fold'])
628+
sample = int(evaluation_dict['@sample'])
629+
repeat_dict = sample_evaluations[key]
630+
fold_dict = repeat_dict[repeat]
631+
sample_dict = fold_dict[fold]
632+
sample_dict[sample] = value
633+
elif '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
626634
repeat = int(evaluation_dict['@repeat'])
627635
fold = int(evaluation_dict['@fold'])
628-
repeat_dict = detailed_evaluations[key]
636+
repeat_dict = fold_evaluations[key]
629637
fold_dict = repeat_dict[repeat]
630638
fold_dict[fold] = value
631639
else:
@@ -652,7 +660,9 @@ def _create_run_from_xml(xml):
652660
parameter_settings=parameters,
653661
dataset_id=dataset_id, output_files=files,
654662
evaluations=evaluations,
655-
detailed_evaluations=detailed_evaluations, tags=tags)
663+
fold_evaluations=fold_evaluations,
664+
sample_evaluations=sample_evaluations,
665+
tags=tags)
656666

657667
def _create_trace_from_description(xml):
658668
result_dict = xmltodict.parse(xml)['oml:trace']

openml/tasks/split.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,13 @@ def __eq__(self, other):
4848
return False
4949
else:
5050
for fold in self.split[repetition]:
51-
if np.all(self.split[repetition][fold].test !=
52-
other.split[repetition][fold].test)\
53-
and \
54-
np.all(self.split[repetition][fold].train
55-
!= other.split[repetition][fold].train):
56-
return False
51+
for sample in self.split[repetition][fold]:
52+
if np.all(self.split[repetition][fold][sample].test !=
53+
other.split[repetition][fold][sample].test)\
54+
and \
55+
np.all(self.split[repetition][fold][sample].train
56+
!= other.split[repetition][fold][sample].train):
57+
return False
5758
return True
5859

5960
@classmethod

tests/test_runs/test_run_functions.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def _remove_random_state(flow):
158158
return run
159159

160160

161-
def _check_detailed_evaluations(self, detailed_evaluations, num_repeats, num_folds, max_time_allowed=60000):
161+
def _check_fold_evaluations(self, fold_evaluations, num_repeats, num_folds, max_time_allowed=60000):
162162
'''
163163
Checks whether the right timing measures are attached to the run (before upload).
164164
Test is only performed for versions >= Python3.3
@@ -169,17 +169,17 @@ def _check_detailed_evaluations(self, detailed_evaluations, num_repeats, num_fol
169169
'''
170170
timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
171171

172-
self.assertIsInstance(detailed_evaluations, dict)
172+
self.assertIsInstance(fold_evaluations, dict)
173173
if sys.version_info[:2] >= (3, 3):
174-
self.assertEquals(set(detailed_evaluations.keys()), timing_measures)
174+
self.assertEquals(set(fold_evaluations.keys()), timing_measures)
175175
for measure in timing_measures:
176-
num_rep_entrees = len(detailed_evaluations[measure])
176+
num_rep_entrees = len(fold_evaluations[measure])
177177
self.assertEquals(num_rep_entrees, num_repeats)
178178
for rep in range(num_rep_entrees):
179-
num_fold_entrees = len(detailed_evaluations[measure][rep])
179+
num_fold_entrees = len(fold_evaluations[measure][rep])
180180
self.assertEquals(num_fold_entrees, num_folds)
181181
for fold in range(num_fold_entrees):
182-
evaluation = detailed_evaluations[measure][rep][fold]
182+
evaluation = fold_evaluations[measure][rep][fold]
183183
self.assertIsInstance(evaluation, float)
184184
self.assertGreater(evaluation, 0) # should take at least one millisecond (?)
185185
self.assertLess(evaluation, max_time_allowed)
@@ -292,7 +292,7 @@ def test_run_and_upload(self):
292292
self.assertTrue(check_res)
293293

294294
# todo: check if runtime is present
295-
self._check_detailed_evaluations(run.detailed_evaluations, 1, num_folds)
295+
self._check_fold_evaluations(run.fold_evaluations, 1, num_folds)
296296
pass
297297

298298
def test_initialize_cv_from_run(self):
@@ -523,18 +523,20 @@ def test__prediction_to_row(self):
523523

524524
probaY = clf.predict_proba(test_X)
525525
predY = clf.predict(test_X)
526+
sample_nr = 0 # default for this task
526527
for idx in range(0, len(test_X)):
527-
arff_line = _prediction_to_row(repeat_nr, fold_nr, idx,
528+
arff_line = _prediction_to_row(repeat_nr, fold_nr, sample_nr, idx,
528529
task.class_labels[test_y[idx]],
529530
predY[idx], probaY[idx], task.class_labels, clf.classes_)
530531

531532
self.assertIsInstance(arff_line, list)
532-
self.assertEqual(len(arff_line), 5 + len(task.class_labels))
533+
self.assertEqual(len(arff_line), 6 + len(task.class_labels))
533534
self.assertEqual(arff_line[0], repeat_nr)
534535
self.assertEqual(arff_line[1], fold_nr)
535-
self.assertEqual(arff_line[2], idx)
536+
self.assertEqual(arff_line[2], sample_nr)
537+
self.assertEqual(arff_line[3], idx)
536538
sum = 0.0
537-
for att_idx in range(3, 3 + len(task.class_labels)):
539+
for att_idx in range(4, 4 + len(task.class_labels)):
538540
self.assertIsInstance(arff_line[att_idx], float)
539541
self.assertGreaterEqual(arff_line[att_idx], 0.0)
540542
self.assertLessEqual(arff_line[att_idx], 1.0)
@@ -572,19 +574,19 @@ def test__run_task_get_arffcontent(self):
572574

573575
clf = SGDClassifier(loss='log', random_state=1)
574576
res = openml.runs.functions._run_task_get_arffcontent(clf, task, class_labels)
575-
arff_datacontent, arff_tracecontent, _, detailed_evaluations = res
577+
arff_datacontent, arff_tracecontent, _, fold_evaluations, sample_evaluations = res
576578
# predictions
577579
self.assertIsInstance(arff_datacontent, list)
578580
# trace. SGD does not produce any
579581
self.assertIsInstance(arff_tracecontent, type(None))
580582

581-
self._check_detailed_evaluations(detailed_evaluations, num_repeats, num_folds)
583+
self._check_fold_evaluations(fold_evaluations, num_repeats, num_folds)
582584

583585
# 10 times 10 fold CV of 150 samples
584586
self.assertEqual(len(arff_datacontent), num_instances * num_repeats)
585587
for arff_line in arff_datacontent:
586588
# check number columns
587-
self.assertEqual(len(arff_line), 7)
589+
self.assertEqual(len(arff_line), 8)
588590
# check repeat
589591
self.assertGreaterEqual(arff_line[0], 0)
590592
self.assertLessEqual(arff_line[0], num_repeats - 1)
@@ -595,9 +597,9 @@ def test__run_task_get_arffcontent(self):
595597
self.assertGreaterEqual(arff_line[2], 0)
596598
self.assertLessEqual(arff_line[2], num_instances - 1)
597599
# check confidences
598-
self.assertAlmostEqual(sum(arff_line[3:5]), 1.0)
599-
self.assertIn(arff_line[5], ['won', 'nowin'])
600+
self.assertAlmostEqual(sum(arff_line[4:6]), 1.0)
600601
self.assertIn(arff_line[6], ['won', 'nowin'])
602+
self.assertIn(arff_line[7], ['won', 'nowin'])
601603

602604
def test_get_run(self):
603605
# this run is not available on test
@@ -615,7 +617,7 @@ def test_get_run(self):
615617
(7, 0.666365),
616618
(8, 0.56759),
617619
(9, 0.64621)]:
618-
self.assertEqual(run.detailed_evaluations['f_measure'][0][i], value)
620+
self.assertEqual(run.fold_evaluations['f_measure'][0][i], value)
619621
assert('weka' in run.tags)
620622
assert('stacking' in run.tags)
621623

@@ -742,11 +744,11 @@ def test_run_on_dataset_with_missing_labels(self):
742744
model = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
743745
('Estimator', DecisionTreeClassifier())])
744746

745-
data_content, _, _, _ = _run_task_get_arffcontent(model, task, class_labels)
747+
data_content, _, _, _, _ = _run_task_get_arffcontent(model, task, class_labels)
746748
# 2 folds, 5 repeats; keep in mind that this task comes from the test
747749
# server, the task on the live server is different
748750
self.assertEqual(len(data_content), 4490)
749751
for row in data_content:
750752
# repeat, fold, row_id, 6 confidences, prediction and correct label
751-
self.assertEqual(len(row), 11)
753+
self.assertEqual(len(row), 12)
752754

tests/test_tasks/test_split.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,19 @@ def test_from_arff_file(self):
4646
split = OpenMLSplit._from_arff_file(self.arff_filename)
4747
self.assertIsInstance(split.split, dict)
4848
self.assertIsInstance(split.split[0], dict)
49-
self.assertIsInstance(split.split[0][0][0], np.ndarray)
50-
self.assertIsInstance(split.split[0][0].train, np.ndarray)
51-
self.assertIsInstance(split.split[0][0].train, np.ndarray)
52-
self.assertIsInstance(split.split[0][0][1], np.ndarray)
53-
self.assertIsInstance(split.split[0][0].test, np.ndarray)
54-
self.assertIsInstance(split.split[0][0].test, np.ndarray)
49+
self.assertIsInstance(split.split[0][0], dict)
50+
self.assertIsInstance(split.split[0][0][0][0], np.ndarray)
51+
self.assertIsInstance(split.split[0][0][0].train, np.ndarray)
52+
self.assertIsInstance(split.split[0][0][0].train, np.ndarray)
53+
self.assertIsInstance(split.split[0][0][0][1], np.ndarray)
54+
self.assertIsInstance(split.split[0][0][0].test, np.ndarray)
55+
self.assertIsInstance(split.split[0][0][0].test, np.ndarray)
5556
for i in range(10):
5657
for j in range(10):
57-
self.assertGreaterEqual(split.split[i][j].train.shape[0], 808)
58-
self.assertGreaterEqual(split.split[i][j].test.shape[0], 89)
59-
self.assertEqual(split.split[i][j].train.shape[0] +
60-
split.split[i][j].test.shape[0], 898)
58+
self.assertGreaterEqual(split.split[i][j][0].train.shape[0], 808)
59+
self.assertGreaterEqual(split.split[i][j][0].test.shape[0], 89)
60+
self.assertEqual(split.split[i][j][0].train.shape[0] +
61+
split.split[i][j][0].test.shape[0], 898)
6162

6263
def test_get_split(self):
6364
split = OpenMLSplit._from_arff_file(self.arff_filename)

tests/test_tasks/test_task.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -62,26 +62,3 @@ def test_get_train_and_test_split_indices(self):
6262
self.assertRaisesRegexp(ValueError, "Repeat 10 not known",
6363
task.get_train_test_split_indices, 0, 10)
6464

65-
def test_iterate_repeats(self):
66-
openml.config.set_cache_directory(self.static_cache_dir)
67-
task = openml.tasks.get_task(1882)
68-
69-
num_repeats = 0
70-
for rep in task.iterate_repeats():
71-
num_repeats += 1
72-
self.assertIsInstance(rep, types.GeneratorType)
73-
self.assertEqual(num_repeats, 10)
74-
75-
def test_iterate_all_splits(self):
76-
openml.config.set_cache_directory(self.static_cache_dir)
77-
task = openml.tasks.get_task(1882)
78-
79-
num_splits = 0
80-
for split in task.iterate_all_splits():
81-
num_splits += 1
82-
self.assertIsInstance(split[0], np.ndarray)
83-
self.assertIsInstance(split[1], np.ndarray)
84-
self.assertEqual(split[0].shape[0] + split[1].shape[0], 898)
85-
self.assertEqual(num_splits, 100)
86-
87-

0 commit comments

Comments (0)