(API) Added task iterator to be able to iterate over repeats and folds separately.

mfeurer · zardaloop · commit 13f16b38a37f · 2016-02-14T12:28:19.000Z
diff --git a/openml/entities/split.py b/openml/entities/split.py
@@ -1,4 +1,4 @@
-from collections import namedtuple
+from collections import namedtuple, OrderedDict
 import os
 import sys
 if sys.version_info[0] > 3:
@@ -20,13 +20,19 @@ def __init__(self, name, description, split):
         self.description = description
         self.name = name
         self.split = dict()
-
+        
         # Add splits according to repetition
         for repetition in split:
             repetition = int(repetition)
-            self.split[repetition] = dict()
+            self.split[repetition] = OrderedDict()
             for fold in split[repetition]:
                 self.split[repetition][fold] = split[repetition][fold]
+                
+        self.repeats = len(self.split)
+        if any([len(self.split[0]) != len(self.split[i]) 
+                for i in range(self.repeats)]):
+            raise ValueError('')
+        self.folds = len(self.split[0])
 
     def __eq__(self, other):
         if type(self) != type(other):
@@ -62,17 +68,20 @@ def from_arff_file(cls, filename, cache=True):
                 repetitions = _["repetitions"]
                 name = _["name"]
 
+        # Cache miss
         if repetitions is None:
+            # scipy.io.arff is faster than liac-arff and sufficient for split files.
             splits, meta = scipy.io.arff.loadarff(filename)
             name = meta.name
 
-            repetitions = dict()
+            repetitions = OrderedDict()
             for line in splits:
+                # Each row has the columns: type, rowid, repeat, fold
                 repetition = int(line[2])
                 fold = int(line[3])
 
                 if repetition not in repetitions:
-                    repetitions[repetition] = dict()
+                    repetitions[repetition] = OrderedDict()
                 if fold not in repetitions[repetition]:
                     repetitions[repetition][fold] = ([], [])
 
@@ -98,11 +107,16 @@ def from_arff_file(cls, filename, cache=True):
         return cls(name, '', repetitions)
 
     def from_dataset(self, X, Y, folds, repeats):
-        pass
+        raise NotImplementedError()
 
-    def get(self, fold=0, repeat=0):
+    def get(self, repeat=0, fold=0):
         if repeat not in self.split:
             raise ValueError("Repeat %s not known" % str(repeat))
         if fold not in self.split[repeat]:
             raise ValueError("Fold %s not known" % str(fold))
-        return self.split[repeat][fold]
+        return self.split[repeat][fold]
+        
+    def iterate_splits(self):
+        for rep in range(self.repeats):
+            yield (self.get(repeat=rep, fold=fold) for fold in range(self.folds))
+                 
diff --git a/openml/entities/task.py b/openml/entities/task.py
@@ -17,11 +17,8 @@ def __init__(self, task_id, task_type, data_set_id, target_feature,
         self.task_type = task_type
         self.dataset_id = int(data_set_id)
         self.target_feature = target_feature
-        # TODO: this can become its own class if necessary
         self.estimation_procedure = dict()
         self.estimation_procedure["type"] = estimation_procedure_type
-        # TODO: ideally this has the indices for the different splits...but
-        # the evaluation procedure 3foldtest/10foldvalid is not available
         self.estimation_procedure["data_splits_url"] = data_splits_url
         self.estimation_procedure["parameters"] = estimation_parameters
         #
@@ -71,100 +68,14 @@ def get_train_test_split_indices(self, fold=0, repeat=0):
         train_indices, test_indices = split.get(repeat=repeat, fold=fold)
         return train_indices, test_indices
 
-    def get_train_and_test_set(self, fold=0, repeat=0):
-        X, Y = self.get_X_and_Y()
-        train_indices, test_indices = self.get_train_test_split_indices(
-            fold=fold, repeat=repeat)
-        return X[train_indices], Y[train_indices], X[test_indices], Y[test_indices]
-
-    """
-    def get_validation_split(self, fold):
-        ""This is not part of the OpenML specification!
-        ""
-        split = OpenMLSplit.from_arff_file(
-            self.estimation_procedure["local_validation_split_file"])
-
-        if len(split.split.keys()) != 1:
-            raise NotImplementedError("Repeats are not implemented yet...")
-
-        # TODO: write a test that always a subset of the train/test split is
-        # returned
-        vtrain_indices, validation_indices = split.split[0][fold]
-        train_indices, test_indices = self.get_train_test_split()
-
-        return train_indices[vtrain_indices], train_indices[validation_indices]
-
-    def get_CV_fold(self, X, Y, fold, folds, shuffle=True):
-        ""This is not part of the OpenML specification
-        ""
-        fold = int(fold)
-        folds = int(folds)
-        if fold >= folds:
-            raise ValueError((fold, folds))
-        if X.shape[0] != Y.shape[0]:
-            raise ValueError("The first dimension of the X and Y array must "
-                             "be equal.")
-
-        if shuffle == True:
-            rs = np.random.RandomState(42)
-            indices = np.arange(X.shape[0])
-            rs.shuffle(indices)
-            Y = Y[indices]
-
-        kf = StratifiedKFold(Y, n_folds=folds, indices=True)
-        for idx, split in enumerate(kf):
-            if idx == fold:
-                break
-
-        if shuffle == True:
-            return indices[split[0]], indices[split[1]]
-        return split
-    """
-
-    """
-    def perform_cv_fold(self, algo, fold, folds):
-        ""Allows the user to perform cross validation for hyperparameter
-        optimization on the training data.""
-        # TODO: this is only done for hyperparameter optimization and is not
-        # part of the OpenML specification. The OpenML specification would
-        # like to have the hyperparameter evaluation inside the evaluate
-        # performed by the target algorithm itself. Hyperparameter
-        # optimization on the other hand needs these both things to be decoupled
-        # For being closer to OpenML one could also call evaluate and pass
-        # everything else through kwargs.
-        if self.task_type != "Supervised Classification":
-            raise NotImplementedError(self.task_type)
-
-        print("Procedure", self.estimation_procedure)
-        print("Type", self.estimation_procedure["type"])
-        # TODO fix Task generation!
-        # if self.estimation_procedure["type"] not in ["holdout",
-        # "customholdout"]:
-        #    raise NotImplementedError(self.estimation_procedure["type"])
-
-        #if self.estimation_procedure["parameters"]["stratified_sampling"] != \
-        #        'true':
-        #    raise NotImplementedError(
-        #        self.estimation_procedure["parameters"]["stratified_sampling"])
-
-        #if self.evaluation_measure not in ["predictive accuracy",
-        #                                   "predictive_accuracy"]:
-        #    raise NotImplementedError(self.evaluation_measure)
-
-        # #######################################################################
-        # Test folds
-        train_indices, test_indices = self.get_train_test_split()
-
-        ########################################################################
-        # Crossvalidation folds
-        train_indices, validation_indices = self.get_validation_split(fold)
-
-        X, Y = self.get_dataset()
-
-        algo.fit(X[train_indices], Y[train_indices])
+    def iterate_repeats(self):
+        split = self.api_connector.download_split(self)
+        for rep in split.iterate_splits():
+            yield rep
 
-        predictions = algo.predict(X[validation_indices])
-        accuracy = sklearn.metrics.accuracy_score(Y[validation_indices],
-                                                  predictions)
-        return accuracy
-    """
+    def iterate_all_splits(self):
+        split = self.api_connector.download_split(self)
+        for rep in split.iterate_splits():
+            for fold in rep:
+                yield fold
+