Requested changes for pull request #213

janvanrijn · janvanrijn · commit 9ec141c28bf0 · 2017-03-27T10:58:47.000+02:00
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -6,9 +6,9 @@ class OpenMLDataFeature(object):
        ----------
        index : int
             The index of this feature
-        name : string
+        name : str
             Name of the feature
-        data_type : string
+        data_type : str
             can be nominal, numeric, string, date (corresponds to arff)
         nominal_values : list(str)
             list of the possible values, in case of nominal attribute
@@ -17,11 +17,14 @@ class OpenMLDataFeature(object):
     LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
 
     def __init__(self, index, name, data_type, nominal_values, number_missing_values):
-        assert type(index) is int, "Index is of wrong datatype"
-        assert data_type in self.LEGAL_DATA_TYPES, "data type should be in %s" %str(self.LEGAL_DATA_TYPES)
-        if nominal_values is not None:
-            assert type(nominal_values) is list, "Nominal_values is of wrong datatype"
-        assert type(number_missing_values) is int, "number_missing_values is of wrong datatype"
+        if type(index) != int:
+            raise ValueError('Index is of wrong datatype')
+        if data_type not in self.LEGAL_DATA_TYPES:
+            raise ValueError('data type should be in %s, found: %s' %(str(self.LEGAL_DATA_TYPES),data_type))
+        if nominal_values is not None and type(nominal_values) != list:
+            raise ValueError('Nominal_values is of wrong datatype')
+        if type(number_missing_values) != int:
+            raise ValueError('number_missing_values is of wrong datatype')
 
         self.index = index
         self.name = str(name)
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -69,6 +69,10 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
             self.ignore_attributes = [ignore_attribute]
         elif isinstance(ignore_attribute, list):
             self.ignore_attributes = ignore_attribute
+        elif ignore_attribute is None:
+            pass
+        else:
+            raise ValueError('wrong data type for ignore_attribute. Should be list. ')
         self.version_label = version_label
         self.citation = citation
         self.tag = tag
@@ -88,7 +92,8 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
                                             xmlfeature['oml:data_type'],
                                             None, #todo add nominal values (currently not in database)
                                             int(xmlfeature['oml:number_of_missing_values']))
-                assert idx == feature.index, "Data features not provided in right order"
+                if idx != feature.index:
+                    raise ValueError('Data features not provided in right order')
                 self.features[feature.index] = feature
 
 
@@ -313,7 +318,21 @@ def retrieve_class_labels(self, target_name='class'):
             return None
 
 
-    def get_features_by_type(self, data_type, exclude=None, exclude_ignore_attributes=True, exclude_row_id_attribute=True):
+    def get_features_by_type(self, data_type, exclude=None,
+                             exclude_ignore_attributes=True, exclude_row_id_attribute=True):
+        '''
+        Returns indices of features of a given type, e.g., all nominal features.
+        Can use additional parameters to exclude various features by index or ontology.
+
+        :param data_type: The data type to return (e.g., nominal, numeric, date, string)
+        :param exclude: Indices to exclude (and adapt the return values as if these indices
+                        are not present)
+        :param exclude_ignore_attributes: Whether to exclude the defined ignore attributes
+                        (and adapt the return values as if these indices are not present)
+        :param exclude_row_id_attribute: Whether to exclude the defined row id attributes
+                        (and adapt the return values as if these indices are not present)
+        :return: a list of indices that have the specified data type
+        '''
         assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"
         if self.ignore_attributes is not None:
             assert type(self.ignore_attributes) is list, "ignore_attributes should be a list"
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -74,8 +74,10 @@ def run_task(task, model):
     return run
 
 
-def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label, predicted_probabilities, class_labels, model_classes_mapping):
-    """Complicated util function that turns probability estimates of a classifier for a given instance into the right arff format to upload to openml.
+def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
+                       predicted_probabilities, class_labels, model_classes_mapping):
+    """Util function that turns probability estimates of a classifier for a given
+        instance into the right arff format to upload to openml.
 
         Parameters
         ----------
@@ -90,6 +92,9 @@ def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
         predicted_probabilities : array (size=num_classes)
             probabilities per class
         class_labels : array (size=num_classes)
+        model_classes_mapping : list
+            A list of classes the model produced.
+            Obtained by BaseEstimator.classes_
 
         Returns
         -------
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
@@ -156,6 +156,8 @@ def test_publish_flow(self):
 
     def test_semi_legal_flow(self):
         # TODO: Test if parameters are set correctly!
+        # should not throw an error as it contains two distinguishable forms of Bagging
+        # i.e., Bagging(Bagging(J48)) and Bagging(J48)
         sentinel = get_sentinel()
         semi_legal = sklearn.ensemble.BaggingClassifier(
             base_estimator=sklearn.ensemble.BaggingClassifier(
@@ -166,6 +168,7 @@ def test_semi_legal_flow(self):
         flow.publish()
 
     def test_illegal_flow(self):
+        # should throw error as it contains two imputers
         illegal = sklearn.pipeline.Pipeline(steps=[('imputer1', sklearn.preprocessing.Imputer()),
                                                    ('imputer2', sklearn.preprocessing.Imputer()),
                                                    ('classif', sklearn.tree.DecisionTreeClassifier())])
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -278,7 +278,7 @@ def test_get_runs_list_by_tag(self):
         self.assertGreaterEqual(len(runs), 1)
 
     def test_run_on_dataset_with_missing_labels(self):
-        from openml.runs.functions import _prediction_to_row
+        from openml.runs.functions import _run_task_get_arffcontent
         from sklearn.tree import DecisionTreeClassifier
         from sklearn.preprocessing.imputation import Imputer
         task = openml.tasks.get_task(2)
@@ -287,37 +287,5 @@ def test_run_on_dataset_with_missing_labels(self):
         model = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                 ('Estimator', DecisionTreeClassifier())])
 
-        X, Y = task.get_X_and_y()
-        rep_no = 0
-        # TODO use different iterator to only provide a single iterator (less
-        # methods, less maintenance, less confusion)
-        for rep in task.iterate_repeats():
-            fold_no = 0
-            for fold in rep:
-                train_indices, test_indices = fold
-                trainX = X[train_indices]
-                trainY = Y[train_indices]
-                testX = X[test_indices]
-                testY = Y[test_indices]
-
-                model.fit(trainX, trainY)
-
-                ProbaY = model.predict_proba(testX)
-                PredY = model.predict(testX)
-
-                missing_label_idx = [3]
-
-                for i in range(0, len(test_indices)):
-                    arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i],
-                                                   ProbaY[i], class_labels, model.classes_)
-
-                    offset = 0
-                    for idx, proba in enumerate(arff_line[3:-2]):
-                        if idx in missing_label_idx:
-                            offset += 1
-                        else:
-                            assert proba == ProbaY[i][idx-offset]
-
-                fold_no = fold_no + 1
-            rep_no = rep_no + 1
+        _run_task_get_arffcontent(model, task, class_labels)