Finish ConditionalImputer and add tests; store number_missing_values per feature and add get_features_by_type

janvanrijn · janvanrijn · commit 7d24294ed9a9 · 2017-03-05T18:42:09.000+01:00
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -12,21 +12,24 @@ class OpenMLDataFeature(object):
             can be nominal, numeric, string, date (corresponds to arff)
         nominal_values : list(str)
             list of the possible values, in case of nominal attribute
+        number_missing_values : int
        """
     LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
 
-    def __init__(self, index, name, data_type, nominal_values):
+    def __init__(self, index, name, data_type, nominal_values, number_missing_values):
         assert type(index) is int, "Index is of wrong datatype"
         assert type(name) is str, "Name is of wrong datatype"
         assert type(data_type) is str, "Data_type is of wrong datatype"
         assert data_type in self.LEGAL_DATA_TYPES, "data type should be in %s" %str(self.LEGAL_DATA_TYPES)
         if nominal_values is not None:
             assert type(nominal_values) is list, "Nominal_values is of wrong datatype"
+        assert type(number_missing_values) is int, "number_missing_values is of wrong datatype"
 
         self.index = index
         self.name = name
         self.data_type = data_type
         self.nominal_values = nominal_values
+        self.number_missing_values = number_missing_values
 
     def __str__(self):
         return "[%d - %s (%s)]" %(self.index, self.name, self.data_type)
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -76,15 +76,15 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
         self.data_file = data_file
         self.features = {}
 
-        for xmlfeature in features['oml:feature']:
+        for idx, xmlfeature in enumerate(features['oml:feature']):
             feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
                                         xmlfeature['oml:name'],
                                         xmlfeature['oml:data_type'],
-                                        None) #todo add nominal values
+                                        None, #todo add nominal values (currently not in database)
+                                        int(xmlfeature['oml:number_of_missing_values']))
+            assert idx == feature.index, "Data features not provided in right order"
             self.features[feature.index] = feature
 
-        print("dataset %s initialized" %dataset_id)
-
 
         if data_file is not None:
             if self._data_features_supported():
@@ -309,6 +309,18 @@ def retrieve_class_labels(self, target_name='class'):
         else:
             return None
 
+    def get_features_by_type(self, data_type, exclude=None):
+        """Return the indices of all features whose data type is `data_type`.
+
+        `exclude` is an optional list of feature indices to skip (e.g. the
+        target); None excludes nothing. Bad arguments raise AssertionError.
+        """
+        assert exclude is None or type(exclude) is list, "Exclude should be a list of indeces"
+        assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"
+        skipped = exclude or []  # the default None used to trip the list assertion
+        return [idx for idx in self.features
+                if idx not in skipped and self.features[idx].data_type == data_type]
+
     def publish(self):
         """Publish the dataset on the OpenML server.
 
diff --git a/openml/utils/preprocessing.py b/openml/utils/preprocessing.py
@@ -68,12 +68,12 @@ class ConditionalImputer(Imputer):
     """
     def __init__(self, missing_values="NaN", strategy="mean",
                  strategy_nominal="most_frequent",
-                 indexes_nominal=None,
+                 indeces_nominal=None,
                  axis=0, verbose=0, copy=True):
         self.missing_values = missing_values
         self.strategy = strategy
         self.strategy_nominal = strategy_nominal
-        self.indexes_nominal = indexes_nominal
+        self.indeces_nominal = indeces_nominal
         self.axis = axis
         self.verbose = verbose
         self.copy = copy
@@ -125,8 +125,8 @@ def fit(self, X, y=None):
 
         # here the indexes of nominal values get set
         self.statistics_ = statistics_general
-        if self.indexes_nominal is not None:
-            for i in self.indexes_nominal:
+        if self.indeces_nominal is not None:
+            for i in self.indeces_nominal:
                 self.statistics_[i] = statistics_nominal[i]
 
         return self
diff --git a/tests/test_utils/test_conditionalimputer.py b/tests/test_utils/test_conditionalimputer.py
@@ -0,0 +1,34 @@
+import openml, math, collections
+from openml.testing import TestBase
+from openml.utils.preprocessing import ConditionalImputer
+
+class OpenMLTaskTest(TestBase):
+    # Fits ConditionalImputer on the dataset of OpenML task 2 and sanity-checks the imputed output.
+    def test_impute_anneal(self):
+        task_id = 2
+
+        task = openml.tasks.get_task(task_id)
+        dataset = task.get_dataset()
+        X, _ = dataset.get_data(target=task.target_name)
+        nominal_indeces = dataset.get_features_by_type('nominal', exclude=[38])  # 38: presumably the target feature index - TODO confirm
+        clf = ConditionalImputer(strategy="median", strategy_nominal="most_frequent", indeces_nominal=nominal_indeces)
+        clf.fit(X)
+        X_prime = clf.transform(X)
+        # counts features whose statistic is NaN; presumably transform() drops those columns, shifting X_prime - TODO confirm
+        correction = 0
+        for idx, value in enumerate(clf.statistics_):
+            if math.isnan(value):
+                # a NaN statistic is only acceptable when every value of the feature is missing
+                correction += 1
+                assert dataset.features[idx].number_missing_values == len(
+                    X), "Imputer calculated nan for usable feature"
+            else:
+                # a nominal feature must be imputed with an integer-valued statistic
+                if idx in nominal_indeces:
+                    assert value == math.floor(value) == math.ceil(value), "Wrong impute value for nominal feature"
+
+                corrected_index = idx - correction  # position of this feature's column in X_prime
+                # the imputed value must occur at least as often as the feature had missing values
+                counter = collections.Counter(X_prime[:, corrected_index])
+                occurances_after = counter[value]
+                assert occurances_after >= dataset.features[idx].number_missing_values