updated ConditionalImputer to the same codebase as sklearn

janvanrijn · janvanrijn · commit 82fb7b2315a7 · 2017-03-21T03:21:33.000+01:00
adapted test accordingly
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -18,16 +18,14 @@ class OpenMLDataFeature(object):
 
     def __init__(self, index, name, data_type, nominal_values, number_missing_values):
         assert type(index) is int, "Index is of wrong datatype"
-        assert type(name) is str, "Name is of wrong datatype"
-        assert type(data_type) is str, "Data_type is of wrong datatype"
         assert data_type in self.LEGAL_DATA_TYPES, "data type should be in %s" %str(self.LEGAL_DATA_TYPES)
         if nominal_values is not None:
             assert type(nominal_values) is list, "Nominal_values is of wrong datatype"
         assert type(number_missing_values) is int, "number_missing_values is of wrong datatype"
 
         self.index = index
-        self.name = name
-        self.data_type = data_type
+        self.name = str(name)
+        self.data_type = str(data_type)
         self.nominal_values = nominal_values
         self.number_missing_values = number_missing_values
 
diff --git a/openml/utils/preprocessing.py b/openml/utils/preprocessing.py
@@ -76,14 +76,14 @@ class ConditionalImputer(Imputer):
     def __init__(self, missing_values="NaN", strategy="mean",
                  strategy_nominal="most_frequent",
                  categorical_features=None,
-                 empty_attribute_constant=None,
+                 fill_empty=None,
                  axis=0, verbose=0, copy=True):
         self.missing_values = missing_values
         self.strategy = strategy
         self.strategy_nominal = strategy_nominal
         self.categorical_features = categorical_features
         self.categorical_features_implied = None
-        self.empty_attribute_constant = empty_attribute_constant
+        self.fill_empty = fill_empty
         self.axis = axis
         self.verbose = verbose
         self.copy = copy
@@ -157,24 +157,48 @@ def fit(self, X, y=None):
 
     def transform(self, X):
         """Impute all missing values in X.
+
         Parameters
         ----------
         X : {array-like, sparse matrix}, shape = [n_samples, n_features]
             The input data to complete.
         """
-        check_is_fitted(self, 'statistics_')
-        X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
-                        force_all_finite=False, copy=self.copy)
-        statistics = self.statistics_
-        if X.shape[1] != statistics.shape[0]:
-            raise ValueError("X has %d features per sample, expected %d"
-                             % (X.shape[1], self.statistics_.shape[0]))
-
-        # impute completelly empty columns with constant
-        if self.empty_attribute_constant is not None:
-            invalid_mask = np.isnan(statistics)
-            X[:, invalid_mask] = self.empty_attribute_constant
-            statistics[invalid_mask] = self.empty_attribute_constant
+        if self.axis == 0:
+            check_is_fitted(self, 'statistics_')
+            X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
+                            force_all_finite=False, copy=self.copy)
+            statistics = self.statistics_.copy()
+            if X.shape[1] != statistics.shape[0]:
+                raise ValueError("X has %d features per sample, expected %d"
+                                 % (X.shape[1], self.statistics_.shape[0]))
+
+        # Since two different arrays can be provided in fit(X) and
+        # transform(X), the imputation data need to be recomputed
+        # when the imputation is done per sample
+        else:
+            X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
+                            force_all_finite=False, copy=self.copy)
+
+            if sparse.issparse(X):
+                statistics = self._sparse_fit(X,
+                                              self.strategy,
+                                              self.missing_values,
+                                              self.axis)
+
+            else:
+                statistics = self._dense_fit(X,
+                                             self.strategy,
+                                             self.missing_values,
+                                             self.axis)
+
+        # impute completely empty columns with constant, if
+        # `fill_empty` parameter was set
+        if self.fill_empty is not None:
+            if sparse.issparse(X):
+                X = X.toarray()
+            empty_mask = np.all(_get_mask(X, self.missing_values),
+                                axis=self.axis)
+            statistics[empty_mask] = self.fill_empty
 
         # Delete the invalid rows/columns
         invalid_mask = np.isnan(statistics)
@@ -183,11 +207,14 @@ def transform(self, X):
         valid_statistics_indexes = np.where(valid_mask)[0]
         missing = np.arange(X.shape[not self.axis])[invalid_mask]
 
-        if invalid_mask.any():
+        if self.axis == 0 and invalid_mask.any():
             if self.verbose:
                 warnings.warn("Deleting features without "
                               "observed values: %s" % missing)
             X = X[:, valid_statistics_indexes]
+        elif self.axis == 1 and invalid_mask.any():
+            raise ValueError("Some rows only contain "
+                             "missing values: %s" % missing)
 
         # Do actual imputation
         if sparse.issparse(X) and self.missing_values != 0:
@@ -205,7 +232,10 @@ def transform(self, X):
             n_missing = np.sum(mask, axis=self.axis)
             values = np.repeat(valid_statistics, n_missing)
 
-            coordinates = np.where(mask.transpose())[::-1]
+            if self.axis == 0:
+                coordinates = np.where(mask.transpose())[::-1]
+            else:
+                coordinates = mask
 
             X[coordinates] = values
 
diff --git a/tests/test_utils/test_conditionalimputer.py b/tests/test_utils/test_conditionalimputer.py
@@ -5,9 +5,12 @@
 
 class OpenMLTaskTest(TestBase):
 
-    def _do_test(self, dataset, X, nominal_indices, clf):
+    def _do_test(self, dataset, X, nominal_indices, clf, fill_empty=None):
         clf.fit(X)
         X_prime = clf.transform(X)
+        assert np.isnan(np.min(X_prime)) == False, 'Result contains nans'
+        if fill_empty is not None:
+            assert X_prime.shape == X.shape
 
         # in case of smart guessing nominal attributes, we accept false positives, but no false negatives
         for column_idx in nominal_indices:
@@ -18,7 +21,8 @@ def _do_test(self, dataset, X, nominal_indices, clf):
         for idx, value in enumerate(clf.statistics_):
             if math.isnan(value):
                 # imputer can only give nan if all values are unknown
-                correction += 1
+                if fill_empty is None:
+                    correction += 1
                 assert dataset.features[idx].number_missing_values == len(X), "Imputer calculated nan for usable feature"
             else:
                 # check if nominal values get imputed correct
@@ -29,7 +33,8 @@ def _do_test(self, dataset, X, nominal_indices, clf):
                 # check if imputation succeeded
                 counter = collections.Counter(X_prime[:, corrected_index])
                 occurances_after = counter[value]
-                assert occurances_after >= dataset.features[idx].number_missing_values
+                assert occurances_after >= dataset.features[idx].number_missing_values or \
+                       (fill_empty is not None and counter[fill_empty] == len(X)), "at attribute idx %d" %idx
 
         return X_prime
 
@@ -74,12 +79,11 @@ def test_impute_with_constant(self):
             dataset = task.get_dataset()
             X, _ = dataset.get_data(target=task.target_name)
             nominal_indices = dataset.get_features_by_type('nominal', exclude=[task.target_name])
+            fill_empty = -1
             clf = ConditionalImputer(strategy="median",
                                      strategy_nominal="most_frequent",
                                      categorical_features=None,
                                      verbose=True,
-                                     empty_attribute_constant=-1)
+                                     fill_empty=fill_empty)
 
-            X_prime = self._do_test(dataset, X, nominal_indices, clf)
-            assert np.isnan(np.min(X_prime)) == False, 'Result contains nans'
-            assert X_prime.shape == X.shape
+            self._do_test(dataset, X, nominal_indices, clf, fill_empty=fill_empty)