Skip to content

Commit 82fb7b2

Browse files
committed
updated ConditionalImputer to same codebase as sklearn
adapted test accordingly
1 parent 6ebbd13 commit 82fb7b2

3 files changed

Lines changed: 60 additions & 28 deletions

File tree

openml/datasets/data_feature.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,14 @@ class OpenMLDataFeature(object):
1818

1919
def __init__(self, index, name, data_type, nominal_values, number_missing_values):
2020
assert type(index) is int, "Index is of wrong datatype"
21-
assert type(name) is str, "Name is of wrong datatype"
22-
assert type(data_type) is str, "Data_type is of wrong datatype"
2321
assert data_type in self.LEGAL_DATA_TYPES, "data type should be in %s" %str(self.LEGAL_DATA_TYPES)
2422
if nominal_values is not None:
2523
assert type(nominal_values) is list, "Nominal_values is of wrong datatype"
2624
assert type(number_missing_values) is int, "number_missing_values is of wrong datatype"
2725

2826
self.index = index
29-
self.name = name
30-
self.data_type = data_type
27+
self.name = str(name)
28+
self.data_type = str(data_type)
3129
self.nominal_values = nominal_values
3230
self.number_missing_values = number_missing_values
3331

openml/utils/preprocessing.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,14 @@ class ConditionalImputer(Imputer):
7676
def __init__(self, missing_values="NaN", strategy="mean",
7777
strategy_nominal="most_frequent",
7878
categorical_features=None,
79-
empty_attribute_constant=None,
79+
fill_empty=None,
8080
axis=0, verbose=0, copy=True):
8181
self.missing_values = missing_values
8282
self.strategy = strategy
8383
self.strategy_nominal = strategy_nominal
8484
self.categorical_features = categorical_features
8585
self.categorical_features_implied = None
86-
self.empty_attribute_constant = empty_attribute_constant
86+
self.fill_empty = fill_empty
8787
self.axis = axis
8888
self.verbose = verbose
8989
self.copy = copy
@@ -157,24 +157,48 @@ def fit(self, X, y=None):
157157

158158
def transform(self, X):
159159
"""Impute all missing values in X.
160+
160161
Parameters
161162
----------
162163
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
163164
The input data to complete.
164165
"""
165-
check_is_fitted(self, 'statistics_')
166-
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
167-
force_all_finite=False, copy=self.copy)
168-
statistics = self.statistics_
169-
if X.shape[1] != statistics.shape[0]:
170-
raise ValueError("X has %d features per sample, expected %d"
171-
% (X.shape[1], self.statistics_.shape[0]))
172-
173-
# impute completelly empty columns with constant
174-
if self.empty_attribute_constant is not None:
175-
invalid_mask = np.isnan(statistics)
176-
X[:, invalid_mask] = self.empty_attribute_constant
177-
statistics[invalid_mask] = self.empty_attribute_constant
166+
if self.axis == 0:
167+
check_is_fitted(self, 'statistics_')
168+
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
169+
force_all_finite=False, copy=self.copy)
170+
statistics = self.statistics_.copy()
171+
if X.shape[1] != statistics.shape[0]:
172+
raise ValueError("X has %d features per sample, expected %d"
173+
% (X.shape[1], self.statistics_.shape[0]))
174+
175+
# Since two different arrays can be provided in fit(X) and
176+
# transform(X), the imputation data need to be recomputed
177+
# when the imputation is done per sample
178+
else:
179+
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
180+
force_all_finite=False, copy=self.copy)
181+
182+
if sparse.issparse(X):
183+
statistics = self._sparse_fit(X,
184+
self.strategy,
185+
self.missing_values,
186+
self.axis)
187+
188+
else:
189+
statistics = self._dense_fit(X,
190+
self.strategy,
191+
self.missing_values,
192+
self.axis)
193+
194+
# impute completely empty columns with constant, if
195+
# `fill_empty' parameter was set
196+
if self.fill_empty is not None:
197+
if sparse.issparse(X):
198+
X = X.toarray()
199+
empty_mask = np.all(_get_mask(X, self.missing_values),
200+
axis=self.axis)
201+
statistics[empty_mask] = self.fill_empty
178202

179203
# Delete the invalid rows/columns
180204
invalid_mask = np.isnan(statistics)
@@ -183,11 +207,14 @@ def transform(self, X):
183207
valid_statistics_indexes = np.where(valid_mask)[0]
184208
missing = np.arange(X.shape[not self.axis])[invalid_mask]
185209

186-
if invalid_mask.any():
210+
if self.axis == 0 and invalid_mask.any():
187211
if self.verbose:
188212
warnings.warn("Deleting features without "
189213
"observed values: %s" % missing)
190214
X = X[:, valid_statistics_indexes]
215+
elif self.axis == 1 and invalid_mask.any():
216+
raise ValueError("Some rows only contain "
217+
"missing values: %s" % missing)
191218

192219
# Do actual imputation
193220
if sparse.issparse(X) and self.missing_values != 0:
@@ -205,7 +232,10 @@ def transform(self, X):
205232
n_missing = np.sum(mask, axis=self.axis)
206233
values = np.repeat(valid_statistics, n_missing)
207234

208-
coordinates = np.where(mask.transpose())[::-1]
235+
if self.axis == 0:
236+
coordinates = np.where(mask.transpose())[::-1]
237+
else:
238+
coordinates = mask
209239

210240
X[coordinates] = values
211241

tests/test_utils/test_conditionalimputer.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55

66
class OpenMLTaskTest(TestBase):
77

8-
def _do_test(self, dataset, X, nominal_indices, clf):
8+
def _do_test(self, dataset, X, nominal_indices, clf, fill_empty=None):
99
clf.fit(X)
1010
X_prime = clf.transform(X)
11+
assert np.isnan(np.min(X_prime)) == False, 'Result contains nans'
12+
if fill_empty is not None:
13+
assert X_prime.shape == X.shape
1114

1215
# in case of smart guessing nominal attributes, we accept false positives, but no false negatives
1316
for column_idx in nominal_indices:
@@ -18,7 +21,8 @@ def _do_test(self, dataset, X, nominal_indices, clf):
1821
for idx, value in enumerate(clf.statistics_):
1922
if math.isnan(value):
2023
# imputer can only give nan if all values are unknown
21-
correction += 1
24+
if fill_empty is None:
25+
correction += 1
2226
assert dataset.features[idx].number_missing_values == len(X), "Imputer calculated nan for usable feature"
2327
else:
2428
# check if nominal values get imputed correctly
@@ -29,7 +33,8 @@ def _do_test(self, dataset, X, nominal_indices, clf):
2933
# check if imputation succeeded
3034
counter = collections.Counter(X_prime[:, corrected_index])
3135
occurances_after = counter[value]
32-
assert occurances_after >= dataset.features[idx].number_missing_values
36+
assert occurances_after >= dataset.features[idx].number_missing_values or \
37+
(fill_empty is not None and counter[fill_empty] == len(X)), "at attribute idx %d" %idx
3338

3439
return X_prime
3540

@@ -74,12 +79,11 @@ def test_impute_with_constant(self):
7479
dataset = task.get_dataset()
7580
X, _ = dataset.get_data(target=task.target_name)
7681
nominal_indices = dataset.get_features_by_type('nominal', exclude=[task.target_name])
82+
fill_empty = -1
7783
clf = ConditionalImputer(strategy="median",
7884
strategy_nominal="most_frequent",
7985
categorical_features=None,
8086
verbose=True,
81-
empty_attribute_constant=-1)
87+
fill_empty=fill_empty)
8288

83-
X_prime = self._do_test(dataset, X, nominal_indices, clf)
84-
assert np.isnan(np.min(X_prime)) == False, 'Result contains nans'
85-
assert X_prime.shape == X.shape
89+
self._do_test(dataset, X, nominal_indices, clf, fill_empty=fill_empty)

0 commit comments

Comments
 (0)