Skip to content

Commit 09f6ff4

Browse files
committed
made imputer add constant instead for removed columns
added testcase
1 parent 676b560 commit 09f6ff4

3 files changed

Lines changed: 146 additions & 23 deletions

File tree

openml/datasets/dataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,13 +312,14 @@ def retrieve_class_labels(self, target_name='class'):
312312
return None
313313

314314
def get_features_by_type(self, data_type, exclude=None):
315-
assert type(exclude) is list, "Exclude should be a list of indeces"
316315
assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"
316+
if exclude is not None:
317+
assert type(exclude) is list, "Exclude should be a list of indeces"
317318

318319
result = []
319320
for idx in self.features:
320321
# in many cases we want to exclude, for example, the target feature
321-
if idx not in exclude:
322+
if exclude is None or idx not in exclude:
322323
if self.features[idx].data_type == data_type:
323324
result.append(idx)
324325
return result

openml/utils/preprocessing.py

Lines changed: 91 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1-
2-
from sklearn.preprocessing.imputation import Imputer, check_array, _get_mask, _most_frequent
1+
from scipy.integrate.tests.test_bvp import emden_bc
2+
from sklearn.preprocessing.imputation import Imputer, _get_mask
33

44
import warnings
5-
5+
import math
66
import numpy as np
7-
import numpy.ma as ma
87
from scipy import sparse
8+
9+
from sklearn.utils import check_array
910
from sklearn.utils.fixes import astype
1011
from sklearn.utils.sparsefuncs import _get_median
1112
from sklearn.utils.validation import check_is_fitted
1213
from sklearn.utils.validation import FLOAT_DTYPES
1314

1415

16+
1517
class ConditionalImputer(Imputer):
1618
"""Imputation transformer for completing missing values.
1719
@@ -34,6 +36,13 @@ class ConditionalImputer(Imputer):
3436
- If "most_frequent", then replace missing using the most frequent
3537
value along the axis.
3638
39+
strategy_nominal : string, optional (default="most_frequent")
40+
The imputation strategy for nominal attributes. For allowed values, see "strategy".
41+
42+
categorical_features : list (int), optional (default=None)
43+
A list of column indices of the features to be treated as nominal. If None,
44+
the Conditional Imputer will guess based on the values
45+
3746
axis : integer, optional (default=0)
3847
The axis along which to impute.
3948
@@ -68,12 +77,15 @@ class ConditionalImputer(Imputer):
6877
"""
6978
def __init__(self, missing_values="NaN", strategy="mean",
7079
strategy_nominal="most_frequent",
71-
indeces_nominal=None,
80+
categorical_features=None,
81+
empty_attribute_constant=None,
7282
axis=0, verbose=0, copy=True):
7383
self.missing_values = missing_values
7484
self.strategy = strategy
7585
self.strategy_nominal = strategy_nominal
76-
self.indeces_nominal = indeces_nominal
86+
self.categorical_features = categorical_features
87+
self.categorical_features_implied = None
88+
self.empty_attribute_constant = empty_attribute_constant
7789
self.axis = axis
7890
self.verbose = verbose
7991
self.copy = copy
@@ -125,8 +137,78 @@ def fit(self, X, y=None):
125137

126138
# start from the general statistics, then overwrite the entries of nominal columns with the nominal-strategy statistics
127139
self.statistics_ = statistics_general
128-
if self.indeces_nominal is not None:
129-
for i in self.indeces_nominal:
140+
if self.categorical_features is not None:
141+
for i in self.categorical_features:
130142
self.statistics_[i] = statistics_nominal[i]
143+
else:
144+
# iterate over all attributes
145+
self.categorical_features_implied = []
146+
for iAtt in range(len(statistics_general)):
147+
isNominal = True
148+
for iInst in range(len(X)):
149+
if not np.isnan(X[iInst][iAtt]) and math.floor(X[iInst][iAtt]) != X[iInst][iAtt]:
150+
isNominal = False
151+
break
152+
if isNominal:
153+
# bookkeeping, for testing purposes
154+
self.categorical_features_implied.append(iAtt)
155+
self.statistics_[iAtt] = statistics_nominal[iAtt]
156+
157+
return self
158+
159+
160+
def transform(self, X):
161+
"""Impute all missing values in X.
162+
Parameters
163+
----------
164+
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
165+
The input data to complete.
166+
"""
167+
check_is_fitted(self, 'statistics_')
168+
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
169+
force_all_finite=False, copy=self.copy)
170+
statistics = self.statistics_
171+
if X.shape[1] != statistics.shape[0]:
172+
raise ValueError("X has %d features per sample, expected %d"
173+
% (X.shape[1], self.statistics_.shape[0]))
174+
175+
# impute completely empty columns with a constant
176+
if self.empty_attribute_constant is not None:
177+
invalid_mask = np.isnan(statistics)
178+
X[:, invalid_mask] = self.empty_attribute_constant
179+
self.statistics_[invalid_mask] = self.empty_attribute_constant
180+
181+
# Delete the invalid rows/columns
182+
invalid_mask = np.isnan(statistics)
183+
valid_mask = np.logical_not(invalid_mask)
184+
valid_statistics = statistics[valid_mask]
185+
valid_statistics_indexes = np.where(valid_mask)[0]
186+
missing = np.arange(X.shape[not self.axis])[invalid_mask]
187+
188+
if invalid_mask.any():
189+
if self.verbose:
190+
warnings.warn("Deleting features without "
191+
"observed values: %s" % missing)
192+
X = X[:, valid_statistics_indexes]
193+
194+
# Do actual imputation
195+
if sparse.issparse(X) and self.missing_values != 0:
196+
mask = _get_mask(X.data, self.missing_values)
197+
indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
198+
np.diff(X.indptr))[mask]
199+
200+
X.data[mask] = astype(valid_statistics[indexes], X.dtype,
201+
copy=False)
202+
else:
203+
if sparse.issparse(X):
204+
X = X.toarray()
205+
206+
mask = _get_mask(X, self.missing_values)
207+
n_missing = np.sum(mask, axis=self.axis)
208+
values = np.repeat(valid_statistics, n_missing)
209+
210+
coordinates = np.where(mask.transpose())[::-1]
211+
212+
X[coordinates] = values
131213

132-
return self
214+
return X
Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,74 @@
11
import openml, math, collections
2+
import numpy as np
23
from openml.testing import TestBase
34
from openml.utils.preprocessing import ConditionalImputer
45

56
class OpenMLTaskTest(TestBase):
67

7-
def test_impute_anneal(self):
8-
task_id = 2
9-
10-
task = openml.tasks.get_task(task_id)
11-
dataset = task.get_dataset()
12-
X, _ = dataset.get_data(target=task.target_name)
13-
nominal_indeces = dataset.get_features_by_type('nominal', exclude=[38])
14-
clf = ConditionalImputer(strategy="median", strategy_nominal="most_frequent", indeces_nominal=nominal_indeces)
8+
def _do_test(self, dataset, X, nominal_indices, clf):
159
clf.fit(X)
1610
X_prime = clf.transform(X)
1711

12+
# in case of smart guessing nominal attributes, we accept false positives, but no false negatives
13+
for column_idx in nominal_indices:
14+
if clf.categorical_features_implied is not None:
15+
assert column_idx in clf.categorical_features_implied, "False negative with smart nominal detector"
16+
1817
correction = 0
1918
for idx, value in enumerate(clf.statistics_):
2019
if math.isnan(value):
2120
# imputer can only give nan if all values are unknown
2221
correction += 1
23-
assert dataset.features[idx].number_missing_values == len(
24-
X), "Imputer calculated nan for usable feature"
22+
assert dataset.features[idx].number_missing_values == len(X), "Imputer calculated nan for usable feature"
2523
else:
2624
# check if nominal values get imputed correctly
27-
if idx in nominal_indeces:
25+
if idx in nominal_indices:
2826
assert value == math.floor(value) == math.ceil(value), "Wrong impute value for nominal feature"
2927

3028
corrected_index = idx - correction # for x prime
3129
# check if imputation succeeded
3230
counter = collections.Counter(X_prime[:, corrected_index])
3331
occurances_after = counter[value]
34-
assert occurances_after >= dataset.features[idx].number_missing_values
32+
assert occurances_after >= dataset.features[idx].number_missing_values
33+
34+
return X_prime
35+
36+
def test_impute_indices(self):
37+
task_ids = [2,59]
38+
39+
for task_id in task_ids:
40+
task = openml.tasks.get_task(task_id)
41+
dataset = task.get_dataset()
42+
X, _ = dataset.get_data(target=task.target_name)
43+
nominal_indices = dataset.get_features_by_type('nominal', exclude=[len(dataset.features)-1])
44+
clf = ConditionalImputer(strategy="median", strategy_nominal="most_frequent", categorical_features=nominal_indices, verbose=True)
45+
46+
self._do_test(dataset, X, nominal_indices, clf)
47+
48+
49+
def test_impute_smart(self):
50+
task_ids = [2,59]
51+
52+
for task_id in task_ids:
53+
task = openml.tasks.get_task(task_id)
54+
dataset = task.get_dataset()
55+
X, _ = dataset.get_data(target=task.target_name)
56+
nominal_indices = dataset.get_features_by_type('nominal', exclude=[len(dataset.features)-1])
57+
clf = ConditionalImputer(strategy="median", strategy_nominal="most_frequent", categorical_features=None, verbose=True)
58+
59+
self._do_test(dataset, X, nominal_indices, clf)
60+
61+
def test_impute_with_constant(self):
62+
task_ids = [2]
63+
64+
for task_id in task_ids:
65+
task = openml.tasks.get_task(task_id)
66+
dataset = task.get_dataset()
67+
X, _ = dataset.get_data(target=task.target_name)
68+
nominal_indices = dataset.get_features_by_type('nominal', exclude=[len(dataset.features) - 1])
69+
clf = ConditionalImputer(strategy="median", strategy_nominal="most_frequent", categorical_features=None,
70+
verbose=True, empty_attribute_constant=-1)
71+
72+
X_prime = self._do_test(dataset, X, nominal_indices, clf)
73+
assert np.isnan(np.min(X_prime)) == False, 'Result contains nans'
74+
assert X_prime.shape == X.shape

0 commit comments

Comments
 (0)