Skip to content

Commit 7d24294

Browse files
committed
finished conditional imputer with tests
1 parent b8bb34b commit 7d24294

4 files changed

Lines changed: 58 additions & 9 deletions

File tree

openml/datasets/data_feature.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,24 @@ class OpenMLDataFeature(object):
1212
can be nominal, numeric, string, date (corresponds to arff)
1313
nominal_values : list(str)
1414
list of the possible values, in case of nominal attribute
15+
number_missing_values : int
1516
"""
1617
LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
1718

18-
def __init__(self, index, name, data_type, nominal_values):
19+
def __init__(self, index, name, data_type, nominal_values, number_missing_values):
    """Validate the feature description and store it on the instance.

    Parameters
    ----------
    index : int
    name : str
    data_type : str
        must be one of ``self.LEGAL_DATA_TYPES``
    nominal_values : list or None
        only type-checked when it is not None
    number_missing_values : int

    Raises
    ------
    AssertionError
        if any argument fails the checks below
    """
    # Exact-type checks, run in the order the arguments are declared.
    exact_type_checks = (
        (index, int, "Index is of wrong datatype"),
        (name, str, "Name is of wrong datatype"),
        (data_type, str, "Data_type is of wrong datatype"),
    )
    for value, expected_type, message in exact_type_checks:
        assert type(value) is expected_type, message

    assert data_type in self.LEGAL_DATA_TYPES, "data type should be in %s" %str(self.LEGAL_DATA_TYPES)
    # None is allowed here; any other value has to be a plain list.
    assert nominal_values is None or type(nominal_values) is list, "Nominal_values is of wrong datatype"
    assert type(number_missing_values) is int, "number_missing_values is of wrong datatype"

    self.index = index
    self.name = name
    self.data_type = data_type
    self.nominal_values = nominal_values
    self.number_missing_values = number_missing_values
3033

3134
def __str__(self):
    """Return a short bracketed summary: index, name and data type."""
    summary_fields = (self.index, self.name, self.data_type)
    return "[%d - %s (%s)]" % summary_fields

openml/datasets/dataset.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,15 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7676
self.data_file = data_file
7777
self.features = {}
7878

79-
for xmlfeature in features['oml:feature']:
79+
for idx, xmlfeature in enumerate(features['oml:feature']):
8080
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
8181
xmlfeature['oml:name'],
8282
xmlfeature['oml:data_type'],
83-
None) #todo add nominal values
83+
None, #todo add nominal values (currently not in database)
84+
int(xmlfeature['oml:number_of_missing_values']))
85+
assert idx == feature.index, "Data features not provided in right order"
8486
self.features[feature.index] = feature
8587

86-
print("dataset %s initialized" %dataset_id)
87-
8888

8989
if data_file is not None:
9090
if self._data_features_supported():
@@ -309,6 +309,18 @@ def retrieve_class_labels(self, target_name='class'):
309309
else:
310310
return None
311311

312+
def get_features_by_type(self, data_type, exclude=None):
    """Return the indices of all features that have the given data type.

    Parameters
    ----------
    data_type : str
        must be one of ``OpenMLDataFeature.LEGAL_DATA_TYPES``
    exclude : list, optional
        feature indices to leave out of the result (in many cases we
        want to exclude, for example, the target feature). ``None``,
        the default, excludes nothing.

    Returns
    -------
    list
        keys of ``self.features`` whose feature has the requested data
        type and that are not listed in ``exclude``

    Raises
    ------
    AssertionError
        if ``exclude`` is neither None nor a list, or if ``data_type``
        is not a legal data type
    """
    # The signature advertises None as the default, so it must be a valid
    # value: previously the type assertion below rejected it outright.
    if exclude is None:
        exclude = []
    assert type(exclude) is list, "Exclude should be a list of indeces"
    assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"

    return [
        idx for idx in self.features
        if idx not in exclude and self.features[idx].data_type == data_type
    ]
323+
312324
def publish(self):
313325
"""Publish the dataset on the OpenML server.
314326

openml/utils/preprocessing.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,12 @@ class ConditionalImputer(Imputer):
6868
"""
6969
def __init__(self, missing_values="NaN", strategy="mean",
7070
strategy_nominal="most_frequent",
71-
indexes_nominal=None,
71+
indeces_nominal=None,
7272
axis=0, verbose=0, copy=True):
7373
self.missing_values = missing_values
7474
self.strategy = strategy
7575
self.strategy_nominal = strategy_nominal
76-
self.indexes_nominal = indexes_nominal
76+
self.indeces_nominal = indeces_nominal
7777
self.axis = axis
7878
self.verbose = verbose
7979
self.copy = copy
@@ -125,8 +125,8 @@ def fit(self, X, y=None):
125125

126126
# here the indexes of nominal values get set
127127
self.statistics_ = statistics_general
128-
if self.indexes_nominal is not None:
129-
for i in self.indexes_nominal:
128+
if self.indeces_nominal is not None:
129+
for i in self.indeces_nominal:
130130
self.statistics_[i] = statistics_nominal[i]
131131

132132
return self
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import openml, math, collections
2+
from openml.testing import TestBase
3+
from openml.utils.preprocessing import ConditionalImputer
4+
5+
class OpenMLTaskTest(TestBase):
    """Checks ConditionalImputer against the dataset of an OpenML task."""

    def test_impute_anneal(self):
        """Fit and apply the imputer on task 2 and sanity-check the result.

        For every fitted statistic the test verifies that a NaN statistic
        only occurs for a feature without any known value, that statistics
        of nominal features are whole numbers, and that the imputed value
        occurs at least as often as the feature had missing values.
        """
        task_id = 2

        task = openml.tasks.get_task(task_id)
        dataset = task.get_dataset()
        X, _ = dataset.get_data(target=task.target_name)
        # index 38 is excluded explicitly -- presumably the target feature
        nominal_indeces = dataset.get_features_by_type('nominal', exclude=[38])
        imputer = ConditionalImputer(
            strategy="median",
            strategy_nominal="most_frequent",
            indeces_nominal=nominal_indeces,
        )
        imputer.fit(X)
        X_prime = imputer.transform(X)

        # Number of NaN statistics seen so far; used to shift indices into
        # X_prime (presumably those columns are missing from it -- confirm).
        nan_statistics_seen = 0
        for idx, value in enumerate(imputer.statistics_):
            if math.isnan(value):
                # imputer can only give nan if all values are unknown
                nan_statistics_seen += 1
                assert dataset.features[idx].number_missing_values == len(
                    X), "Imputer calculated nan for usable feature"
                continue

            # a nominal feature has to be imputed with a whole number
            if idx in nominal_indeces:
                assert value == math.floor(value) == math.ceil(value), "Wrong impute value for nominal feature"

            column_in_x_prime = idx - nan_statistics_seen
            # the imputed value must show up at least once per missing entry
            value_counts = collections.Counter(X_prime[:, column_in_x_prime])
            occurrences_after = value_counts[value]
            assert occurrences_after >= dataset.features[idx].number_missing_values

0 commit comments

Comments
 (0)