Skip to content

Commit b8bb34b

Browse files
committed
Added support for data features,
added the conditional imputer (for our benchmark algorithms), setup for function that discriminates between feature types
1 parent 34efa1b commit b8bb34b

5 files changed

Lines changed: 180 additions & 4 deletions

File tree

openml/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .functions import (list_datasets, check_datasets_active,
22
get_datasets, get_dataset)
33
from .dataset import OpenMLDataset
4+
from .data_feature import OpenMLDataFeature
45

56
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
67
'OpenMLDataset', 'list_datasets']

openml/datasets/data_feature.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
class OpenMLDataFeature(object):
    """Data Feature (a.k.a. Attribute) object.

    Parameters
    ----------
    index : int
        The index of this feature
    name : string
        Name of the feature
    data_type : string
        can be nominal, numeric, string, date (corresponds to arff)
    nominal_values : list(str) or None
        list of the possible values, in case of nominal attribute

    Raises
    ------
    TypeError
        If ``index``, ``name``, ``data_type`` or ``nominal_values`` has the
        wrong type.
    ValueError
        If ``data_type`` is not one of ``LEGAL_DATA_TYPES``.
    """
    LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']

    # Accept every text type of the running interpreter. On Python 2 this
    # includes ``unicode`` (presumably what XML-parsed names arrive as).
    try:
        _STRING_TYPES = (basestring,)  # noqa: F821 -- Python 2 only
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, index, name, data_type, nominal_values):
        # Explicit raises instead of ``assert``: assertions are removed when
        # Python runs with -O, which would silently disable all validation.
        if not isinstance(index, int):
            raise TypeError("Index is of wrong datatype")
        if not isinstance(name, self._STRING_TYPES):
            raise TypeError("Name is of wrong datatype")
        if not isinstance(data_type, self._STRING_TYPES):
            raise TypeError("Data_type is of wrong datatype")
        if data_type not in self.LEGAL_DATA_TYPES:
            raise ValueError("data type should be in %s"
                             % str(self.LEGAL_DATA_TYPES))
        if nominal_values is not None and not isinstance(nominal_values,
                                                         list):
            raise TypeError("Nominal_values is of wrong datatype")

        self.index = index
        self.name = name
        self.data_type = data_type
        self.nominal_values = nominal_values

    def __str__(self):
        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)

openml/datasets/dataset.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import scipy.sparse
1111
import xmltodict
1212

13+
from ..datasets.data_feature import OpenMLDataFeature
1314
from ..exceptions import PyOpenMLError
1415

1516
if sys.version_info[0] >= 3:
@@ -73,7 +74,17 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7374
self.update_comment = update_comment
7475
self.md5_cheksum = md5_checksum
7576
self.data_file = data_file
76-
self.features = features
77+
self.features = {}
78+
79+
for xmlfeature in features['oml:feature']:
80+
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
81+
xmlfeature['oml:name'],
82+
xmlfeature['oml:data_type'],
83+
None) #todo add nominal values
84+
self.features[feature.index] = feature
85+
86+
print("dataset %s initialized" %dataset_id)
87+
7788

7889
if data_file is not None:
7990
if self._data_features_supported():
@@ -349,8 +360,8 @@ def _to_xml(self):
349360

350361
def _data_features_supported(self):
351362
if self.features is not None:
352-
for feature in self.features['oml:feature']:
353-
if feature['oml:data_type'] not in ['numeric', 'nominal']:
363+
for idx in self.features:
364+
if self.features[idx].data_type not in ['numeric', 'nominal']:
354365
return False
355366
return True
356367
return True

openml/utils/preprocessing.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
2+
from sklearn.preprocessing.imputation import Imputer, check_array, _get_mask, _most_frequent
3+
4+
import warnings
5+
6+
import numpy as np
7+
import numpy.ma as ma
8+
from scipy import sparse
9+
from sklearn.utils.fixes import astype
10+
from sklearn.utils.sparsefuncs import _get_median
11+
from sklearn.utils.validation import check_is_fitted
12+
from sklearn.utils.validation import FLOAT_DTYPES
13+
14+
15+
class ConditionalImputer(Imputer):
    """Imputation transformer for completing missing values.

    Works like the parent ``Imputer``, except that the columns listed in
    ``indexes_nominal`` get their fill value from ``strategy_nominal``
    instead of ``strategy``.

    Read more in the :ref:`User Guide <imputation>`.

    Parameters
    ----------
    missing_values : integer or "NaN", optional (default="NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For missing values encoded as np.nan,
        use the string value "NaN".

    strategy : string, optional (default="mean")
        The imputation strategy for columns not listed in
        `indexes_nominal`.

        - If "mean", then replace missing values using the mean along
          the axis.
        - If "median", then replace missing values using the median along
          the axis.
        - If "most_frequent", then replace missing using the most frequent
          value along the axis.

    strategy_nominal : string, optional (default="most_frequent")
        The imputation strategy for the columns listed in
        `indexes_nominal`. Accepts the same values as `strategy`.

    indexes_nominal : iterable of int or None, optional (default=None)
        Column indexes whose fill value is computed with
        `strategy_nominal`. If None, every column uses `strategy`.

    axis : integer, optional (default=0)
        The axis along which to impute.

        - If `axis=0`, then impute along columns.
        - If `axis=1`, then impute along rows. (Not supported)

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
        - If `axis=0` and X is encoded as a CSR matrix.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature.

    Notes
    -----
    - Only ``axis=0`` is accepted; `fit` raises ``ValueError`` otherwise.
    - Transforming is inherited from ``Imputer``; per its documentation,
      columns which only contained missing values at `fit` are discarded
      upon `transform`.
    """
    def __init__(self, missing_values="NaN", strategy="mean",
                 strategy_nominal="most_frequent",
                 indexes_nominal=None,
                 axis=0, verbose=0, copy=True):
        self.missing_values = missing_values
        self.strategy = strategy
        self.strategy_nominal = strategy_nominal
        self.indexes_nominal = indexes_nominal
        self.axis = axis
        self.verbose = verbose
        self.copy = copy

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.
        y : ignored

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``strategy`` or ``strategy_nominal`` is not one of "mean",
            "median", "most_frequent", or if ``axis`` is not 0.
        """
        # Check parameters
        allowed_strategies = ["mean", "median", "most_frequent"]
        if self.strategy not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy={1}".format(allowed_strategies,
                                                        self.strategy))

        # Validate the nominal strategy up front as well, so that a typo is
        # reported clearly instead of failing later while the per-column
        # statistics are being merged.
        if self.strategy_nominal not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy_nominal={1}".format(
                                 allowed_strategies, self.strategy_nominal))

        if self.axis not in [0]:
            raise ValueError("Can only impute missing values on axis 0 (axis 1 not supported), "
                             " got axis={0}".format(self.axis))

        X = check_array(X, accept_sparse='csc', dtype=np.float64,
                        force_all_finite=False)

        # Sparse and dense inputs use different parent-class helpers with
        # the same call signature.
        if sparse.issparse(X):
            compute_statistics = self._sparse_fit
        else:
            compute_statistics = self._dense_fit

        self.statistics_ = compute_statistics(X,
                                              self.strategy,
                                              self.missing_values,
                                              self.axis)

        # Only compute the nominal statistics when some column needs them,
        # then overwrite the general fill value for exactly those columns.
        if self.indexes_nominal is not None:
            statistics_nominal = compute_statistics(X,
                                                    self.strategy_nominal,
                                                    self.missing_values,
                                                    self.axis)
            for i in self.indexes_nominal:
                self.statistics_[i] = statistics_nominal[i]

        return self

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ liac-arff>=2.1.1dev
55
xmltodict
66
nose
77
requests
8-
scikit-learn
8+
scikit-learn>=0.18
99
nbformat

0 commit comments

Comments
 (0)