Added support for data features,

janvanrijn · janvanrijn · commit b8bb34b5b27f · 2017-03-05T16:49:51.000+01:00
added the conditional imputer (for our benchmark algorithms),
setup for function that discriminates between feature types
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -1,6 +1,8 @@
 from .functions import (list_datasets, check_datasets_active,
                         get_datasets, get_dataset)
 from .dataset import OpenMLDataset
+from .data_feature import OpenMLDataFeature
 
-__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
-           'OpenMLDataset', 'list_datasets']
+# Public API of the datasets subpackage; lists every name imported above.
+__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
+           'OpenMLDataFeature', 'OpenMLDataset', 'list_datasets']
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -0,0 +1,49 @@
+
+class OpenMLDataFeature(object):
+    """Data Feature (a.k.a. Attribute) object.
+
+    Parameters
+    ----------
+    index : int
+        The index of this feature.
+    name : str
+        Name of the feature.
+    data_type : str
+        One of ``LEGAL_DATA_TYPES``: 'nominal', 'numeric', 'string' or
+        'date' (corresponds to the ARFF attribute types).
+    nominal_values : list(str) or None
+        List of the possible values in case of a nominal attribute;
+        ``None`` when not available.
+
+    Raises
+    ------
+    TypeError
+        If an argument is of the wrong type.
+    ValueError
+        If ``data_type`` is not one of ``LEGAL_DATA_TYPES``.
+    """
+    LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
+
+    def __init__(self, index, name, data_type, nominal_values):
+        # Explicit exceptions instead of ``assert``: assertions are stripped
+        # when Python runs with -O, which would silently disable validation.
+        if not isinstance(index, int):
+            raise TypeError("Index is of wrong datatype")
+        if not isinstance(name, str):
+            raise TypeError("Name is of wrong datatype")
+        if not isinstance(data_type, str):
+            raise TypeError("Data_type is of wrong datatype")
+        if data_type not in self.LEGAL_DATA_TYPES:
+            raise ValueError("data type should be in %s"
+                             % str(self.LEGAL_DATA_TYPES))
+        if nominal_values is not None and not isinstance(nominal_values, list):
+            raise TypeError("Nominal_values is of wrong datatype")
+
+        self.index = index
+        self.name = name
+        self.data_type = data_type
+        self.nominal_values = nominal_values
+
+    def __str__(self):
+        """Return a short ``[index - name (data_type)]`` description."""
+        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -10,6 +10,7 @@
 import scipy.sparse
 import xmltodict
 
+from ..datasets.data_feature import OpenMLDataFeature
 from ..exceptions import PyOpenMLError
 
 if sys.version_info[0] >= 3:
@@ -73,7 +74,17 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
         self.update_comment = update_comment
         self.md5_cheksum = md5_checksum
         self.data_file = data_file
-        self.features = features
+        self.features = {}
+
+        for xmlfeature in features['oml:feature']:
+            feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
+                                        xmlfeature['oml:name'],
+                                        xmlfeature['oml:data_type'],
+                                        None) #todo add nominal values
+            self.features[feature.index] = feature
+
+        print("dataset %s initialized" %dataset_id)
+
 
         if data_file is not None:
             if self._data_features_supported():
@@ -349,8 +360,13 @@ def _to_xml(self):
 
     def _data_features_supported(self):
+        """Check whether every feature has a supported data type.
+
+        Returns ``False`` as soon as a feature is neither 'numeric' nor
+        'nominal', ``True`` otherwise (also when no features are set).
+        """
         if self.features is not None:
-            for feature in self.features['oml:feature']:
-                if feature['oml:data_type'] not in ['numeric', 'nominal']:
+            for feature in self.features.values():
+                if feature.data_type not in ('numeric', 'nominal'):
                     return False
             return True
         return True
diff --git a/openml/utils/preprocessing.py b/openml/utils/preprocessing.py
@@ -0,0 +1,147 @@
+
+from sklearn.preprocessing.imputation import Imputer, check_array, _get_mask, _most_frequent
+
+import warnings
+
+import numpy as np
+import numpy.ma as ma
+from scipy import sparse
+from sklearn.utils.fixes import astype
+from sklearn.utils.sparsefuncs import _get_median
+from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES
+
+
+class ConditionalImputer(Imputer):
+    """Imputation transformer with a separate strategy for nominal features.
+
+    Works like :class:`sklearn.preprocessing.Imputer`, except that the
+    statistics of the columns listed in `indexes_nominal` are computed with
+    `strategy_nominal` instead of `strategy`.
+
+    Read more in the :ref:`User Guide <imputation>`.
+
+    Parameters
+    ----------
+    missing_values : integer or "NaN", optional (default="NaN")
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. For missing values encoded as np.nan,
+        use the string value "NaN".
+
+    strategy : string, optional (default="mean")
+        The imputation strategy for the features that are not listed in
+        `indexes_nominal`.
+
+        - If "mean", then replace missing values using the mean along
+          the axis.
+        - If "median", then replace missing values using the median along
+          the axis.
+        - If "most_frequent", then replace missing using the most frequent
+          value along the axis.
+
+    strategy_nominal : string, optional (default="most_frequent")
+        The imputation strategy for the features listed in
+        `indexes_nominal`. Accepts the same values as `strategy`.
+
+    indexes_nominal : iterable of int or None, optional (default=None)
+        Column indexes of the nominal features. If None, every feature is
+        imputed with `strategy`.
+
+    axis : integer, optional (default=0)
+        The axis along which to impute. Only `axis=0` (impute along
+        columns) is supported; `fit` raises a ValueError otherwise.
+
+    verbose : integer, optional (default=0)
+        Controls the verbosity of the imputer.
+
+    copy : boolean, optional (default=True)
+        If True, a copy of X will be created. If False, imputation will
+        be done in-place whenever possible. Note that, in the following cases,
+        a new copy will always be made, even if `copy=False`:
+
+        - If X is not an array of floating values;
+        - If X is sparse and `missing_values=0`;
+        - If `axis=0` and X is encoded as a CSR matrix.
+
+    Attributes
+    ----------
+    statistics_ : array of shape (n_features,)
+        The imputation fill value for each feature.
+
+    Notes
+    -----
+    - Columns which only contained missing values at `fit` are discarded
+      upon `transform`.
+    """
+    def __init__(self, missing_values="NaN", strategy="mean",
+                 strategy_nominal="most_frequent",
+                 indexes_nominal=None,
+                 axis=0, verbose=0, copy=True):
+        self.missing_values = missing_values
+        self.strategy = strategy
+        self.strategy_nominal = strategy_nominal
+        self.indexes_nominal = indexes_nominal
+        self.axis = axis
+        self.verbose = verbose
+        self.copy = copy
+
+    def fit(self, X, y=None):
+        """Fit the imputer on X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Input data, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.
+        y : ignored
+
+        Returns
+        -------
+        self : object
+            Returns self.
+
+        Raises
+        ------
+        ValueError
+            If `strategy` or `strategy_nominal` is not an allowed strategy,
+            or if `axis` is not 0.
+        """
+        # Check parameters
+        allowed_strategies = ["mean", "median", "most_frequent"]
+        if self.strategy not in allowed_strategies:
+            raise ValueError("Can only use these strategies: {0} "
+                             " got strategy={1}".format(allowed_strategies,
+                                                        self.strategy))
+        # Validate the nominal strategy too, so a typo fails with the same
+        # clear error as for `strategy` rather than inside the fitting code.
+        if self.strategy_nominal not in allowed_strategies:
+            raise ValueError("Can only use these strategies: {0} "
+                             " got strategy_nominal={1}".format(
+                                 allowed_strategies, self.strategy_nominal))
+
+        if self.axis != 0:
+            raise ValueError("Can only impute missing values on axis 0 (axis 1 not supported), "
+                             " got axis={0}".format(self.axis))
+
+        X = check_array(X, accept_sparse='csc', dtype=np.float64,
+                        force_all_finite=False)
+
+        # The parent class provides one fitting routine per matrix layout.
+        if sparse.issparse(X):
+            fit_statistics = self._sparse_fit
+        else:
+            fit_statistics = self._dense_fit
+
+        self.statistics_ = fit_statistics(X, self.strategy,
+                                          self.missing_values, self.axis)
+
+        # Replace the statistics of the nominal columns; the second fitting
+        # pass is only needed when nominal columns were specified.
+        if self.indexes_nominal is not None:
+            statistics_nominal = fit_statistics(X, self.strategy_nominal,
+                                                self.missing_values,
+                                                self.axis)
+            for i in self.indexes_nominal:
+                self.statistics_[i] = statistics_nominal[i]
+
+        return self
diff --git a/requirements.txt b/requirements.txt
@@ -5,5 +5,5 @@ liac-arff>=2.1.1dev
 xmltodict
 nose
 requests
-scikit-learn
+scikit-learn>=0.18
 nbformat