
Commit 30a53f9

Merge branch 'fix373' into develop
2 parents: f00f5af + a2d682c · commit 30a53f9

6 files changed

Lines changed: 160 additions & 48 deletions

openml/datasets/dataset.py

Lines changed: 49 additions & 30 deletions
@@ -10,6 +10,8 @@
 import scipy.sparse
 import xmltodict
 
+from ..exceptions import PyOpenMLError
+
 if sys.version_info[0] >= 3:
     import pickle
 else:
@@ -45,7 +47,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
                  row_id_attribute=None, ignore_attribute=None,
                  version_label=None, citation=None, tag=None, visibility=None,
                  original_data_url=None, paper_url=None, update_comment=None,
-                 md5_checksum=None, data_file=None):
+                 md5_checksum=None, data_file=None, features=None):
         # Attributes received by querying the RESTful API
         self.dataset_id = int(dataset_id) if dataset_id is not None else None
         self.name = name
@@ -71,38 +73,41 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
         self.update_comment = update_comment
         self.md5_cheksum = md5_checksum
         self.data_file = data_file
+        self.features = features
+
         if data_file is not None:
-            self.data_pickle_file = data_file.replace('.arff', '.pkl')
+            if self._data_features_supported():
+                self.data_pickle_file = data_file.replace('.arff', '.pkl')
 
-            if os.path.exists(self.data_pickle_file):
-                logger.debug("Data pickle file already exists.")
-            else:
-                try:
-                    data = self._get_arff(self.format)
-                except OSError as e:
-                    logger.critical("Please check that the data file %s is there "
-                                    "and can be read.", self.data_file)
-                    raise e
-
-                categorical = [False if type(type_) != list else True
-                               for name, type_ in data['attributes']]
-                attribute_names = [name for name, type_ in data['attributes']]
-
-                if isinstance(data['data'], tuple):
-                    X = data['data']
-                    X_shape = (max(X[1]) + 1, max(X[2]) + 1)
-                    X = scipy.sparse.coo_matrix(
-                        (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
-                    X = X.tocsr()
-                elif isinstance(data['data'], list):
-                    X = np.array(data['data'], dtype=np.float32)
+                if os.path.exists(self.data_pickle_file):
+                    logger.debug("Data pickle file already exists.")
                 else:
-                    raise Exception()
-
-                with open(self.data_pickle_file, "wb") as fh:
-                    pickle.dump((X, categorical, attribute_names), fh, -1)
-                logger.debug("Saved dataset %d: %s to file %s" %
-                             (self.dataset_id, self.name, self.data_pickle_file))
+                    try:
+                        data = self._get_arff(self.format)
+                    except OSError as e:
+                        logger.critical("Please check that the data file %s is there "
+                                        "and can be read.", self.data_file)
+                        raise e
+
+                    categorical = [False if type(type_) != list else True
+                                   for name, type_ in data['attributes']]
+                    attribute_names = [name for name, type_ in data['attributes']]
+
+                    if isinstance(data['data'], tuple):
+                        X = data['data']
+                        X_shape = (max(X[1]) + 1, max(X[2]) + 1)
+                        X = scipy.sparse.coo_matrix(
+                            (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
+                        X = X.tocsr()
+                    elif isinstance(data['data'], list):
+                        X = np.array(data['data'], dtype=np.float32)
+                    else:
+                        raise Exception()
+
+                    with open(self.data_pickle_file, "wb") as fh:
+                        pickle.dump((X, categorical, attribute_names), fh, -1)
+                    logger.debug("Saved dataset %d: %s to file %s" %
+                                 (self.dataset_id, self.name, self.data_pickle_file))
 
     def __eq__(self, other):
         if type(other) != OpenMLDataset:
@@ -132,6 +137,9 @@ def _get_arff(self, format):
         # 32 bit system...currently 120mb (just a little bit more than covtype)
         import struct
 
+        if not self._data_features_supported():
+            raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')
+
         filename = self.data_file
         bits = (8 * struct.calcsize("P"))
         if bits != 64 and os.path.getsize(filename) > 120000000:
@@ -172,6 +180,9 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
         """
         rval = []
 
+        if not self._data_features_supported():
+            raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')
+
         path = self.data_pickle_file
         if not os.path.exists(path):
             raise ValueError("Cannot find a ndarray file for dataset %s at"
@@ -336,3 +347,11 @@ def _to_xml(self):
             xml_dataset += "<oml:{0}>{1}</oml:{0}>\n".format(prop, content)
         xml_dataset += "</oml:data_set_description>"
         return xml_dataset
+
+    def _data_features_supported(self):
+        if self.features is not None:
+            for feature in self.features['oml:feature']:
+                if feature['oml:data_type'] not in ['numeric', 'nominal']:
+                    return False
+            return True
+        return True
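
The gate the commit introduces lives in _data_features_supported(): a dataset is only pickled and served if every feature is numeric or nominal. As a minimal illustration (not part of the commit; only the 'oml:feature' and 'oml:data_type' keys appear in the diff, the 'oml:index' and 'oml:name' keys are assumed), this is the dict shape the method inspects after features.xml has been parsed by xmltodict:

    features = {
        'oml:feature': [
            {'oml:index': '0', 'oml:name': 'age', 'oml:data_type': 'numeric'},
            {'oml:index': '1', 'oml:name': 'class', 'oml:data_type': 'nominal'},
            {'oml:index': '2', 'oml:name': 'comment', 'oml:data_type': 'string'},  # hypothetical string feature
        ]
    }

    def data_features_supported(features):
        # Same rule as OpenMLDataset._data_features_supported(): only numeric
        # and nominal features can be converted to the float32 array / sparse matrix.
        if features is None:
            return True
        return all(f['oml:data_type'] in ['numeric', 'nominal']
                   for f in features['oml:feature'])

    print(data_features_supported(features))  # False -> _get_arff() and get_data() raise PyOpenMLError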

openml/datasets/functions.py

Lines changed: 23 additions & 5 deletions
@@ -72,7 +72,8 @@ def _get_cached_dataset(dataset_id):
     """
     description = _get_cached_dataset_description(dataset_id)
     arff_file = _get_cached_dataset_arff(dataset_id)
-    dataset = _create_dataset_from_description(description, arff_file)
+    features = _get_cached_dataset_features(dataset_id)
+    dataset = _create_dataset_from_description(description, features, arff_file)
 
     return dataset
 
@@ -93,6 +94,22 @@ def _get_cached_dataset_description(dataset_id):
     raise OpenMLCacheException("Dataset description for dataset id %d not "
                                "cached" % dataset_id)
 
+def _get_cached_dataset_features(dataset_id):
+    for cache_dir in [config.get_cache_directory(),
+                      config.get_private_directory()]:
+        did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
+        features_file = os.path.join(did_cache_dir, "features.xml")
+        try:
+            with io.open(features_file, encoding='utf8') as fh:
+                features_xml = fh.read()
+        except (IOError, OSError):
+            continue
+
+        return xmltodict.parse(features_xml)["oml:data_features"]
+
+    raise OpenMLCacheException("Dataset features for dataset id %d not "
+                               "cached" % dataset_id)
+
 
 def _get_cached_dataset_arff(dataset_id):
     for cache_dir in [config.get_cache_directory(),
@@ -255,14 +272,14 @@ def get_dataset(dataset_id):
     try:
         description = _get_dataset_description(did_cache_dir, dataset_id)
         arff_file = _get_dataset_arff(did_cache_dir, description)
-        # TODO not used yet, figure out what to do with them...
         features = _get_dataset_features(did_cache_dir, dataset_id)
+        # TODO not used yet, figure out what to do with this...
         qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
     except Exception as e:
         _remove_dataset_cache_dir(did_cache_dir)
         raise e
 
-    dataset = _create_dataset_from_description(description, arff_file)
+    dataset = _create_dataset_from_description(description, features, arff_file)
     return dataset
 
 
@@ -463,7 +480,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
                      'Please do this manually!' % did_cache_dir)
 
 
-def _create_dataset_from_description(description, arff_file):
+def _create_dataset_from_description(description, features, arff_file):
     """Create a dataset object from a description dict.
 
     Parameters
@@ -502,5 +519,6 @@ def _create_dataset_from_description(description, arff_file):
         description.get("oml:paper_url"),
         description.get("oml:update_comment"),
         description.get("oml:md5_checksum"),
-        data_file=arff_file)
+        data_file=arff_file,
+        features=features)
     return dataset
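
For reference, a sketch of what _get_cached_dataset_features hands back. The features file is looked up in the dataset's cache directory (<cache>/datasets/<id>/features.xml); the XML below is hand-written for illustration, not taken from the OpenML server. With xmltodict, the return value is a dict whose 'oml:feature' entry is a list of per-feature dicts when more than one feature element is present:

    import xmltodict

    example_xml = """<oml:data_features xmlns:oml="http://openml.org/openml">
      <oml:feature>
        <oml:index>0</oml:index>
        <oml:name>sepallength</oml:name>
        <oml:data_type>numeric</oml:data_type>
      </oml:feature>
      <oml:feature>
        <oml:index>1</oml:index>
        <oml:name>class</oml:name>
        <oml:data_type>nominal</oml:data_type>
      </oml:feature>
    </oml:data_features>"""

    features = xmltodict.parse(example_xml)["oml:data_features"]
    print(len(features["oml:feature"]))                 # 2
    print(features["oml:feature"][0]["oml:data_type"])  # 'numeric'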

tests/datasets/test_datasets.py

Lines changed: 11 additions & 2 deletions
@@ -11,16 +11,17 @@
 
 import openml
 from openml import OpenMLDataset
-from openml.exceptions import OpenMLCacheException
+from openml.exceptions import OpenMLCacheException, PyOpenMLError
 from openml.util import is_string
 from openml.testing import TestBase
 
 from openml.datasets.functions import (_get_cached_dataset,
+                                       _get_cached_dataset_features,
                                        _get_cached_datasets,
                                        _get_dataset_description,
                                        _get_dataset_arff,
                                        _get_dataset_features,
-                                       _get_dataset_qualities)
+                                       _get_dataset_qualities, get_dataset)
 
 
 class TestOpenMLDataset(TestBase):
@@ -44,7 +45,10 @@ def test__get_cached_datasets(self, _list_cached_datasets_mock):
     def test__get_cached_dataset(self, ):
         openml.config.set_cache_directory(self.static_cache_dir)
         dataset = _get_cached_dataset(2)
+        features = _get_cached_dataset_features(2)
         self.assertIsInstance(dataset, OpenMLDataset)
+        self.assertTrue(len(dataset.features) > 0)
+        self.assertTrue(len(dataset.features) == len(features))
 
     def test_get_chached_dataset_description(self):
         openml.config.set_cache_directory(self.static_cache_dir)
@@ -148,6 +152,11 @@ def test_get_dataset(self):
         self.assertTrue(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
 
+    def test_get_dataset_with_string(self):
+        dataset = openml.datasets.get_dataset(373)
+        self.assertRaises(PyOpenMLError, dataset._get_arff, 'arff')
+        self.assertRaises(PyOpenMLError, dataset.get_data)
+
     def test_get_dataset_sparse(self):
         dataset = openml.datasets.get_dataset(1571)
         X = dataset.get_data()
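
The new test pins the user-facing behaviour: dataset 373 contains string features, so both the low-level ARFF loader and get_data must refuse it. A usage sketch of the same path (assumes a reachable OpenML server and that dataset 373 still carries string features):

    import openml
    from openml.exceptions import PyOpenMLError

    dataset = openml.datasets.get_dataset(373)  # downloads/caches description, ARFF and features
    try:
        X = dataset.get_data()
    except PyOpenMLError as e:
        # 'Dataset not compatible, PyOpenML cannot handle string features'
        print(e)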
