Skip to content

Commit e6835a7

Browse files
committed
Fix #129
1 parent f4711f0 commit e6835a7

4 files changed

Lines changed: 89 additions & 50 deletions

File tree

openml/datasets/dataset.py

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import scipy.sparse
1111
import xmltodict
1212

13+
from ..exceptions import PyOpenMLError
14+
1315
if sys.version_info[0] >= 3:
1416
import pickle
1517
else:
@@ -45,7 +47,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
4547
row_id_attribute=None, ignore_attribute=None,
4648
version_label=None, citation=None, tag=None, visibility=None,
4749
original_data_url=None, paper_url=None, update_comment=None,
48-
md5_checksum=None, data_file=None):
50+
md5_checksum=None, data_file=None, features=None):
4951
# Attributes received by querying the RESTful API
5052
self.dataset_id = int(dataset_id) if dataset_id is not None else None
5153
self.name = name
@@ -71,38 +73,41 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7173
self.update_comment = update_comment
7274
self.md5_cheksum = md5_checksum
7375
self.data_file = data_file
76+
self.features = features
77+
7478
if data_file is not None:
75-
self.data_pickle_file = data_file.replace('.arff', '.pkl')
79+
if(self._data_contains_string_features() == False):
80+
self.data_pickle_file = data_file.replace('.arff', '.pkl')
7681

77-
if os.path.exists(self.data_pickle_file):
78-
logger.debug("Data pickle file already exists.")
79-
else:
80-
try:
81-
data = self._get_arff(self.format)
82-
except OSError as e:
83-
logger.critical("Please check that the data file %s is there "
84-
"and can be read.", self.data_file)
85-
raise e
86-
87-
categorical = [False if type(type_) != list else True
88-
for name, type_ in data['attributes']]
89-
attribute_names = [name for name, type_ in data['attributes']]
90-
91-
if isinstance(data['data'], tuple):
92-
X = data['data']
93-
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
94-
X = scipy.sparse.coo_matrix(
95-
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
96-
X = X.tocsr()
97-
elif isinstance(data['data'], list):
98-
X = np.array(data['data'], dtype=np.float32)
82+
if os.path.exists(self.data_pickle_file):
83+
logger.debug("Data pickle file already exists.")
9984
else:
100-
raise Exception()
101-
102-
with open(self.data_pickle_file, "wb") as fh:
103-
pickle.dump((X, categorical, attribute_names), fh, -1)
104-
logger.debug("Saved dataset %d: %s to file %s" %
105-
(self.dataset_id, self.name, self.data_pickle_file))
85+
try:
86+
data = self._get_arff(self.format)
87+
except OSError as e:
88+
logger.critical("Please check that the data file %s is there "
89+
"and can be read.", self.data_file)
90+
raise e
91+
92+
categorical = [False if type(type_) != list else True
93+
for name, type_ in data['attributes']]
94+
attribute_names = [name for name, type_ in data['attributes']]
95+
96+
if isinstance(data['data'], tuple):
97+
X = data['data']
98+
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
99+
X = scipy.sparse.coo_matrix(
100+
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
101+
X = X.tocsr()
102+
elif isinstance(data['data'], list):
103+
X = np.array(data['data'], dtype=np.float32)
104+
else:
105+
raise Exception()
106+
107+
with open(self.data_pickle_file, "wb") as fh:
108+
pickle.dump((X, categorical, attribute_names), fh, -1)
109+
logger.debug("Saved dataset %d: %s to file %s" %
110+
(self.dataset_id, self.name, self.data_pickle_file))
106111

107112
def __eq__(self, other):
108113
if type(other) != OpenMLDataset:
@@ -132,6 +137,9 @@ def _get_arff(self, format):
132137
# 32 bit system...currently 120mb (just a little bit more than covtype)
133138
import struct
134139

140+
if (self._data_contains_string_features()):
141+
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')
142+
135143
filename = self.data_file
136144
bits = (8 * struct.calcsize("P"))
137145
if bits != 64 and os.path.getsize(filename) > 120000000:
@@ -172,6 +180,9 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
172180
"""
173181
rval = []
174182

183+
if (self._data_contains_string_features()):
184+
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')
185+
175186
path = self.data_pickle_file
176187
if not os.path.exists(path):
177188
raise ValueError("Cannot find a ndarray file for dataset %s at"
@@ -336,3 +347,11 @@ def _to_xml(self):
336347
xml_dataset += "<oml:{0}>{1}</oml:{0}>\n".format(prop, content)
337348
xml_dataset += "</oml:data_set_description>"
338349
return xml_dataset
350+
351+
def _data_contains_string_features(self):
352+
if (self.features is not None):
353+
for feature in self.features['oml:feature']:
354+
if (feature['oml:data_type'] == 'string'):
355+
return True
356+
return False
357+
return False

openml/datasets/functions.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from collections import OrderedDict
66
import xmltodict
77
from .dataset import OpenMLDataset
8-
from ..exceptions import OpenMLCacheException, PyOpenMLError
8+
from ..exceptions import OpenMLCacheException
99
from .. import config
1010
from .._api_calls import _perform_api_call, _read_url
1111

@@ -72,7 +72,8 @@ def _get_cached_dataset(dataset_id):
7272
"""
7373
description = _get_cached_dataset_description(dataset_id)
7474
arff_file = _get_cached_dataset_arff(dataset_id)
75-
dataset = _create_dataset_from_description(description, arff_file)
75+
features = _get_cached_dataset_features(dataset_id)
76+
dataset = _create_dataset_from_description(description, features, arff_file)
7677

7778
return dataset
7879

@@ -93,6 +94,22 @@ def _get_cached_dataset_description(dataset_id):
9394
raise OpenMLCacheException("Dataset description for dataset id %d not "
9495
"cached" % dataset_id)
9596

97+
def _get_cached_dataset_features(dataset_id):
    """Return the parsed ``features.xml`` of a dataset from the local cache.

    Both the regular cache directory and the private directory are
    searched, in that order; the first readable file wins.

    Parameters
    ----------
    dataset_id : int
        OpenML id of the dataset.

    Returns
    -------
    dict
        The ``oml:data_features`` subtree parsed by xmltodict.

    Raises
    ------
    OpenMLCacheException
        If no cached features file exists in either directory.
    """
    candidate_dirs = (config.get_cache_directory(),
                      config.get_private_directory())
    for base_dir in candidate_dirs:
        xml_path = os.path.join(base_dir, "datasets", str(dataset_id),
                                "features.xml")
        try:
            with io.open(xml_path, encoding='utf8') as xml_handle:
                contents = xml_handle.read()
        except (IOError, OSError):
            # Not cached here; try the next directory.
            continue
        return xmltodict.parse(contents)["oml:data_features"]

    raise OpenMLCacheException("Dataset features for dataset id %d not "
                               "cached" % dataset_id)
112+
96113

97114
def _get_cached_dataset_arff(dataset_id):
98115
for cache_dir in [config.get_cache_directory(),
@@ -262,13 +279,7 @@ def get_dataset(dataset_id):
262279
_remove_dataset_cache_dir(did_cache_dir)
263280
raise e
264281

265-
for feature in features['oml:feature']:
266-
if (feature['oml:data_type'] == 'string'):
267-
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features: index ' +
268-
feature['oml:index'] + ', attribute name ' + feature['oml:name'])
269-
270-
271-
dataset = _create_dataset_from_description(description, arff_file)
282+
dataset = _create_dataset_from_description(description, features, arff_file)
272283
return dataset
273284

274285

@@ -469,7 +480,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
469480
'Please do this manually!' % did_cache_dir)
470481

471482

472-
def _create_dataset_from_description(description, arff_file):
483+
def _create_dataset_from_description(description, features, arff_file):
473484
"""Create a dataset object from a description dict.
474485
475486
Parameters
@@ -508,5 +519,6 @@ def _create_dataset_from_description(description, arff_file):
508519
description.get("oml:paper_url"),
509520
description.get("oml:update_comment"),
510521
description.get("oml:md5_checksum"),
511-
data_file=arff_file)
522+
data_file=arff_file,
523+
features=features)
512524
return dataset

tests/datasets/test_datasets.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,9 @@ def test_get_dataset(self):
149149
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
150150

151151
def test_get_dataset_with_string(self):
    # Dataset 373 contains string attributes; downloading its description
    # must succeed, but materializing the data must raise PyOpenMLError.
    string_dataset = openml.datasets.get_dataset(373)
    self.assertRaises(PyOpenMLError, string_dataset._get_arff, 'arff')
    self.assertRaises(PyOpenMLError, string_dataset.get_data)
153155

154156
def test_get_dataset_sparse(self):
155157
dataset = openml.datasets.get_dataset(1571)

tests/files/datasets/2/description.xml

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
<oml:data_set_description xmlns:oml="http://openml.org/openml">
2-
<oml:id>2</oml:id>
3-
<oml:name>anneal</oml:name>
4-
<oml:version>1</oml:version>
5-
<oml:description>1. Title of Database: Annealing Data
2+
<oml:id>2</oml:id>
3+
<oml:name>anneal</oml:name>
4+
<oml:version>1</oml:version>
5+
<oml:description>**Author**:
6+
**Source**: Unknown -
7+
**Please cite**:
8+
9+
1. Title of Database: Annealing Data
610

711
2. Source Information: donated by David Sterling and Wray Buntine.
812

@@ -117,8 +121,10 @@
117121
U 34
118122
---
119123
798</oml:description>
120-
<oml:format>ARFF</oml:format>
121-
<oml:upload_date>2014-04-06 23:19:24</oml:upload_date>
122-
<oml:licence>Public</oml:licence> <oml:url>http://openml.liacs.nl/files/download/2/dataset_2_anneal.ORIG.arff</oml:url>
123-
<oml:default_target_attribute>class</oml:default_target_attribute> <oml:md5_checksum>939966a711925e333bf4aaadeaa71135</oml:md5_checksum>
124+
<oml:format>ARFF</oml:format>
125+
<oml:upload_date>2014-04-06T23:19:24</oml:upload_date>
126+
<oml:licence>Public</oml:licence> <oml:url>http://www.openml.org/data/download/1666876/phpFsFYVN</oml:url>
127+
128+
<oml:file_id>1666876</oml:file_id> <oml:default_target_attribute>class</oml:default_target_attribute> <oml:version_label>1</oml:version_label> <oml:tag>hallo</oml:tag><oml:tag>study_1</oml:tag><oml:tag>uci</oml:tag><oml:tag>welt</oml:tag> <oml:visibility>public</oml:visibility> <oml:status>active</oml:status>
129+
<oml:md5_checksum>4eaed8b6ec9d8211024b6c089b064761</oml:md5_checksum>
124130
</oml:data_set_description>

0 commit comments

Comments
 (0)