Fix issue 373: PyOpenML throws an error when a dataset contains string values

janvanrijn · janvanrijn · commit f4711f0d4ad0 · 2016-09-19T10:46:31.000+02:00
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -5,7 +5,7 @@
 from collections import OrderedDict
 import xmltodict
 from .dataset import OpenMLDataset
-from ..exceptions import OpenMLCacheException
+from ..exceptions import OpenMLCacheException, PyOpenMLError
 from .. import config
 from .._api_calls import _perform_api_call, _read_url
 
@@ -255,13 +255,19 @@ def get_dataset(dataset_id):
     try:
         description = _get_dataset_description(did_cache_dir, dataset_id)
         arff_file = _get_dataset_arff(did_cache_dir, description)
-        # TODO not used yet, figure out what to do with them...
         features = _get_dataset_features(did_cache_dir, dataset_id)
+        # TODO not used yet, figure out what to do with this...
         qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
     except Exception as e:
         _remove_dataset_cache_dir(did_cache_dir)
         raise e
 
+    for feature in features['oml:feature']:
+        if (feature['oml:data_type'] == 'string'):
+            raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features: index ' +
+                                feature['oml:index'] + ', attribute name ' + feature['oml:name'])
+
+
     dataset = _create_dataset_from_description(description, arff_file)
     return dataset
 
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
@@ -11,7 +11,7 @@
 
 import openml
 from openml import OpenMLDataset
-from openml.exceptions import OpenMLCacheException
+from openml.exceptions import OpenMLCacheException, PyOpenMLError
 from openml.util import is_string
 from openml.testing import TestBase
 
@@ -20,7 +20,7 @@
                                        _get_dataset_description,
                                        _get_dataset_arff,
                                        _get_dataset_features,
-                                       _get_dataset_qualities)
+                                       _get_dataset_qualities, get_dataset)
 
 
 class TestOpenMLDataset(TestBase):
@@ -148,6 +148,9 @@ def test_get_dataset(self):
         self.assertTrue(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
 
+    def test_get_dataset_with_string(self):
+        self.assertRaises(PyOpenMLError, get_dataset, '373')
+
     def test_get_dataset_sparse(self):
         dataset = openml.datasets.get_dataset(1571)
         X = dataset.get_data()
diff --git a/tests/tasks/test_task_functions.py b/tests/tasks/test_task_functions.py
@@ -50,17 +50,19 @@ def _check_task(self, task):
                       ['in_preparation', 'active', 'deactivated'])
 
     def test_list_tasks_by_type(self):
+        num_curves_tasks = 200 # lower bound; check server count on failure
         ttid=3
         tasks = openml.tasks.list_tasks(task_type_id=ttid)
-        self.assertGreaterEqual(len(tasks), 300)
+        self.assertGreaterEqual(len(tasks), num_curves_tasks)
         for tid in tasks:
             print(tasks[tid])
             self.assertEquals(ttid, tasks[tid]["ttid"])
             self._check_task(tasks[tid])
 
     def test_list_tasks_by_tag(self):
+        num_basic_tasks = 54 # lower bound; check server count on failure
         tasks = openml.tasks.list_tasks(tag='basic')
-        self.assertGreaterEqual(len(tasks), 57)
+        self.assertGreaterEqual(len(tasks), num_basic_tasks)
         for tid in tasks:
             self._check_task(tasks[tid])