Skip to content

Commit f4711f0

Browse files
committed
Fix issue 373: PyOpenML now raises an explicit PyOpenMLError when a dataset contains string features
1 parent 4c87829 commit f4711f0

3 files changed

Lines changed: 17 additions & 6 deletions

File tree

openml/datasets/functions.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from collections import OrderedDict
66
import xmltodict
77
from .dataset import OpenMLDataset
8-
from ..exceptions import OpenMLCacheException
8+
from ..exceptions import OpenMLCacheException, PyOpenMLError
99
from .. import config
1010
from .._api_calls import _perform_api_call, _read_url
1111

@@ -255,13 +255,19 @@ def get_dataset(dataset_id):
255255
try:
256256
description = _get_dataset_description(did_cache_dir, dataset_id)
257257
arff_file = _get_dataset_arff(did_cache_dir, description)
258-
# TODO not used yet, figure out what to do with them...
259258
features = _get_dataset_features(did_cache_dir, dataset_id)
259+
# TODO: qualities are not used yet; figure out what to do with them.
260260
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
261261
except Exception as e:
262262
_remove_dataset_cache_dir(did_cache_dir)
263263
raise e
264264

265+
for feature in features['oml:feature']:
266+
if (feature['oml:data_type'] == 'string'):
267+
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features: index ' +
268+
feature['oml:index'] + ', attribute name ' + feature['oml:name'])
269+
270+
265271
dataset = _create_dataset_from_description(description, arff_file)
266272
return dataset
267273

tests/datasets/test_datasets.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import openml
1313
from openml import OpenMLDataset
14-
from openml.exceptions import OpenMLCacheException
14+
from openml.exceptions import OpenMLCacheException, PyOpenMLError
1515
from openml.util import is_string
1616
from openml.testing import TestBase
1717

@@ -20,7 +20,7 @@
2020
_get_dataset_description,
2121
_get_dataset_arff,
2222
_get_dataset_features,
23-
_get_dataset_qualities)
23+
_get_dataset_qualities, get_dataset)
2424

2525

2626
class TestOpenMLDataset(TestBase):
@@ -148,6 +148,9 @@ def test_get_dataset(self):
148148
self.assertTrue(os.path.exists(os.path.join(
149149
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
150150

151+
def test_get_dataset_with_string(self):
152+
self.assertRaises(PyOpenMLError, get_dataset, '373')
153+
151154
def test_get_dataset_sparse(self):
152155
dataset = openml.datasets.get_dataset(1571)
153156
X = dataset.get_data()

tests/tasks/test_task_functions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,19 @@ def _check_task(self, task):
5050
['in_preparation', 'active', 'deactivated'])
5151

5252
def test_list_tasks_by_type(self):
53+
num_curves_tasks = 200 # number is flexible, check server if fails
5354
ttid=3
5455
tasks = openml.tasks.list_tasks(task_type_id=ttid)
55-
self.assertGreaterEqual(len(tasks), 300)
56+
self.assertGreaterEqual(len(tasks), num_curves_tasks)
5657
for tid in tasks:
5758
print(tasks[tid])
5859
self.assertEquals(ttid, tasks[tid]["ttid"])
5960
self._check_task(tasks[tid])
6061

6162
def test_list_tasks_by_tag(self):
63+
num_basic_tasks = 54 # number is flexible, check server if fails
6264
tasks = openml.tasks.list_tasks(tag='basic')
63-
self.assertGreaterEqual(len(tasks), 57)
65+
self.assertGreaterEqual(len(tasks), num_basic_tasks)
6466
for tid in tasks:
6567
self._check_task(tasks[tid])
6668

0 commit comments

Comments
 (0)