Skip to content

Commit 62e3f1b

Browse files
authored
Merge pull request #165 from openml/datasetdict
changed dataset list to dict
2 parents fd37ecc + 7d056bf commit 62e3f1b

2 files changed

Lines changed: 20 additions & 32 deletions

File tree

openml/datasets/functions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,10 @@ def _list_datasets(api_call):
160160
assert datasets_dict['oml:data']['@xmlns:oml'] == \
161161
'http://openml.org/openml'
162162

163-
datasets = []
163+
datasets = dict()
164164
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
165-
dataset = {'did': int(dataset_['oml:did']),
165+
did = int(dataset_['oml:did'])
166+
dataset = {'did': did,
166167
'name': dataset_['oml:name'],
167168
'format': dataset_['oml:format'],
168169
'status': dataset_['oml:status']}
@@ -173,9 +174,7 @@ def _list_datasets(api_call):
173174
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
174175
quality['#text'] = int(quality['#text'])
175176
dataset[quality['@name']] = quality['#text']
176-
177-
datasets.append(dataset)
178-
datasets.sort(key=lambda t: t['did'])
177+
datasets[did] = dataset
179178

180179
return datasets
181180

tests/datasets/test_datasets.py

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,7 @@ def test_get_cached_dataset_arff_not_cached(self):
7171
openml.datasets.functions._get_cached_dataset_arff,
7272
3)
7373

74-
def test_list_datasets(self):
75-
# We can only perform a smoke test here because we test on dynamic
76-
# data from the internet...
77-
datasets = openml.datasets.list_datasets()
78-
# 1087 as the number of datasets on openml.org
79-
self.assertGreaterEqual(len(datasets), 1087)
80-
for dataset in datasets:
74+
def _check_dataset(self, dataset):
8175
self.assertEqual(type(dataset), dict)
8276
self.assertGreaterEqual(len(dataset), 2)
8377
self.assertIn('did', dataset)
@@ -87,34 +81,29 @@ def test_list_datasets(self):
8781
self.assertIn(dataset['status'], ['in_preparation', 'active',
8882
'deactivated'])
8983

84+
def test_list_datasets(self):
85+
# We can only perform a smoke test here because we test on dynamic
86+
# data from the internet...
87+
datasets = openml.datasets.list_datasets()
88+
# 1087 as the number of datasets on openml.org
89+
self.assertGreaterEqual(len(datasets), 1087)
90+
for did in datasets:
91+
self._check_dataset(datasets[did])
92+
9093
def test_list_datasets_by_tag(self):
9194
datasets = openml.datasets.list_datasets(tag='uci')
9295
self.assertGreaterEqual(len(datasets), 5)
93-
for dataset in datasets:
94-
self.assertEqual(type(dataset), dict)
95-
self.assertGreaterEqual(len(dataset), 2)
96-
self.assertIn('did', dataset)
97-
self.assertIsInstance(dataset['did'], int)
98-
self.assertIn('status', dataset)
99-
self.assertTrue(is_string(dataset['status']))
100-
self.assertIn(dataset['status'], ['in_preparation', 'active',
101-
'deactivated'])
96+
for did in datasets:
97+
self._check_dataset(datasets[did])
10298

10399
def test_list_datasets_paginate(self):
104100
size = 10
105101
max = 100
106102
for i in range(0, max, size):
107-
data = openml.datasets.list_datasets(offset=i, size=size)
108-
self.assertGreaterEqual(size, len(data))
109-
for dataset in data:
110-
self.assertEqual(type(dataset), dict)
111-
self.assertGreaterEqual(len(dataset), 2)
112-
self.assertIn('did', dataset)
113-
self.assertIsInstance(dataset['did'], int)
114-
self.assertIn('status', dataset)
115-
self.assertTrue(is_string(dataset['status']))
116-
self.assertIn(dataset['status'], ['in_preparation',
117-
'active', 'deactivated'])
103+
datasets = openml.datasets.list_datasets(offset=i, size=size)
104+
self.assertGreaterEqual(size, len(datasets))
105+
for did in datasets:
106+
self._check_dataset(datasets[did])
118107

119108
@unittest.skip('See https://github.com/openml/openml-python/issues/149')
120109
def test_check_datasets_active(self):

0 commit comments

Comments
 (0)