Skip to content

Commit 3e99d99

Browse files
ArlindKadra authored and mfeurer committed
Feature #369 (#424)
* Implementing dataset listing with more filters and adding unit tests
* Add to function documentation
1 parent deb769e commit 3e99d99

2 files changed

Lines changed: 40 additions & 13 deletions

File tree

openml/datasets/functions.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -137,7 +137,7 @@ def _get_cached_dataset_arff(dataset_id):
137137
"cached" % dataset_id)
138138

139139

140-
def list_datasets(offset=None, size=None, tag=None, status=None):
140+
def list_datasets(offset=None, size=None, status=None, **kwargs):
141141
"""Return a list of all dataset which are on OpenML.
142142
143143
Parameters
@@ -146,12 +146,13 @@ def list_datasets(offset=None, size=None, tag=None, status=None):
146146
The number of datasets to skip, starting from the first.
147147
size : int, optional
148148
The maximum number of datasets to show.
149-
tag : str, optional
150-
Only include datasets matching this tag.
151149
status : str, optional
152150
Should be {active, in_preparation, deactivated}. By
153151
default active datasets are returned, but also datasets
154-
from another status can be requested.
152+
from another status can be requested.
153+
kwargs : dict, optional
154+
Legal filter operators (keys in the dict):
155+
{tag, status, limit, offset, data_name, data_version, number_instances, number_features, number_classes, number_missing_values}.
155156
156157
Returns
157158
-------
@@ -175,12 +176,13 @@ def list_datasets(offset=None, size=None, tag=None, status=None):
175176
if size is not None:
176177
api_call += "/limit/%d" % int(size)
177178

178-
if tag is not None:
179-
api_call += "/tag/%s" % tag
180-
181179
if status is not None:
182180
api_call += "/status/%s" %status
183181

182+
if kwargs is not None:
183+
for filter, value in kwargs.items():
184+
api_call += "/%s/%s" % (filter, value)
185+
184186
return _list_datasets(api_call)
185187

186188

tests/test_datasets/test_dataset_functions.py

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,9 @@ def _check_dataset(self, dataset):
115115
self.assertIsInstance(dataset['status'], six.string_types)
116116
self.assertIn(dataset['status'], ['in_preparation', 'active',
117117
'deactivated'])
118+
def _check_datasets(self, datasets):
119+
for did in datasets:
120+
self._check_dataset(datasets[did])
118121

119122
def test_tag_untag_dataset(self):
120123
tag = 'test_tag_%d' %random.randint(1, 1000000)
@@ -129,23 +132,45 @@ def test_list_datasets(self):
129132
datasets = openml.datasets.list_datasets()
130133
# 1087 as the number of datasets on openml.org
131134
self.assertGreaterEqual(len(datasets), 100)
132-
for did in datasets:
133-
self._check_dataset(datasets[did])
135+
self._check_datasets(datasets)
134136

135137
def test_list_datasets_by_tag(self):
136138
datasets = openml.datasets.list_datasets(tag='study_14')
137139
self.assertGreaterEqual(len(datasets), 100)
138-
for did in datasets:
139-
self._check_dataset(datasets[did])
140+
self._check_datasets(datasets)
141+
142+
def test_list_datasets_by_number_instances(self):
143+
datasets = openml.datasets.list_datasets(number_instances="5..100")
144+
self.assertGreaterEqual(len(datasets), 4)
145+
self._check_datasets(datasets)
146+
147+
def test_list_datasets_by_number_features(self):
148+
datasets = openml.datasets.list_datasets(number_features="50..100")
149+
self.assertGreaterEqual(len(datasets), 8)
150+
self._check_datasets(datasets)
151+
152+
def test_list_datasets_by_number_classes(self):
153+
datasets = openml.datasets.list_datasets(number_classes="5")
154+
self.assertGreaterEqual(len(datasets), 3)
155+
self._check_datasets(datasets)
156+
157+
def test_list_datasets_by_number_missing_values(self):
158+
datasets = openml.datasets.list_datasets(number_missing_values="5..100")
159+
self.assertGreaterEqual(len(datasets), 5)
160+
self._check_datasets(datasets)
161+
162+
def test_list_datasets_combined_filters(self):
163+
datasets = openml.datasets.list_datasets(tag='study_14', number_instances="100..1000", number_missing_values="800..1000")
164+
self.assertGreaterEqual(len(datasets), 1)
165+
self._check_datasets(datasets)
140166

141167
def test_list_datasets_paginate(self):
142168
size = 10
143169
max = 100
144170
for i in range(0, max, size):
145171
datasets = openml.datasets.list_datasets(offset=i, size=size)
146172
self.assertGreaterEqual(size, len(datasets))
147-
for did in datasets:
148-
self._check_dataset(datasets[did])
173+
self._check_datasets(datasets)
149174

150175
def test_list_datasets_empty(self):
151176
datasets = openml.datasets.list_datasets(tag='NoOneWouldUseThisTagAnyway')

0 commit comments

Comments
 (0)