Skip to content

Commit 53e8331

Browse files
authored
Merge pull request #160 from openml/paginationsupport
dataset and task pagination, issue #142
2 parents 1e89d47 + a8c2ded commit 53e8331

6 files changed

Lines changed: 105 additions & 69 deletions

File tree

openml/datasets/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from .functions import (list_datasets, list_datasets_by_tag,
2-
check_datasets_active, get_datasets, get_dataset)
1+
from .functions import (list_datasets, check_datasets_active,
2+
get_datasets, get_dataset)
33
from .dataset import OpenMLDataset
44

55
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
6-
'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
7-
'list_datasets']
6+
'OpenMLDataset', 'list_datasets']

openml/datasets/functions.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def _list_cached_datasets():
4242
dataset_directory_content = os.listdir(directory_name)
4343

4444
if "dataset.arff" in dataset_directory_content and \
45-
"description.xml" in dataset_directory_content:
45+
"description.xml" in dataset_directory_content:
4646
if dataset_id not in datasets:
4747
datasets.append(dataset_id)
4848

@@ -111,13 +111,22 @@ def _get_cached_dataset_arff(dataset_id):
111111
"cached" % dataset_id)
112112

113113

114-
def list_datasets():
114+
def list_datasets(offset=None, size=None, tag=None):
115115
"""Return a list of all datasets which are on OpenML.
116116
117+
Parameters
118+
----------
119+
offset : int, optional
120+
the number of datasets to skip, starting from the first
121+
size : int, optional
122+
the maximum number of datasets to show
123+
tag : str, optional
124+
the tag to include
125+
117126
Returns
118127
-------
119128
datasets : list of dicts
120-
A list of all datasets.
129+
A list of datasets having the given tag (if applicable).
121130
122131
Every dataset is represented by a dictionary containing
123132
the following information:
@@ -127,22 +136,17 @@ def list_datasets():
127136
If qualities are calculated for the dataset, some of
128137
these are also returned.
129138
"""
130-
return _list_datasets("data/list")
131-
139+
api_call = "data/list"
140+
if offset is not None:
141+
api_call += "/offset/%d" % int(offset)
132142

133-
def list_datasets_by_tag(tag):
134-
"""Return all datasets having the given tag.
143+
if size is not None:
144+
api_call += "/limit/%d" % int(size)
135145

136-
Returns
137-
-------
138-
datasets : list of dicts
139-
A list of all datasets having the given tag. Every dataset is
140-
represented by a dictionary containing the following information:
141-
dataset id, and status. If qualities are calculated for the dataset,
142-
some of these are also returned.
146+
if tag is not None:
147+
api_call += "/tag/%s" % tag
143148

144-
"""
145-
return _list_datasets("data/list/%s" % tag)
149+
return _list_datasets(api_call)
146150

147151

148152
def _list_datasets(api_call):
@@ -154,7 +158,7 @@ def _list_datasets(api_call):
154158
assert type(datasets_dict['oml:data']['oml:dataset']) == list, \
155159
type(datasets_dict['oml:data'])
156160
assert datasets_dict['oml:data']['@xmlns:oml'] == \
157-
'http://openml.org/openml'
161+
'http://openml.org/openml'
158162

159163
datasets = []
160164
for dataset_ in datasets_dict['oml:data']['oml:dataset']:

openml/tasks/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from .task import OpenMLTask
22
from .split import OpenMLSplit
3-
from .functions import (get_task, list_tasks, list_tasks_by_type,
4-
list_tasks_by_tag)
3+
from .functions import (get_task, list_tasks)
54

6-
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'list_tasks_by_type',
7-
'list_tasks_by_tag', 'OpenMLSplit']
5+
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'OpenMLSplit']

openml/tasks/functions.py

Lines changed: 24 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -89,61 +89,44 @@ def _get_estimation_procedure_list():
8989
return procs
9090

9191

92-
def list_tasks_by_type(task_type_id):
93-
"""Return a list of all tasks for a given tasks type which are on OpenML.
92+
def list_tasks(task_type_id=None, offset=None, size=None, tag=None):
93+
"""Return a number of tasks having the given tag and task_type_id
9494
9595
Parameters
9696
----------
97-
task_type_id : int
97+
task_type_id : int, optional
9898
ID of the task type as detailed
9999
`here <http://www.openml.org/search?type=task_type>`_.
100+
offset : int, optional
101+
the number of tasks to skip, starting from the first
102+
size : int, optional
103+
the maximum number of tasks to show
104+
tag : str, optional
105+
the tag to include
100106
101107
Returns
102108
-------
103109
list
104-
A list of all tasks of the given task type. Every task is represented by
105-
a dictionary containing the following information: task id,
106-
dataset id, task_type and status. If qualities are calculated for
107-
the associated dataset, some of these are also returned.
110+
A list of all tasks having the given task_type_id and the given tag.
111+
Every task is represented by a dictionary containing the following
112+
information: task id, dataset id, task_type and status. If qualities
113+
are calculated for the associated dataset, some of these are also
114+
returned.
108115
"""
109-
try:
110-
task_type_id = int(task_type_id)
111-
except:
112-
raise ValueError("Task Type ID is neither an Integer nor can be "
113-
"cast to an Integer.")
114-
return _list_tasks("task/list/type/%d" % task_type_id)
115-
116-
117-
def list_tasks_by_tag(tag):
118-
"""Return all tasks having the given tag
119-
120-
Parameters
121-
----------
122-
tag : str
116+
api_call = "task/list"
117+
if task_type_id is not None:
118+
api_call += "/task_type_id/%d" % int(task_type_id)
123119

124-
Returns
125-
-------
126-
list
127-
A list of all tasks having a give tag. Every task is represented by
128-
a dictionary containing the following information: task id,
129-
dataset id, task_type and status. If qualities are calculated for
130-
the associated dataset, some of these are also returned.
131-
"""
132-
return _list_tasks("task/list/tag/%s" % tag)
120+
if offset is not None:
121+
api_call += "/offset/%d" % int(offset)
133122

123+
if size is not None:
124+
api_call += "/limit/%d" % int(size)
134125

135-
def list_tasks():
136-
"""Return a list of all tasks which are on OpenML.
126+
if tag is not None:
127+
api_call += "/tag/%s" % tag
137128

138-
Returns
139-
-------
140-
list
141-
A list of all tasks. Every task is represented by a
142-
dictionary containing the following information: task id,
143-
dataset id, task_type and status. If qualities are calculated for
144-
the associated dataset, some of these are also returned.
145-
"""
146-
return _list_tasks('task/list')
129+
return _list_tasks(api_call)
147130

148131

149132
def _list_tasks(api_call):

tests/datasets/test_datasets.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def test_list_datasets(self):
8888
'deactivated'])
8989

9090
def test_list_datasets_by_tag(self):
91-
datasets = openml.datasets.list_datasets_by_tag('uci')
91+
datasets = openml.datasets.list_datasets(tag='uci')
9292
self.assertGreaterEqual(len(datasets), 5)
9393
for dataset in datasets:
9494
self.assertEqual(type(dataset), dict)
@@ -100,6 +100,22 @@ def test_list_datasets_by_tag(self):
100100
self.assertIn(dataset['status'], ['in_preparation', 'active',
101101
'deactivated'])
102102

103+
def test_list_datasets_paginate(self):
104+
size = 10
105+
max = 100
106+
for i in range(0, max, size):
107+
data = openml.datasets.list_datasets(offset=i, size=size)
108+
self.assertGreaterEqual(size, len(data))
109+
for dataset in data:
110+
self.assertEqual(type(dataset), dict)
111+
self.assertGreaterEqual(len(dataset), 2)
112+
self.assertIn('did', dataset)
113+
self.assertIsInstance(dataset['did'], int)
114+
self.assertIn('status', dataset)
115+
self.assertTrue(is_string(dataset['status']))
116+
self.assertIn(dataset['status'], ['in_preparation',
117+
'active', 'deactivated'])
118+
103119
@unittest.skip('See https://github.com/openml/openml-python/issues/149')
104120
def test_check_datasets_active(self):
105121
active = openml.datasets.check_datasets_active([1, 17])

tests/tasks/test_task_functions.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,13 @@ def _check_task(self, task):
5151
['in_preparation', 'active', 'deactivated'])
5252

5353
def test_list_tasks_by_type(self):
54-
tasks = openml.tasks.list_tasks_by_type(task_type_id=3)
54+
tasks = openml.tasks.list_tasks(task_type_id=3)
5555
self.assertGreaterEqual(len(tasks), 300)
5656
for task in tasks:
5757
self._check_task(task)
5858

5959
def test_list_tasks_by_tag(self):
60-
tasks = openml.tasks.list_tasks_by_tag('basic')
60+
tasks = openml.tasks.list_tasks(tag='basic')
6161
self.assertGreaterEqual(len(tasks), 57)
6262
for task in tasks:
6363
self._check_task(task)
@@ -68,6 +68,42 @@ def test_list_tasks(self):
6868
for task in tasks:
6969
self._check_task(task)
7070

71+
def test_list_tasks_paginate(self):
72+
size = 10
73+
max = 100
74+
for i in range(0, max, size):
75+
tasks = openml.tasks.list_tasks(offset=i, size=size)
76+
self.assertGreaterEqual(size, len(tasks))
77+
for task in tasks:
78+
self.assertEqual(type(task), dict)
79+
self.assertGreaterEqual(len(task), 4)
80+
self.assertIn('tid', task)
81+
self.assertIsInstance(task['tid'], int)
82+
self.assertIn('did', task)
83+
self.assertIsInstance(task['did'], int)
84+
self.assertIn('status', task)
85+
self.assertTrue(is_string(task['status']))
86+
self.assertIn(task['status'], ['in_preparation', 'active', 'deactivated'])
87+
88+
def test_list_tasks_per_type_paginate(self):
89+
size = 10
90+
max = 100
91+
task_types = 5
92+
for j in range(1,task_types):
93+
for i in range(0, max, size):
94+
tasks = openml.tasks.list_tasks(task_type_id=j, offset=i, size=size)
95+
self.assertGreaterEqual(size, len(tasks))
96+
for task in tasks:
97+
self.assertEqual(type(task), dict)
98+
self.assertGreaterEqual(len(task), 4)
99+
self.assertIn('tid', task)
100+
self.assertIsInstance(task['tid'], int)
101+
self.assertIn('did', task)
102+
self.assertIsInstance(task['did'], int)
103+
self.assertIn('status', task)
104+
self.assertTrue(is_string(task['status']))
105+
self.assertIn(task['status'], ['in_preparation', 'active', 'deactivated'])
106+
71107
def test__get_task(self):
72108
openml.config.set_cache_directory(self.static_cache_dir)
73109
task = openml.tasks.get_task(1882)

0 commit comments

Comments
 (0)