Skip to content

Commit 81b7e84

Browse files
author
janvanrijn
committed
dataset and task pagination, issue #142
1 parent c20271f commit 81b7e84

6 files changed

Lines changed: 165 additions & 4 deletions

File tree

openml/datasets/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from .functions import (list_datasets, list_datasets_by_tag,
2-
check_datasets_active, get_datasets, get_dataset)
2+
check_datasets_active, get_datasets, get_dataset,
3+
list_datasets_paginate)
34
from .dataset import OpenMLDataset
45

56
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
67
'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
7-
'list_datasets']
8+
'list_datasets_paginate']

openml/datasets/functions.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,42 @@ def list_datasets():
130130
return _list_datasets("data/list")
131131

132132

133+
def list_datasets_paginate(offset, size):
    """Return one page of the datasets which are on OpenML.

    Parameters
    ----------
    offset : int
        The number of datasets to skip, starting from the first.
    size : int
        The maximum number of datasets to return.

    Returns
    -------
    datasets : list of dicts
        A partial list of datasets.

        Every dataset is represented by a dictionary containing
        the following information:
        - dataset id
        - status

        If qualities are calculated for the dataset, some of
        these are also returned.

    Raises
    ------
    ValueError
        If ``offset`` or ``size`` is not an integer and cannot be cast
        to one.
    """
    try:
        offset = int(offset)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Offset is neither an Integer nor can be "
                         "cast to an Integer.")
    try:
        # Rebind ``size`` itself: it is formatted with ``%d`` below, which
        # rejects values such as the string "10" unless they were cast.
        size = int(size)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Size is neither an Integer nor can be "
                         "cast to an Integer.")
    return _list_datasets("data/list/offset/%d/limit/%d" % (offset, size))
167+
168+
133169
def list_datasets_by_tag(tag):
134170
"""Return all datasets having the given tag.
135171

openml/tasks/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from .task import OpenMLTask
22
from .split import OpenMLSplit
33
from .functions import (get_task, list_tasks, list_tasks_by_type,
4-
list_tasks_by_tag)
4+
list_tasks_by_tag, list_tasks_paginate,
5+
list_tasks_by_type_paginate)
56

67
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'list_tasks_by_type',
7-
'list_tasks_by_tag', 'OpenMLSplit']
8+
'list_tasks_by_tag', 'list_tasks_paginate', 'OpenMLSplit',
9+
'list_tasks_by_type_paginate']

openml/tasks/functions.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,76 @@ def list_tasks_by_type(task_type_id):
114114
return _list_tasks("task/list/type/%d" % task_type_id)
115115

116116

117+
def list_tasks_paginate(offset, size):
    """Return a partial list (of given size) of tasks, starting with offset.

    Parameters
    ----------
    offset : int
        The number of tasks to skip, starting from the first.
    size : int
        The maximum number of tasks to return.

    Returns
    -------
    list
        A partial list of tasks. Every task is represented by a
        dictionary containing the following information: task id,
        dataset id, task_type and status. If qualities are calculated for
        the associated dataset, some of these are also returned.

    Raises
    ------
    ValueError
        If ``offset`` or ``size`` is not an integer and cannot be cast
        to one.
    """
    try:
        offset = int(offset)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Offset is neither an Integer nor can be "
                         "cast to an Integer.")
    try:
        size = int(size)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Size is neither an Integer nor can be "
                         "cast to an Integer.")
    return _list_tasks("task/list/offset/%d/limit/%d" % (offset, size))
146+
147+
148+
def list_tasks_by_type_paginate(task_type_id, offset, size):
    """Return a partial list (of given size) of tasks for a given task type,
    starting with offset.

    Parameters
    ----------
    task_type_id : int
        ID of the task type as detailed
        `here <http://www.openml.org/search?type=task_type>`_.
    offset : int
        The number of tasks to skip, starting from the first.
    size : int
        The maximum number of tasks to return.

    Returns
    -------
    list
        A partial list of tasks of the task type. Every task is
        represented by a dictionary containing the following information:
        task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also
        returned.

    Raises
    ------
    ValueError
        If ``task_type_id``, ``offset`` or ``size`` is not an integer and
        cannot be cast to one.
    """
    try:
        task_type_id = int(task_type_id)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Task Type ID is neither an Integer nor can be "
                         "cast to an Integer.")
    try:
        offset = int(offset)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Offset is neither an Integer nor can be "
                         "cast to an Integer.")
    try:
        size = int(size)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Size is neither an Integer nor can be "
                         "cast to an Integer.")
    return _list_tasks("task/list/type/%d/offset/%d/limit/%d"
                       % (task_type_id, offset, size))
185+
186+
117187
def list_tasks_by_tag(tag):
118188
"""Return all tasks having the given tag
119189

tests/datasets/test_datasets.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,22 @@ def test_list_datasets_by_tag(self):
100100
self.assertIn(dataset['status'], ['in_preparation', 'active',
101101
'deactivated'])
102102

103+
def test_list_datasets_paginate(self):
    """Request datasets page by page and validate every returned entry.

    Offsets 0, 10, ..., 90 are requested with a page size of 10.
    """
    page_size = 10
    # Exclusive upper bound on the offsets; named so that it does not
    # shadow the builtin ``max``.
    max_offset = 100
    valid_statuses = ['in_preparation', 'active', 'deactivated']
    for offset in range(0, max_offset, page_size):
        datasets = openml.datasets.list_datasets_paginate(offset, page_size)
        # A page may be shorter than requested, but never longer.
        self.assertGreaterEqual(page_size, len(datasets))
        for dataset in datasets:
            self.assertIsInstance(dataset, dict)
            self.assertGreaterEqual(len(dataset), 2)
            self.assertIn('did', dataset)
            self.assertIsInstance(dataset['did'], int)
            self.assertIn('status', dataset)
            self.assertTrue(is_string(dataset['status']))
            self.assertIn(dataset['status'], valid_statuses)
118+
103119
@unittest.skip('See https://github.com/openml/openml-python/issues/149')
104120
def test_check_datasets_active(self):
105121
active = openml.datasets.check_datasets_active([1, 17])

tests/tasks/test_task_functions.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,42 @@ def test_list_tasks(self):
6868
for task in tasks:
6969
self._check_task(task)
7070

71+
def test_list_tasks_paginate(self):
    """Request tasks page by page and validate every returned entry.

    Offsets 0, 10, ..., 90 are requested with a page size of 10.
    """
    page_size = 10
    # Exclusive upper bound on the offsets; named so that it does not
    # shadow the builtin ``max``.
    max_offset = 100
    valid_statuses = ['in_preparation', 'active', 'deactivated']
    for offset in range(0, max_offset, page_size):
        tasks = openml.tasks.list_tasks_paginate(offset, page_size)
        # A page may be shorter than requested, but never longer.
        self.assertGreaterEqual(page_size, len(tasks))
        for task in tasks:
            self.assertIsInstance(task, dict)
            self.assertGreaterEqual(len(task), 4)
            self.assertIn('tid', task)
            self.assertIsInstance(task['tid'], int)
            self.assertIn('did', task)
            self.assertIsInstance(task['did'], int)
            self.assertIn('status', task)
            self.assertTrue(is_string(task['status']))
            self.assertIn(task['status'], valid_statuses)
87+
88+
def test_list_tasks_per_type_paginate(self):
    """Request tasks of several task types page by page and validate them.

    For each task type id 1 through 4, offsets 0, 10, ..., 90 are
    requested with a page size of 10.
    """
    page_size = 10
    # Exclusive upper bounds; ``max_offset`` is named so that it does not
    # shadow the builtin ``max``.
    max_offset = 100
    task_type_id_stop = 5
    valid_statuses = ['in_preparation', 'active', 'deactivated']
    for task_type_id in range(1, task_type_id_stop):
        for offset in range(0, max_offset, page_size):
            tasks = openml.tasks.list_tasks_by_type_paginate(
                task_type_id, offset, page_size)
            # A page may be shorter than requested, but never longer.
            self.assertGreaterEqual(page_size, len(tasks))
            for task in tasks:
                self.assertIsInstance(task, dict)
                self.assertGreaterEqual(len(task), 4)
                self.assertIn('tid', task)
                self.assertIsInstance(task['tid'], int)
                self.assertIn('did', task)
                self.assertIsInstance(task['did'], int)
                self.assertIn('status', task)
                self.assertTrue(is_string(task['status']))
                self.assertIn(task['status'], valid_statuses)
106+
71107
def test__get_task(self):
72108
openml.config.set_cache_directory(self.static_cache_dir)
73109
task = openml.tasks.get_task(1882)

0 commit comments

Comments
 (0)