Skip to content

Commit 6a1660b

Browse files
committed
Merge branch 'develop' into feature/upload-flow
2 parents beabca0 + a407b75 commit 6a1660b

12 files changed

Lines changed: 182 additions & 166 deletions

File tree

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Changelog
1414

1515
* Add this changelog (Matthias Feurer)
1616
* 2nd example notebook PyOpenML.ipynb (Joaquin Vanschoren)
17+
* Pagination support for list datasets and list tasks
1718

1819
API calls
1920
=========

examples/OpenMLDemo.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@
136136
"source": [
137137
"datasets = openml.datasets.list_datasets()\n",
138138
"\n",
139-
"data = pd.DataFrame(datasets)\n",
139+
"data = pd.DataFrame(datasets).transpose()\n",
140140
"print(\"First 10 of %s datasets...\" % len(datasets))\n",
141141
"print(data[:10][['did','name','NumberOfInstances','NumberOfFeatures']])"
142142
]
@@ -569,7 +569,7 @@
569569
"source": [
570570
"task_list = openml.tasks.list_tasks()\n",
571571
"\n",
572-
"tasks = pd.DataFrame(task_list)\n",
572+
"tasks = pd.DataFrame(task_list).transpose()\n",
573573
"print(\"First 5 of %s tasks:\" % len(tasks))\n",
574574
"print(tasks[:5][['tid','did','name','task_type','estimation_procedure']])"
575575
]
@@ -688,14 +688,14 @@
688688
"language_info": {
689689
"codemirror_mode": {
690690
"name": "ipython",
691-
"version": 3
691+
"version": 3.0
692692
},
693693
"file_extension": ".py",
694694
"mimetype": "text/x-python",
695695
"name": "python",
696696
"nbconvert_exporter": "python",
697697
"pygments_lexer": "ipython3",
698-
"version": "3.5.1"
698+
"version": "3.4.3"
699699
}
700700
},
701701
"nbformat": 4,

openml/datasets/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from .functions import (list_datasets, list_datasets_by_tag,
2-
check_datasets_active, get_datasets, get_dataset)
1+
from .functions import (list_datasets, check_datasets_active,
2+
get_datasets, get_dataset)
33
from .dataset import OpenMLDataset
44

55
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
6-
'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
7-
'list_datasets']
6+
'OpenMLDataset', 'list_datasets']

openml/datasets/functions.py

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def _list_cached_datasets():
4242
dataset_directory_content = os.listdir(directory_name)
4343

4444
if "dataset.arff" in dataset_directory_content and \
45-
"description.xml" in dataset_directory_content:
45+
"description.xml" in dataset_directory_content:
4646
if dataset_id not in datasets:
4747
datasets.append(dataset_id)
4848

@@ -111,13 +111,22 @@ def _get_cached_dataset_arff(dataset_id):
111111
"cached" % dataset_id)
112112

113113

114-
def list_datasets():
114+
def list_datasets(offset=None, size=None, tag=None):
115115
"""Return a list of all dataset which are on OpenML.
116116
117+
Parameters
118+
----------
119+
offset : int, optional
120+
the number of datasets to skip, starting from the first
121+
size : int, optional
122+
the maximum number of datasets to show
123+
tag : str, optional
124+
the tag to include
125+
117126
Returns
118127
-------
119128
datasets : list of dicts
120-
A list of all datasets.
129+
A list of datasets having the given tag (if applicable).
121130
122131
Every dataset is represented by a dictionary containing
123132
the following information:
@@ -127,22 +136,17 @@ def list_datasets():
127136
If qualities are calculated for the dataset, some of
128137
these are also returned.
129138
"""
130-
return _list_datasets("data/list")
139+
api_call = "data/list"
140+
if offset is not None:
141+
api_call += "/offset/%d" % int(offset)
131142

143+
if size is not None:
144+
api_call += "/limit/%d" % int(size)
132145

133-
def list_datasets_by_tag(tag):
134-
"""Return all datasets having the given tag.
135-
136-
Returns
137-
-------
138-
datasets : list of dicts
139-
A list of all datasets having the given tag. Every dataset is
140-
represented by a dictionary containing the following information:
141-
dataset id, and status. If qualities are calculated for the dataset,
142-
some of these are also returned.
146+
if tag is not None:
147+
api_call += "/tag/%s" % tag
143148

144-
"""
145-
return _list_datasets("data/list/%s" % tag)
149+
return _list_datasets(api_call)
146150

147151

148152
def _list_datasets(api_call):
@@ -154,11 +158,12 @@ def _list_datasets(api_call):
154158
assert type(datasets_dict['oml:data']['oml:dataset']) == list, \
155159
type(datasets_dict['oml:data'])
156160
assert datasets_dict['oml:data']['@xmlns:oml'] == \
157-
'http://openml.org/openml'
161+
'http://openml.org/openml'
158162

159-
datasets = []
163+
datasets = dict()
160164
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
161-
dataset = {'did': int(dataset_['oml:did']),
165+
did = int(dataset_['oml:did'])
166+
dataset = {'did': did,
162167
'name': dataset_['oml:name'],
163168
'format': dataset_['oml:format'],
164169
'status': dataset_['oml:status']}
@@ -169,9 +174,7 @@ def _list_datasets(api_call):
169174
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
170175
quality['#text'] = int(quality['#text'])
171176
dataset[quality['@name']] = quality['#text']
172-
173-
datasets.append(dataset)
174-
datasets.sort(key=lambda t: t['did'])
177+
datasets[did] = dataset
175178

176179
return datasets
177180

openml/tasks/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from .task import OpenMLTask
22
from .split import OpenMLSplit
3-
from .functions import (get_task, list_tasks, list_tasks_by_type,
4-
list_tasks_by_tag)
3+
from .functions import (get_task, list_tasks)
54

6-
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'list_tasks_by_type',
7-
'list_tasks_by_tag', 'OpenMLSplit']
5+
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'OpenMLSplit']

openml/tasks/functions.py

Lines changed: 31 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -89,61 +89,44 @@ def _get_estimation_procedure_list():
8989
return procs
9090

9191

92-
def list_tasks_by_type(task_type_id):
93-
"""Return a list of all tasks for a given tasks type which are on OpenML.
92+
def list_tasks(task_type_id=None, offset=None, size=None, tag=None):
93+
"""Return a number of tasks having the given tag and task_type_id
9494
9595
Parameters
9696
----------
97-
task_type_id : int
97+
task_type_id : int, optional
9898
ID of the task type as detailed
9999
`here <http://www.openml.org/search?type=task_type>`_.
100+
offset : int, optional
101+
the number of tasks to skip, starting from the first
102+
size : int, optional
103+
the maximum number of tasks to show
104+
tag : str, optional
105+
the tag to include
100106
101107
Returns
102108
-------
103109
list
104-
A list of all tasks of the given task type. Every task is represented by
105-
a dictionary containing the following information: task id,
106-
dataset id, task_type and status. If qualities are calculated for
107-
the associated dataset, some of these are also returned.
110+
A list of all tasks having the given task_type_id and the given tag.
111+
Every task is represented by a dictionary containing the following
112+
information: task id, dataset id, task_type and status. If qualities
113+
are calculated for the associated dataset, some of these are also
114+
returned.
108115
"""
109-
try:
110-
task_type_id = int(task_type_id)
111-
except:
112-
raise ValueError("Task Type ID is neither an Integer nor can be "
113-
"cast to an Integer.")
114-
return _list_tasks("task/list/type/%d" % task_type_id)
115-
116-
117-
def list_tasks_by_tag(tag):
118-
"""Return all tasks having the given tag
116+
api_call = "task/list"
117+
if task_type_id is not None:
118+
api_call += "/type/%d" % int(task_type_id)
119119

120-
Parameters
121-
----------
122-
tag : str
123-
124-
Returns
125-
-------
126-
list
127-
A list of all tasks having a give tag. Every task is represented by
128-
a dictionary containing the following information: task id,
129-
dataset id, task_type and status. If qualities are calculated for
130-
the associated dataset, some of these are also returned.
131-
"""
132-
return _list_tasks("task/list/tag/%s" % tag)
120+
if offset is not None:
121+
api_call += "/offset/%d" % int(offset)
133122

123+
if size is not None:
124+
api_call += "/limit/%d" % int(size)
134125

135-
def list_tasks():
136-
"""Return a list of all tasks which are on OpenML.
126+
if tag is not None:
127+
api_call += "/tag/%s" % tag
137128

138-
Returns
139-
-------
140-
list
141-
A list of all tasks. Every task is represented by a
142-
dictionary containing the following information: task id,
143-
dataset id, task_type and status. If qualities are calculated for
144-
the associated dataset, some of these are also returned.
145-
"""
146-
return _list_tasks('task/list')
129+
return _list_tasks(api_call)
147130

148131

149132
def _list_tasks(api_call):
@@ -162,12 +145,15 @@ def _list_tasks(api_call):
162145
'"oml:runs"/@xmlns:oml is not '
163146
'"http://openml.org/openml": %s'
164147
% str(tasks_dict))
148+
165149
try:
166-
tasks = []
150+
tasks = dict();
167151
procs = _get_estimation_procedure_list()
168152
proc_dict = dict((x['id'], x) for x in procs)
169153
for task_ in tasks_dict['oml:tasks']['oml:task']:
170-
task = {'tid': int(task_['oml:task_id']),
154+
tid = int(task_['oml:task_id'])
155+
task = {'tid': tid,
156+
'ttid': int(task_['oml:task_type_id']),
171157
'did': int(task_['oml:did']),
172158
'name': task_['oml:name'],
173159
'task_type': task_['oml:task_type'],
@@ -187,12 +173,10 @@ def _list_tasks(api_call):
187173
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
188174
quality['#text'] = int(quality['#text'])
189175
task[quality['@name']] = quality['#text']
190-
tasks.append(task)
176+
tasks[tid] = task
191177
except KeyError as e:
192178
raise KeyError("Invalid xml for task: %s" % e)
193179

194-
tasks.sort(key=lambda t: t['tid'])
195-
196180
return tasks
197181

198182

@@ -262,7 +246,7 @@ def _create_task_from_xml(xml):
262246
estimation_parameters[name] = text
263247

264248
return OpenMLTask(
265-
dic["oml:task_id"], dic["oml:task_type"],
249+
dic["oml:task_id"], dic['oml:task_type_id'], dic["oml:task_type"],
266250
inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
267251
inputs["source_data"]["oml:data_set"]["oml:target_feature"],
268252
inputs["estimation_procedure"]["oml:estimation_procedure"][

openml/tasks/task.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010

1111
class OpenMLTask(object):
12-
def __init__(self, task_id, task_type, data_set_id, target_name,
13-
estimation_procedure_type, data_splits_url,
12+
def __init__(self, task_id, task_type_id, task_type, data_set_id,
13+
target_name, estimation_procedure_type, data_splits_url,
1414
estimation_parameters, evaluation_measure, cost_matrix,
1515
class_labels=None):
1616
self.task_id = int(task_id)

tests/datasets/test_datasets.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,7 @@ def test_get_cached_dataset_arff_not_cached(self):
7171
openml.datasets.functions._get_cached_dataset_arff,
7272
3)
7373

74-
def test_list_datasets(self):
75-
# We can only perform a smoke test here because we test on dynamic
76-
# data from the internet...
77-
datasets = openml.datasets.list_datasets()
78-
# 1087 as the number of datasets on openml.org
79-
self.assertGreaterEqual(len(datasets), 1087)
80-
for dataset in datasets:
74+
def _check_dataset(self, dataset):
8175
self.assertEqual(type(dataset), dict)
8276
self.assertGreaterEqual(len(dataset), 2)
8377
self.assertIn('did', dataset)
@@ -87,18 +81,29 @@ def test_list_datasets(self):
8781
self.assertIn(dataset['status'], ['in_preparation', 'active',
8882
'deactivated'])
8983

84+
def test_list_datasets(self):
85+
# We can only perform a smoke test here because we test on dynamic
86+
# data from the internet...
87+
datasets = openml.datasets.list_datasets()
88+
# 1087 as the number of datasets on openml.org
89+
self.assertGreaterEqual(len(datasets), 1087)
90+
for did in datasets:
91+
self._check_dataset(datasets[did])
92+
9093
def test_list_datasets_by_tag(self):
91-
datasets = openml.datasets.list_datasets_by_tag('uci')
94+
datasets = openml.datasets.list_datasets(tag='uci')
9295
self.assertGreaterEqual(len(datasets), 5)
93-
for dataset in datasets:
94-
self.assertEqual(type(dataset), dict)
95-
self.assertGreaterEqual(len(dataset), 2)
96-
self.assertIn('did', dataset)
97-
self.assertIsInstance(dataset['did'], int)
98-
self.assertIn('status', dataset)
99-
self.assertTrue(is_string(dataset['status']))
100-
self.assertIn(dataset['status'], ['in_preparation', 'active',
101-
'deactivated'])
96+
for did in datasets:
97+
self._check_dataset(datasets[did])
98+
99+
def test_list_datasets_paginate(self):
100+
size = 10
101+
max = 100
102+
for i in range(0, max, size):
103+
datasets = openml.datasets.list_datasets(offset=i, size=size)
104+
self.assertGreaterEqual(size, len(datasets))
105+
for did in datasets:
106+
self._check_dataset(datasets[did])
102107

103108
@unittest.skip('See https://github.com/openml/openml-python/issues/149')
104109
def test_check_datasets_active(self):

0 commit comments

Comments
 (0)