Skip to content

Commit 7129cf0

Browse files
Neeratyoy authored and mfeurer committed
Option to return dataframes for listing functions (#662)
* Adding dataframe output option to listing functions * Adding 'object' as new output format for listing * Editing examples for dataframe output option * Implementing coding standards as per suggestions. * Adding test cases for listing as dataframe * Convert list to List * Fixing rebase bugs, flake errors and test cases * Fixing new unit test for flow * Fixing bug in unit test for flow * Fixing test case bug * Update functions.py * Update functions.py * Update functions.py
1 parent 72f131a commit 7129cf0

17 files changed

Lines changed: 556 additions & 167 deletions

File tree

examples/datasets_tutorial.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,12 @@
1010
import pandas as pd
1111

1212
############################################################################
13-
# List datasets
14-
# =============
13+
# Exercise 0
14+
# **********
15+
#
16+
# * List datasets
17+
# * Use the output_format parameter to select output type
18+
# * Default gives 'dict' (other option: 'dataframe')
1519

1620
openml_list = openml.datasets.list_datasets() # returns a dict
1721

@@ -25,6 +29,10 @@
2529
print("First 10 of %s datasets..." % len(datalist))
2630
datalist.head(n=10)
2731

32+
# The same can be done with fewer lines of code
33+
openml_df = openml.datasets.list_datasets(output_format='dataframe')
34+
openml_df.head(n=10)
35+
2836
############################################################################
2937
# Exercise 1
3038
# **********

examples/tasks_tutorial.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@
4242
print("First 5 of %s tasks:" % len(tasks))
4343
pprint(tasks.head())
4444

45+
# The same can be obtained with fewer lines of code
46+
tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe')
47+
pprint(tasks_df.head())
48+
4549
############################################################################
4650
# We can filter the list of tasks to only contain datasets with more than
4751
# 500 samples, but less than 1000 samples:

openml/datasets/functions.py

Lines changed: 57 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ def _get_cached_datasets():
8282
return datasets
8383

8484

85-
def _get_cached_dataset(dataset_id):
85+
def _get_cached_dataset(
86+
dataset_id: int
87+
) -> OpenMLDataset:
8688
"""Get cached dataset for ID.
8789
8890
Returns
@@ -163,7 +165,14 @@ def _get_cache_directory(dataset: OpenMLDataset) -> str:
163165
return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id)
164166

165167

166-
def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs):
168+
def list_datasets(
169+
offset: Optional[int] = None,
170+
size: Optional[int] = None,
171+
status: Optional[str] = None,
172+
tag: Optional[str] = None,
173+
output_format: str = 'dict',
174+
**kwargs
175+
) -> Union[Dict, pd.DataFrame]:
167176

168177
"""
169178
Return a list of all datasets which are on OpenML.
@@ -180,61 +189,83 @@ def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs):
180189
default active datasets are returned, but also datasets
181190
from another status can be requested.
182191
tag : str, optional
192+
output_format: str, optional (default='dict')
193+
The parameter decides the format of the output.
194+
- If 'dict' the output is a dict of dict
195+
- If 'dataframe' the output is a pandas DataFrame
183196
kwargs : dict, optional
184197
Legal filter operators (keys in the dict):
185198
data_name, data_version, number_instances,
186199
number_features, number_classes, number_missing_values.
187200
188201
Returns
189202
-------
190-
datasets : dict of dicts
191-
A mapping from dataset ID to dict.
192-
193-
Every dataset is represented by a dictionary containing
194-
the following information:
195-
- dataset id
196-
- name
197-
- format
198-
- status
199-
200-
If qualities are calculated for the dataset, some of
201-
these are also returned.
203+
datasets : dict of dicts, or dataframe
204+
- If output_format='dict'
205+
A mapping from dataset ID to dict.
206+
207+
Every dataset is represented by a dictionary containing
208+
the following information:
209+
- dataset id
210+
- name
211+
- format
212+
- status
213+
If qualities are calculated for the dataset, some of
214+
these are also returned.
215+
216+
- If output_format='dataframe'
217+
Each row maps to a dataset
218+
Each column contains the following information:
219+
- dataset id
220+
- name
221+
- format
222+
- status
223+
If qualities are calculated for the dataset, some of
224+
these are also included as columns.
202225
"""
226+
if output_format not in ['dataframe', 'dict']:
227+
raise ValueError("Invalid output format selected. "
228+
"Only 'dict' or 'dataframe' applicable.")
203229

204-
return openml.utils._list_all(_list_datasets,
230+
return openml.utils._list_all(output_format=output_format,
231+
listing_call=_list_datasets,
205232
offset=offset,
206233
size=size,
207234
status=status,
208235
tag=tag,
209236
**kwargs)
210237

211238

212-
def _list_datasets(**kwargs):
239+
def _list_datasets(output_format='dict', **kwargs):
213240

214241
"""
215242
Perform api call to return a list of all datasets.
216243
217244
Parameters
218245
----------
246+
output_format: str, optional (default='dict')
247+
The parameter decides the format of the output.
248+
- If 'dict' the output is a dict of dict
249+
- If 'dataframe' the output is a pandas DataFrame
219250
kwargs : dict, optional
220251
Legal filter operators (keys in the dict):
221252
tag, status, limit, offset, data_name, data_version, number_instances,
222253
number_features, number_classes, number_missing_values.
223254
224255
Returns
225256
-------
226-
datasets : dict of dicts
257+
datasets : dict of dicts, or dataframe
227258
"""
228259

229260
api_call = "data/list"
230261

231262
if kwargs is not None:
232263
for operator, value in kwargs.items():
233264
api_call += "/%s/%s" % (operator, value)
234-
return __list_datasets(api_call)
265+
return __list_datasets(api_call=api_call, output_format=output_format)
235266

236267

237-
def __list_datasets(api_call):
268+
def __list_datasets(api_call, output_format='dict'):
238269

239270
xml_string = openml._api_calls._perform_api_call(api_call, 'get')
240271
datasets_dict = xmltodict.parse(xml_string, force_list=('oml:dataset',))
@@ -262,6 +293,9 @@ def __list_datasets(api_call):
262293
dataset[quality['@name']] = float(quality['#text'])
263294
datasets[dataset['did']] = dataset
264295

296+
if output_format == 'dataframe':
297+
datasets = pd.DataFrame.from_dict(datasets, orient='index')
298+
265299
return datasets
266300

267301

@@ -341,8 +375,8 @@ def _name_to_id(
341375

342376

343377
def get_datasets(
344-
dataset_ids: List[Union[str, int]],
345-
download_data: bool = True,
378+
dataset_ids: List[Union[str, int]],
379+
download_data: bool = True,
346380
) -> List[OpenMLDataset]:
347381
"""Download datasets.
348382
@@ -667,8 +701,8 @@ def create_dataset(name, description, creator, contributor,
667701
do not construct a valid ARFF file")
668702

669703
return OpenMLDataset(
670-
name,
671-
description,
704+
name=name,
705+
description=description,
672706
data_format=data_format,
673707
creator=creator,
674708
contributor=contributor,

openml/evaluations/functions.py

Lines changed: 92 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,26 @@
11
import json
22
import xmltodict
3+
import pandas as pd
4+
from typing import Union, List, Optional, Dict
35

46
import openml.utils
57
import openml._api_calls
68
from ..evaluations import OpenMLEvaluation
79

810

9-
def list_evaluations(function, offset=None, size=None, id=None, task=None,
10-
setup=None, flow=None, uploader=None, tag=None,
11-
per_fold=None):
11+
def list_evaluations(
12+
function: str,
13+
offset: Optional[int] = None,
14+
size: Optional[int] = None,
15+
id: Optional[List] = None,
16+
task: Optional[List] = None,
17+
setup: Optional[List] = None,
18+
flow: Optional[List] = None,
19+
uploader: Optional[List] = None,
20+
tag: Optional[str] = None,
21+
per_fold: Optional[bool] = None,
22+
output_format: str = 'object'
23+
) -> Union[Dict, pd.DataFrame]:
1224
"""
1325
List all run-evaluation pairs matching all of the given filters.
1426
(Supports large amount of results)
@@ -36,21 +48,48 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None,
3648
3749
per_fold : bool, optional
3850
51+
output_format: str, optional (default='object')
52+
The parameter decides the format of the output.
53+
- If 'object' the output is a dict of OpenMLEvaluation objects
54+
- If 'dict' the output is a dict of dict
55+
- If 'dataframe' the output is a pandas DataFrame
56+
3957
Returns
4058
-------
41-
dict
59+
dict or dataframe
4260
"""
43-
if per_fold is not None:
44-
per_fold = str(per_fold).lower()
45-
46-
return openml.utils._list_all(_list_evaluations, function, offset=offset,
47-
size=size, id=id, task=task, setup=setup,
48-
flow=flow, uploader=uploader, tag=tag,
49-
per_fold=per_fold)
61+
if output_format not in ['dataframe', 'dict', 'object']:
62+
raise ValueError("Invalid output format selected. "
63+
"Only 'object', 'dataframe', or 'dict' applicable.")
5064

51-
52-
def _list_evaluations(function, id=None, task=None,
53-
setup=None, flow=None, uploader=None, **kwargs):
65+
per_fold_str = None
66+
if per_fold is not None:
67+
per_fold_str = str(per_fold).lower()
68+
69+
return openml.utils._list_all(output_format=output_format,
70+
listing_call=_list_evaluations,
71+
function=function,
72+
offset=offset,
73+
size=size,
74+
id=id,
75+
task=task,
76+
setup=setup,
77+
flow=flow,
78+
uploader=uploader,
79+
tag=tag,
80+
per_fold=per_fold_str)
81+
82+
83+
def _list_evaluations(
84+
function: str,
85+
id: Optional[List] = None,
86+
task: Optional[List] = None,
87+
setup: Optional[List] = None,
88+
flow: Optional[List] = None,
89+
uploader: Optional[List] = None,
90+
output_format: str = 'object',
91+
**kwargs
92+
) -> Union[Dict, pd.DataFrame]:
5493
"""
5594
Perform API call ``/evaluation/function{function}/{filters}``
5695
@@ -75,9 +114,17 @@ def _list_evaluations(function, id=None, task=None,
75114
kwargs: dict, optional
76115
Legal filter operators: tag, limit, offset.
77116
117+
output_format: str, optional (default='object')
118+
The parameter decides the format of the output.
119+
- If 'object' the output is a dict of
120+
OpenMLEvaluation objects
121+
- If 'dict' the output is a dict of dict
122+
- If 'dataframe' the output is a
123+
pandas DataFrame
124+
78125
Returns
79126
-------
80-
dict
127+
dict of objects, or dataframe
81128
"""
82129

83130
api_call = "evaluation/list/function/%s" % function
@@ -95,10 +142,10 @@ def _list_evaluations(function, id=None, task=None,
95142
if uploader is not None:
96143
api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader])
97144

98-
return __list_evaluations(api_call)
145+
return __list_evaluations(api_call, output_format=output_format)
99146

100147

101-
def __list_evaluations(api_call):
148+
def __list_evaluations(api_call, output_format='object'):
102149
"""Helper function to parse API calls which are lists of runs"""
103150
xml_string = openml._api_calls._perform_api_call(api_call, 'get')
104151
evals_dict = xmltodict.parse(xml_string, force_list=('oml:evaluation',))
@@ -123,15 +170,33 @@ def __list_evaluations(api_call):
123170
if 'oml:array_data' in eval_:
124171
array_data = eval_['oml:array_data']
125172

126-
evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
127-
int(eval_['oml:task_id']),
128-
int(eval_['oml:setup_id']),
129-
int(eval_['oml:flow_id']),
130-
eval_['oml:flow_name'],
131-
eval_['oml:data_id'],
132-
eval_['oml:data_name'],
133-
eval_['oml:function'],
134-
eval_['oml:upload_time'],
135-
value, values, array_data)
173+
if output_format == 'object':
174+
evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
175+
int(eval_['oml:task_id']),
176+
int(eval_['oml:setup_id']),
177+
int(eval_['oml:flow_id']),
178+
eval_['oml:flow_name'],
179+
eval_['oml:data_id'],
180+
eval_['oml:data_name'],
181+
eval_['oml:function'],
182+
eval_['oml:upload_time'],
183+
value, values, array_data)
184+
else:
185+
# for output_format in ['dict', 'dataframe']
186+
evals[run_id] = {'run_id': int(eval_['oml:run_id']),
187+
'task_id': int(eval_['oml:task_id']),
188+
'setup_id': int(eval_['oml:setup_id']),
189+
'flow_id': int(eval_['oml:flow_id']),
190+
'flow_name': eval_['oml:flow_name'],
191+
'data_id': eval_['oml:data_id'],
192+
'data_name': eval_['oml:data_name'],
193+
'function': eval_['oml:function'],
194+
'upload_time': eval_['oml:upload_time'],
195+
'value': value,
196+
'values': values,
197+
'array_data': array_data}
198+
199+
if output_format == 'dataframe':
200+
evals = pd.DataFrame.from_dict(evals, orient='index')
136201

137202
return evals

0 commit comments

Comments
 (0)