Skip to content

Commit 500f80f

Browse files
committed
usability improvements
Better error on task XML parsing; rename OpenMLDataset.get_dataset to OpenMLDataset.get_data; some docstrings.
1 parent d148173 commit 500f80f

7 files changed

Lines changed: 186 additions & 159 deletions

File tree

openml/_api_calls.py

Lines changed: 0 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,63 +5,6 @@
55
from . import config
66

77

8-
"""
9-
Provides an interface to the OpenML server.
10-
11-
All parameters of the APIConnector can be either specified in a config
12-
file or when creating this object. The config file must be placed in a
13-
directory ``.openml`` inside the users home directory and have the name
14-
``config``. If one of the parameters is specified by passing it to the
15-
constructor of this class, it will override the value specified in the
16-
configuration file.
17-
18-
Parameters
19-
----------
20-
cache_directory : string, optional (default=None)
21-
A local directory which will be used for caching. If this is not set, a
22-
directory '.openml/cache' in the users home directory will be used.
23-
If either directory does not exist, it will be created.
24-
25-
apikey : string, optional (default=None)
26-
Your OpenML API key which will be used to authenticate you at the OpenML
27-
server.
28-
29-
server : string, optional (default=None)
30-
The OpenML server to connect to.
31-
32-
verbosity : int, optional (default=None)
33-
34-
configure_logger : bool (default=True)
35-
Whether the python logging module should be configured by the openml
36-
package. If set to true, this is a very basic configuration,
37-
which only prints to the standard output. This is only recommended
38-
for testing or small problems. It is set to True to adhere to the
39-
`specifications of the OpenML client API
40-
<https://github.com/openml/OpenML/wiki/Client-API>`_.
41-
When the openml module is used as a library, it is recommended that
42-
the main application controls the logging level, e.g. see
43-
`here <http://pieces.openpolitics.com
44-
/2012/04/python-logging-best-practices/>`_.
45-
46-
private_directory : str, optional (default=None)
47-
A local directory which can be accessed through the OpenML package.
48-
Useful to access private datasets through the same interface.
49-
50-
Raises
51-
------
52-
ValueError
53-
If apikey is neither specified in the config nor given as an argument.
54-
OpenMLServerError
55-
If the OpenML server returns an unexpected response.
56-
57-
Notes
58-
-----
59-
Testing the API calls in Firefox is possible with the Firefox AddOn
60-
HTTPRequestor.
61-
62-
"""
63-
64-
658
def _perform_api_call(call, data=None, file_dictionary=None,
669
file_elements=None, add_authentication=True):
6710
"""

openml/config.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Stores module-level information like the API key, cache directory, private
3+
directory and the server.
4+
"""
15
import os
26
import sys
37
import logging
@@ -21,6 +25,16 @@
2125

2226

2327
def _setup():
28+
"""Setup openml package. Called on first import.
29+
30+
Reads the config file and sets up apikey, server, cache appropriately.
31+
key and server can be set by the user simply using
32+
openml.config.apikey = THEIRKEY
33+
openml.config.server = SOMESERVER
34+
The cache dir needs to be set up by calling set_cache_directory
35+
because it needs some setup.
36+
We could also make it a property but that's less clear.
37+
"""
2438
global apikey
2539
global server
2640
# read config file, create cache directory
@@ -38,6 +52,24 @@ def _setup():
3852

3953

4054
def set_cache_directory(cachedir, privatedir):
55+
"""Set module-wide cache directory.
56+
57+
Sets the cache directory into which to download datasets, tasks etc.
58+
Also sets the private directory for storing local datasets.
59+
60+
Parameters
61+
----------
62+
cachedir : string
63+
Path to use as cache directory.
64+
65+
privatedir : string
66+
Path containing private datasets, tasks, etc.
67+
68+
See also
69+
--------
70+
get_cache_directory
71+
get_private_directory
72+
"""
4173
global _cachedir
4274
global _privatedir
4375
_cachedir = cachedir
@@ -67,6 +99,8 @@ def set_cache_directory(cachedir, privatedir):
6799

68100

69101
def _parse_config():
102+
"""Parse the config file, set up defaults.
103+
"""
70104
defaults = {'apikey': apikey,
71105
'server': server,
72106
'verbosity': 0,
@@ -99,10 +133,34 @@ def _parse_config():
99133

100134

101135
def get_cache_directory():
136+
"""Get the current cache directory.
137+
138+
Returns
139+
-------
140+
cachedir : string
141+
The current cache directory.
142+
143+
See also
144+
--------
145+
set_cache_directory
146+
get_private_directory
147+
"""
102148
return _cachedir
103149

104150

105151
def get_private_directory():
152+
"""Get the current private directory.
153+
154+
Returns
155+
-------
156+
privatedir : string
157+
The current private directory.
158+
159+
See also
160+
--------
161+
set_cache_directory
162+
get_cache_directory
163+
"""
106164
return _privatedir
107165

108166
__all__ = ["set_cache_directory", 'get_cache_directory', 'get_private_directory']

openml/datasets/dataset.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,18 @@
2222

2323

2424
class OpenMLDataset(object):
25-
25+
"""Dataset object.
26+
27+
Allows fetching and uploading datasets to OpenML.
28+
29+
Parameters
30+
----------
31+
name : string
32+
Name of the dataset
33+
description : string
34+
Description of the dataset
35+
FIXME : which of these do we actually need?
36+
"""
2637
def __init__(self, id=None, name=None, version=None, description=None,
2738
format=None, creator=None, contributor=None,
2839
collection_date=None, upload_date=None, language=None,
@@ -63,7 +74,7 @@ def __init__(self, id=None, name=None, version=None, description=None,
6374
logger.debug("Data pickle file already exists.")
6475
else:
6576
try:
66-
data = self.get_arff()
77+
data = self._get_arff()
6778
except OSError as e:
6879
logger.critical("Please check that the data file %s is there "
6980
"and can be read.", self.data_file)
@@ -98,9 +109,7 @@ def __eq__(self, other):
98109
else:
99110
return False
100111

101-
##########################################################################
102-
# ARFF related stuff
103-
def get_arff(self):
112+
def _get_arff(self):
104113
# TODO: add a partial read method which only returns the attribute
105114
# headers of the corresponding .arff file!
106115

@@ -124,11 +133,20 @@ def decode_arff(fh):
124133
with open(filename) as fh:
125134
return decode_arff(fh)
126135

127-
##########################################################################
128-
def get_dataset(self, target=None, target_dtype=int, include_row_id=False,
129-
include_ignore_attributes=False,
130-
return_categorical_indicator=False,
131-
return_attribute_names=False):
136+
def get_data(self, target=None, target_dtype=int, include_row_id=False,
137+
include_ignore_attributes=False,
138+
return_categorical_indicator=False,
139+
return_attribute_names=False):
140+
"""Returns dataset content as numpy arrays / sparse matrices.
141+
142+
Parameters
143+
----------
144+
145+
146+
Returns
147+
-------
148+
149+
"""
132150
rval = []
133151

134152
path = self.data_pickle_file
@@ -224,6 +242,13 @@ def retrieve_class_labels(self):
224242
return None
225243

226244
def publish(self):
245+
"""Publish the dataset on the OpenML server.
246+
247+
Upload the dataset description and dataset content to openml.
248+
249+
Returns
250+
-------
251+
"""
227252
data = {'description': self.to_xml()}
228253
if self.data_file is not None:
229254
return_code, return_value = _perform_api_call(

openml/runs/run.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ def publish(self):
7777
description_xml = self.create_description_xml()
7878
data = {'predictions': predictions, 'description':
7979
description_xml}
80-
return_code, dataset_xml = _perform_api_call(
80+
return_code, return_value = _perform_api_call(
8181
"/run/", file_elements=data)
82-
return return_code, dataset_xml
82+
return return_code, return_value
8383

8484
def create_description_xml(self):
8585
run_environment = _get_version_information()
@@ -311,7 +311,7 @@ def _create_run_from_xml(xml):
311311
raise ValueError('No URL to download predictions for run %d in run '
312312
'description XML' % run_id)
313313
evaluations = dict()
314-
detailed_evaluations = defaultdict(lambda : defaultdict(dict))
314+
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
315315
evaluation_flows = dict()
316316
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
317317
key = evaluation_dict['oml:name']

openml/tasks/task_functions.py

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -154,33 +154,35 @@ def _list_tasks(api_call):
154154
'"oml:runs"/@xmlns:oml is not '
155155
'"http://openml.org/openml": %s'
156156
% str(tasks_dict))
157-
158-
tasks = []
159-
procs = get_estimation_procedure_list()
160-
proc_dict = dict((x['id'], x) for x in procs)
161-
for task_ in tasks_dict['oml:tasks']['oml:task']:
162-
task = {'tid': int(task_['oml:task_id']),
163-
'did': int(task_['oml:did']),
164-
'name': task_['oml:name'],
165-
'task_type': task_['oml:task_type'],
166-
'status': task_['oml:status']}
167-
168-
# Other task inputs
169-
for input in task_.get('oml:input', list()):
170-
if input['@name'] == 'estimation_procedure':
171-
task[input['@name']] = proc_dict[int(input['#text'])]['name']
172-
else:
173-
value = input.get('#text')
174-
task[input['@name']] = value
175-
176-
task[input['@name']] = input['#text']
177-
178-
# The number of qualities can range from 0 to infinity
179-
for quality in task_.get('oml:quality', list()):
180-
quality['#text'] = float(quality['#text'])
181-
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
182-
quality['#text'] = int(quality['#text'])
183-
task[quality['@name']] = quality['#text']
157+
try:
158+
tasks = []
159+
procs = get_estimation_procedure_list()
160+
proc_dict = dict((x['id'], x) for x in procs)
161+
for task_ in tasks_dict['oml:tasks']['oml:task']:
162+
task = {'tid': int(task_['oml:task_id']),
163+
'did': int(task_['oml:did']),
164+
'name': task_['oml:name'],
165+
'task_type': task_['oml:task_type'],
166+
'status': task_['oml:status']}
167+
168+
# Other task inputs
169+
for input in task_.get('oml:input', list()):
170+
if input['@name'] == 'estimation_procedure':
171+
task[input['@name']] = proc_dict[int(input['#text'])]['name']
172+
else:
173+
value = input.get('#text')
174+
task[input['@name']] = value
175+
176+
task[input['@name']] = input['#text']
177+
178+
# The number of qualities can range from 0 to infinity
179+
for quality in task_.get('oml:quality', list()):
180+
quality['#text'] = float(quality['#text'])
181+
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
182+
quality['#text'] = int(quality['#text'])
183+
task[quality['@name']] = quality['#text']
184+
except KeyError as e:
185+
raise KeyError("Invalid xml for task: %s" % e)
184186

185187
tasks.append(task)
186188
tasks.sort(key=lambda t: t['tid'])

openml/testing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ class TestBase(unittest.TestCase):
99
1010
Note
1111
----
12-
A config file with the username and password must be present to test the
13-
API calls.
12+
Currently hard-codes a read-write key.
13+
Hopefully soon allows using a test server, not the production server.
1414
"""
1515

1616
def setUp(self):

0 commit comments

Comments
 (0)