Skip to content

Commit 5a827a3

Browse files
committed
Merge pull request #109 from amueller/usability_improvements
Usability improvements work in progress
2 parents d148173 + 2735bde commit 5a827a3

20 files changed

Lines changed: 493 additions & 337 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ target/
7272

7373
# IDE
7474
.idea
75+
*.swp
7576

7677
# Other
7778
*.pkl

doc/progress.rst

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,3 @@ API call implemented tested properly test
6666
=============================================== =========== ====== =============== ========== =====================
6767

6868
We do not plan to implement API calls marked with an **X**!
69-
70-
Convenience Functions
71-
=====================
72-
73-
=============================================== =========== ====== =============== ========== =====================
74-
Method implemented tested properly tested loads json proper error handling
75-
=============================================== =========== ====== =============== ========== =====================
76-
_get_cached_split yes
77-
_get_cached_splits yes
78-
_get_cached_dataset yes yes
79-
_get_cached_datasets yes yes
80-
get_cached_task yes
81-
get_cached_tasks yes
82-
=============================================== =========== ====== =============== ========== =====================

examples/OpenMLDemo.ipynb

Lines changed: 33 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@
118118
"name": "stdout",
119119
"output_type": "stream",
120120
"text": [
121-
"First 10 of 3335 datasets...\n",
121+
"First 10 of 2806 datasets...\n",
122122
" did name NumberOfInstances NumberOfFeatures\n",
123123
"0 1 anneal 898 39\n",
124124
"1 2 anneal 898 39\n",
@@ -361,7 +361,7 @@
361361
}
362362
],
363363
"source": [
364-
"X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute, return_attribute_names=True)\n",
364+
"X, y, attribute_names = dataset.get_data(target=dataset.default_target_attribute, return_attribute_names=True)\n",
365365
"iris = pd.DataFrame(X, columns=attribute_names)\n",
366366
"iris['class'] = y\n",
367367
"print(iris[:10])"
@@ -417,7 +417,7 @@
417417
],
418418
"source": [
419419
"dataset = openml.datasets.get_dataset(61)\n",
420-
"X, y = dataset.get_dataset(target=dataset.default_target_attribute)\n",
420+
"X, y = dataset.get_data(target=dataset.default_target_attribute)\n",
421421
"clf = ensemble.RandomForestClassifier()\n",
422422
"clf.fit(X, y)"
423423
]
@@ -464,7 +464,23 @@
464464
"metadata": {
465465
"collapsed": false
466466
},
467-
"outputs": [],
467+
"outputs": [
468+
{
469+
"data": {
470+
"text/plain": [
471+
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
472+
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
473+
" min_samples_leaf=1, min_samples_split=2,\n",
474+
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
475+
" oob_score=False, random_state=None, verbose=0,\n",
476+
" warm_start=False)"
477+
]
478+
},
479+
"execution_count": 12,
480+
"metadata": {},
481+
"output_type": "execute_result"
482+
}
483+
],
468484
"source": [
469485
"X_2d = X[:,2:4]\n",
470486
"clf.fit(X_2d, y)\n",
@@ -502,7 +518,7 @@
502518
}
503519
],
504520
"source": [
505-
"X, y, categorical = dataset.get_dataset(target=dataset.default_target_attribute,return_categorical_indicator=True)\n",
521+
"X, y, categorical = dataset.get_data(target=dataset.default_target_attribute,return_categorical_indicator=True)\n",
506522
"enc = preprocessing.OneHotEncoder(categorical_features=categorical)\n",
507523
"X = enc.fit_transform(X)\n",
508524
"clf.fit(X, y)"
@@ -537,16 +553,16 @@
537553
},
538554
"outputs": [
539555
{
540-
"ename": "TypeError",
541-
"evalue": "int() argument must be a string, a bytes-like object or a number, not 'NoneType'",
542-
"output_type": "error",
543-
"traceback": [
544-
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
545-
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
546-
"\u001b[1;32m<ipython-input-14-7653a7076e49>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtask_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopenml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlist_tasks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mtasks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtask_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"First 5 of %s tasks:\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'tid'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'did'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'name'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'task_type'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'estimation_procedure'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
547-
"\u001b[1;32m/home/andy/checkout/openml-python/openml/tasks/task_functions.py\u001b[0m in \u001b[0;36mlist_tasks\u001b[1;34m()\u001b[0m\n\u001b[0;32m 136\u001b[0m \u001b[0mthe\u001b[0m \u001b[0massociated\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msome\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthese\u001b[0m \u001b[0mare\u001b[0m \u001b[0malso\u001b[0m \u001b[0mreturned\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 137\u001b[0m \"\"\"\n\u001b[1;32m--> 138\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_list_tasks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'task/list'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 139\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 140\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
548-
"\u001b[1;32m/home/andy/checkout/openml-python/openml/tasks/task_functions.py\u001b[0m in \u001b[0;36m_list_tasks\u001b[1;34m(api_call)\u001b[0m\n\u001b[0;32m 160\u001b[0m \u001b[0mproc_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'id'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mprocs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 161\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtask_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtasks_dict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'oml:tasks'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'oml:task'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 162\u001b[1;33m task = {'tid': int(task_['oml:task_id']),\n\u001b[0m\u001b[0;32m 163\u001b[0m \u001b[1;34m'did'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtask_\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'oml:did'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 164\u001b[0m \u001b[1;34m'name'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mtask_\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'oml:name'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
549-
"\u001b[1;31mTypeError\u001b[0m: int() argument must be a string, a bytes-like object or a number, not 'NoneType'"
556+
"name": "stdout",
557+
"output_type": "stream",
558+
"text": [
559+
"First 5 of 8566 tasks:\n",
560+
" tid did name task_type estimation_procedure\n",
561+
"0 1 1 anneal Supervised Classification 10-fold Crossvalidation\n",
562+
"1 2 2 anneal Supervised Classification 10-fold Crossvalidation\n",
563+
"2 3 3 kr-vs-kp Supervised Classification 10-fold Crossvalidation\n",
564+
"3 4 4 labor Supervised Classification 10-fold Crossvalidation\n",
565+
"4 5 5 arrhythmia Supervised Classification 10-fold Crossvalidation\n"
550566
]
551567
}
552568
],
@@ -644,8 +660,8 @@
644660
"name": "stdout",
645661
"output_type": "stream",
646662
"text": [
647-
"Uploaded run with id 538163\n",
648-
"Check it at www.openml.org/r/538163\n"
663+
"Uploaded run with id 538241\n",
664+
"Check it at www.openml.org/r/538241\n"
649665
]
650666
}
651667
],
@@ -667,24 +683,6 @@
667683
"source": [
668684
"More to come soon..."
669685
]
670-
},
671-
{
672-
"cell_type": "code",
673-
"execution_count": null,
674-
"metadata": {
675-
"collapsed": true
676-
},
677-
"outputs": [],
678-
"source": []
679-
},
680-
{
681-
"cell_type": "code",
682-
"execution_count": null,
683-
"metadata": {
684-
"collapsed": true
685-
},
686-
"outputs": [],
687-
"source": []
688686
}
689687
],
690688
"metadata": {

openml/_api_calls.py

Lines changed: 0 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,63 +5,6 @@
55
from . import config
66

77

8-
"""
9-
Provides an interface to the OpenML server.
10-
11-
All parameters of the APIConnector can be either specified in a config
12-
file or when creating this object. The config file must be placed in a
13-
directory ``.openml`` inside the users home directory and have the name
14-
``config``. If one of the parameters is specified by passing it to the
15-
constructor of this class, it will override the value specified in the
16-
configuration file.
17-
18-
Parameters
19-
----------
20-
cache_directory : string, optional (default=None)
21-
A local directory which will be used for caching. If this is not set, a
22-
directory '.openml/cache' in the users home directory will be used.
23-
If either directory does not exist, it will be created.
24-
25-
apikey : string, optional (default=None)
26-
Your OpenML API key which will be used to authenticate you at the OpenML
27-
server.
28-
29-
server : string, optional (default=None)
30-
The OpenML server to connect to.
31-
32-
verbosity : int, optional (default=None)
33-
34-
configure_logger : bool (default=True)
35-
Whether the python logging module should be configured by the openml
36-
package. If set to true, this is a very basic configuration,
37-
which only prints to the standard output. This is only recommended
38-
for testing or small problems. It is set to True to adhere to the
39-
`specifications of the OpenML client API
40-
<https://github.com/openml/OpenML/wiki/Client-API>`_.
41-
When the openml module is used as a library, it is recommended that
42-
the main application controls the logging level, e.g. see
43-
`here <http://pieces.openpolitics.com
44-
/2012/04/python-logging-best-practices/>`_.
45-
46-
private_directory : str, optional (default=None)
47-
A local directory which can be accessed through the OpenML package.
48-
Useful to access private datasets through the same interface.
49-
50-
Raises
51-
------
52-
ValueError
53-
If apikey is neither specified in the config nor given as an argument.
54-
OpenMLServerError
55-
If the OpenML server returns an unexpected response.
56-
57-
Notes
58-
-----
59-
Testing the API calls in Firefox is possible with the Firefox AddOn
60-
HTTPRequestor.
61-
62-
"""
63-
64-
658
def _perform_api_call(call, data=None, file_dictionary=None,
669
file_elements=None, add_authentication=True):
6710
"""

openml/config.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Stores module level information like the API key, cache directory, private
3+
directory and the server.
4+
"""
15
import os
26
import sys
37
import logging
@@ -21,6 +25,16 @@
2125

2226

2327
def _setup():
28+
"""Setup openml package. Called on first import.
29+
30+
Reads the config file and sets up apikey, server, cache appropriately.
31+
key and server can be set by the user simply using
32+
openml.config.apikey = THEIRKEY
33+
openml.config.server = SOMESERVER
34+
The cache dir needs to be set up by calling set_cache_directory
35+
because it needs some setup.
36+
We could also make it a property but that's less clear.
37+
"""
2438
global apikey
2539
global server
2640
# read config file, create cache directory
@@ -38,6 +52,24 @@ def _setup():
3852

3953

4054
def set_cache_directory(cachedir, privatedir):
55+
"""Set module-wide cache directory.
56+
57+
Sets the cache directory into which to download datasets, tasks etc.
58+
Also sets the private directory for storing local datasets.
59+
60+
Parameters
61+
----------
62+
cachedir : string
63+
Path to use as cache directory.
64+
65+
privatedir : string
66+
Path containing private datasets, tasks, etc.
67+
68+
See also
69+
--------
70+
get_cache_directory
71+
get_private_directory
72+
"""
4173
global _cachedir
4274
global _privatedir
4375
_cachedir = cachedir
@@ -67,6 +99,8 @@ def set_cache_directory(cachedir, privatedir):
6799

68100

69101
def _parse_config():
102+
"""Parse the config file, set up defaults.
103+
"""
70104
defaults = {'apikey': apikey,
71105
'server': server,
72106
'verbosity': 0,
@@ -99,10 +133,34 @@ def _parse_config():
99133

100134

101135
def get_cache_directory():
136+
"""Get the current cache directory.
137+
138+
Returns
139+
-------
140+
cachedir : string
141+
The current cache directory.
142+
143+
See also
144+
--------
145+
set_cache_directory
146+
get_private_directory
147+
"""
102148
return _cachedir
103149

104150

105151
def get_private_directory():
152+
"""Get the current private directory.
153+
154+
Returns
155+
-------
156+
privatedir : string
157+
The current private directory.
158+
159+
See also
160+
--------
161+
set_cache_directory
162+
get_cache_directory
163+
"""
106164
return _privatedir
107165

108166
__all__ = ["set_cache_directory", 'get_cache_directory', 'get_private_directory']

openml/datasets/__init__.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
11
from .functions import (list_datasets, list_datasets_by_tag,
2-
check_datasets_active, get_datasets, get_dataset,
3-
_get_dataset_description,
4-
_get_dataset_features, _get_dataset_qualities)
2+
check_datasets_active, get_datasets, get_dataset)
53
from .dataset import OpenMLDataset
64

75
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
8-
'get_datasets_arf', '_get_dataset_features',
9-
'_get_dataset_qualities', 'OpenMLDataset', 'list_datasets',
10-
'list_datasets_by_tag',
11-
'_get_dataset_description', 'list_datasets']
6+
'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
7+
'list_datasets']

0 commit comments

Comments
 (0)