Skip to content

Commit a0b65cd

Browse files
authored
Merge branch 'develop' into FIX_upstream_410
2 parents 5d19edb + 58c4218 commit a0b65cd

38 files changed

Lines changed: 2244 additions & 513 deletions

.travis.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ env:
1515
- TEST_DIR=/tmp/test_dir/
1616
- MODULE=openml
1717
matrix:
18-
- DISTRIB="conda" PYTHON_VERSION="2.7" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.21" SKLEARN_VERSION="0.18"
19-
- DISTRIB="conda" PYTHON_VERSION="3.4" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" SKLEARN_VERSION="0.18"
20-
- DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" SKLEARN_VERSION="0.18"
18+
- DISTRIB="conda" PYTHON_VERSION="2.7" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.21" SKLEARN_VERSION="0.18.1"
19+
- DISTRIB="conda" PYTHON_VERSION="3.4" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" SKLEARN_VERSION="0.18.1"
20+
- DISTRIB="conda" PYTHON_VERSION="3.5" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" SKLEARN_VERSION="0.18.1"
21+
- DISTRIB="conda" PYTHON_VERSION="3.6" COVERAGE="true" NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.25.2" SKLEARN_VERSION="0.18.1"
22+
2123
install: source ci_scripts/install.sh
2224
script: bash ci_scripts/test.sh
2325
after_success: source ci_scripts/success.sh

openml/__init__.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,53 @@
1818

1919
from .datasets import OpenMLDataset, OpenMLDataFeature
2020
from . import datasets
21+
from . import tasks
2122
from . import runs
2223
from . import flows
24+
from . import setups
2325
from .runs import OpenMLRun
2426
from .tasks import OpenMLTask, OpenMLSplit
2527
from .flows import OpenMLFlow
2628

29+
__version__ = "0.4.0dev"
30+
31+
32+
def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
                   run_ids=None):
    """
    Fetch the given entities so the OpenML connector can be used offline
    or from parallel workers.

    Each id is passed once to the matching ``get_*`` function of the
    ``tasks``, ``datasets``, ``flows`` and ``runs`` sub-packages (in that
    order); the objects those functions return are discarded.

    Parameters
    ----------
    task_ids : iterable
        Ids handed to ``tasks.functions.get_task``. Skipped when ``None``.

    dataset_ids : iterable
        Ids handed to ``datasets.functions.get_dataset``. Skipped when
        ``None``.

    flow_ids : iterable
        Ids handed to ``flows.functions.get_flow``. Skipped when ``None``.

    run_ids : iterable
        Ids handed to ``runs.functions.get_run``. Skipped when ``None``.

    Returns
    -------
    None
    """
    # The getters are wrapped in lambdas so that the attribute lookup only
    # happens when (and each time) an id of that kind is actually fetched.
    fetch_plan = (
        (task_ids, lambda entity_id: tasks.functions.get_task(entity_id)),
        (dataset_ids,
         lambda entity_id: datasets.functions.get_dataset(entity_id)),
        (flow_ids, lambda entity_id: flows.functions.get_flow(entity_id)),
        (run_ids, lambda entity_id: runs.functions.get_run(entity_id)),
    )

    for entity_ids, fetch in fetch_plan:
        if entity_ids is None:
            continue
        for entity_id in entity_ids:
            fetch(entity_id)
2766

28-
__version__ = "0.2.1"
2967

3068
__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
3169
'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
32-
'config', 'runs', 'flows']
70+
'config', 'runs', 'flows', 'tasks', 'setups']

openml/_api_calls.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import io
22
import os
33
import requests
4-
import arff
54
import warnings
5+
6+
import arff
67
import xmltodict
78

89
from . import config
@@ -51,6 +52,18 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
5152
return _read_url(url, data)
5253

5354

55+
def _file_id_to_url(file_id, filename=None):
    """Return the download URL for a file id.

    The URL starts with the part of ``config.server`` that precedes the
    first ``/api/``, followed by ``/data/download/<file_id>``. When
    ``filename`` is given, ``/<filename>`` is appended as well.
    """
    server_root = config.server.split('/api/')[0]
    # The filename segment is optional; an empty suffix leaves the URL as is.
    filename_suffix = '' if filename is None else '/' + filename
    return server_root + ('/data/download/%s' % file_id) + filename_suffix
65+
66+
5467
def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
5568
"""do a post request to url with data, file content of
5669
file_dictionary and sending file_elements as files"""
@@ -110,7 +123,9 @@ def _parse_server_exception(response):
110123
try:
111124
server_exception = xmltodict.parse(response.text)
112125
except:
113-
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
126+
raise OpenMLServerError(('Unexpected server error. Please '
127+
'contact the developers!\nStatus code: '
128+
'%d\n' % response.status_code) + response.text)
114129

115130
code = int(server_exception['oml:error']['oml:code'])
116131
message = server_exception['oml:error']['oml:message']

openml/config.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
"""
22
Stores module level information like the API key, cache directory and the server.
33
"""
4-
import os
5-
import sys
64
import logging
5+
import os
6+
7+
from six import StringIO
8+
from six.moves import configparser
9+
710

811
logger = logging.getLogger(__name__)
912
logging.basicConfig(
@@ -15,12 +18,7 @@
1518
cachedir = ""
1619

1720

18-
if sys.version_info[0] < 3:
19-
import ConfigParser as configparser
20-
from StringIO import StringIO
21-
else:
22-
import configparser
23-
from io import StringIO
21+
2422

2523

2624
def _setup():

openml/datasets/dataset.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,11 @@
99

1010
import numpy as np
1111
import scipy.sparse
12+
from six.moves import cPickle as pickle
1213
import xmltodict
1314

1415
from .data_feature import OpenMLDataFeature
1516
from ..exceptions import PyOpenMLError
16-
17-
if sys.version_info[0] >= 3:
18-
import pickle
19-
else:
20-
try:
21-
import cPickle as pickle
22-
except:
23-
import pickle
24-
25-
26-
from ..util import is_string
2717
from .._api_calls import _perform_api_call
2818

2919
logger = logging.getLogger(__name__)
@@ -49,7 +39,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
4939
row_id_attribute=None, ignore_attribute=None,
5040
version_label=None, citation=None, tag=None, visibility=None,
5141
original_data_url=None, paper_url=None, update_comment=None,
52-
md5_checksum=None, data_file=None, features=None):
42+
md5_checksum=None, data_file=None, features=None, qualities=None):
5343
# Attributes received by querying the RESTful API
5444
self.dataset_id = int(dataset_id) if dataset_id is not None else None
5545
self.name = name
@@ -84,6 +74,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
8474
self.md5_cheksum = md5_checksum
8575
self.data_file = data_file
8676
self.features = None
77+
self.qualities = None
8778

8879
if features is not None:
8980
self.features = {}
@@ -97,6 +88,12 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
9788
raise ValueError('Data features not provided in right order')
9889
self.features[feature.index] = feature
9990

91+
if qualities is not None:
92+
self.qualities = {}
93+
for idx, xmlquality in enumerate(qualities['oml:quality']):
94+
name = xmlquality['oml:name']
95+
value = xmlquality['oml:value']
96+
self.qualities[name] = value
10097

10198
if data_file is not None:
10299
if self._data_features_supported():
@@ -219,7 +216,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
219216
if not self.row_id_attribute:
220217
pass
221218
else:
222-
if is_string(self.row_id_attribute):
219+
if isinstance(self.row_id_attribute, six.string_types):
223220
to_exclude.append(self.row_id_attribute)
224221
else:
225222
to_exclude.extend(self.row_id_attribute)
@@ -243,7 +240,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
243240
if target is None:
244241
rval.append(data)
245242
else:
246-
if is_string(target):
243+
if isinstance(target, six.string_types):
247244
target = [target]
248245
targets = np.array([True if column in target else False
249246
for column in attribute_names])

openml/datasets/functions.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
from collections import OrderedDict
12
import io
23
import os
34
import re
45
import shutil
5-
from collections import OrderedDict
6+
67
import xmltodict
8+
79
from .dataset import OpenMLDataset
810
from ..exceptions import OpenMLCacheException
911
from .. import config
@@ -73,7 +75,8 @@ def _get_cached_dataset(dataset_id):
7375
description = _get_cached_dataset_description(dataset_id)
7476
arff_file = _get_cached_dataset_arff(dataset_id)
7577
features = _get_cached_dataset_features(dataset_id)
76-
dataset = _create_dataset_from_description(description, features, arff_file)
78+
qualities = _get_cached_dataset_qualities(dataset_id)
79+
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
7780

7881
return dataset
7982

@@ -105,6 +108,19 @@ def _get_cached_dataset_features(dataset_id):
105108
"cached" % dataset_id)
106109

107110

111+
def _get_cached_dataset_qualities(dataset_id):
    """Load the cached qualities document of a dataset.

    Reads ``<cache directory>/datasets/<dataset_id>/qualities.xml`` as UTF-8
    and returns the ``oml:data_qualities`` entry of the document as parsed
    by ``xmltodict``.

    Raises
    ------
    OpenMLCacheException
        If an ``IOError`` or ``OSError`` occurs while opening, reading or
        parsing the file.
    """
    qualities_path = os.path.join(
        config.get_cache_directory(), "datasets", str(dataset_id),
        "qualities.xml")
    try:
        with io.open(qualities_path, encoding='utf8') as qualities_fh:
            parsed = xmltodict.parse(qualities_fh.read())
            return parsed["oml:data_qualities"]
    except (IOError, OSError):
        raise OpenMLCacheException("Dataset qualities for dataset id %d not "
                                   "cached" % dataset_id)
122+
123+
108124
def _get_cached_dataset_arff(dataset_id):
109125
cache_dir = config.get_cache_directory()
110126
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
@@ -270,7 +286,7 @@ def get_dataset(dataset_id):
270286
_remove_dataset_cache_dir(did_cache_dir)
271287
raise e
272288

273-
dataset = _create_dataset_from_description(description, features, arff_file)
289+
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
274290
return dataset
275291

276292

@@ -468,7 +484,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
468484
'Please do this manually!' % did_cache_dir)
469485

470486

471-
def _create_dataset_from_description(description, features, arff_file):
487+
def _create_dataset_from_description(description, features, qualities, arff_file):
472488
"""Create a dataset object from a description dict.
473489
474490
Parameters
@@ -508,5 +524,6 @@ def _create_dataset_from_description(description, features, arff_file):
508524
description.get("oml:update_comment"),
509525
description.get("oml:md5_checksum"),
510526
data_file=arff_file,
511-
features=features)
527+
features=features,
528+
qualities=qualities)
512529
return dataset

openml/exceptions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
class PyOpenMLError(Exception):
    """Exception that keeps its text on a ``message`` attribute.

    The text passed to the constructor is forwarded to ``Exception`` and is
    also stored as ``self.message``.
    """

    def __init__(self, message):
        super(PyOpenMLError, self).__init__(message)
        # Stored explicitly so callers can read ``error.message`` directly.
        self.message = message
45

6+
57
class OpenMLServerError(PyOpenMLError):
68
"""class for when something is really wrong on the server
79
(result did not parse to dict), contains unparsed error."""
810

911
def __init__(self, message):
10-
message = "OpenML Server error: " + message
1112
super(OpenMLServerError, self).__init__(message)
1213

1314
#
@@ -18,7 +19,6 @@ class OpenMLServerException(OpenMLServerError):
1819
def __init__(self, code, message, additional=None):
1920
self.code = code
2021
self.additional = additional
21-
message = "OpenML Server exception: " + message
2222
super(OpenMLServerException, self).__init__(message)
2323

2424

openml/flows/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
from .flow import OpenMLFlow
2-
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
3-
from .functions import get_flow, list_flows, flow_exists
1+
from .flow import OpenMLFlow, _copy_server_fields
2+
3+
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, _check_n_jobs
4+
from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
45

56
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
67
'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']

0 commit comments

Comments
 (0)