Skip to content

Commit 7688c84

Browse files
committed
Merge branch 'develop' into update_examples
2 parents c559d11 + 46ec3ab commit 7688c84

4 files changed

Lines changed: 288 additions & 85 deletions

File tree

openml/config.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,54 @@
3838
connection_n_retries = _defaults['connection_n_retries']
3939

4040

41+
class ConfigurationForExamples:
    """ Allows easy switching to and from a test configuration, used for examples.

    The class only holds class-level state: the `server` and `apikey` module globals that
    were active before the switch, and a flag recording whether the switch is in effect.
    """
    # Values of the module-level `server` / `apikey` saved by the last effective `start` call.
    _last_used_server = None
    _last_used_key = None
    # True after `start_using_configuration_for_example`, reset to False by the `stop` method.
    _start_last_called = False
    # Test server endpoint and the apikey used with it for the examples.
    _test_server = "https://test.openml.org/api/v1/xml"
    _test_apikey = "c0c42819af31e706efe1f4b88c23c6c1"

    @classmethod
    def start_using_configuration_for_example(cls):
        """ Sets the configuration to connect to the test server with valid apikey.

        The configuration as it was before this call is stored, and can be recovered
        by using the `stop_using_configuration_for_example` method.
        """
        global server
        global apikey

        if cls._start_last_called and server == cls._test_server and apikey == cls._test_apikey:
            # Method is called more than once in a row without modifying the server or apikey.
            # We don't want to save the current test configuration as a last used configuration.
            return

        cls._last_used_server = server
        cls._last_used_key = apikey
        cls._start_last_called = True

        # Test server key for examples
        server = cls._test_server
        apikey = cls._test_apikey

    @classmethod
    def stop_using_configuration_for_example(cls):
        """ Return to configuration as it was before `start_using_configuration_for_example`.

        Raises
        ------
        RuntimeError
            If no configuration was saved, i.e. `start_using_configuration_for_example`
            was not called since the last call to this method.
        """
        global server
        global apikey

        if not cls._start_last_called:
            # We don't want to allow this because it will (likely) result in the `server` and
            # `apikey` variables being set to None.
            # NOTE(review): the message still uses the old method names; wording kept as-is in
            # case callers or tests match on it. Only the missing space between the two
            # sentences is fixed here.
            raise RuntimeError("`stop_use_example_configuration` called without a saved config. "
                               "`start_use_example_configuration` must be called first.")

        server = cls._last_used_server
        apikey = cls._last_used_key
        cls._start_last_called = False
87+
88+
4189
def _setup():
4290
"""Setup openml package. Called on first import.
4391
@@ -140,8 +188,18 @@ def set_cache_directory(cachedir):
140188
cache_directory = cachedir
141189

142190

191+
start_using_configuration_for_example = (
192+
ConfigurationForExamples.start_using_configuration_for_example
193+
)
194+
stop_using_configuration_for_example = (
195+
ConfigurationForExamples.stop_using_configuration_for_example
196+
)
197+
143198
__all__ = [
144-
'get_cache_directory', 'set_cache_directory'
199+
'get_cache_directory',
200+
'set_cache_directory',
201+
'start_using_configuration_for_example',
202+
'stop_using_configuration_for_example',
145203
]
146204

147205
_setup()

openml/datasets/functions.py

Lines changed: 79 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import io
22
import os
33
import re
4-
from typing import List, Dict, Union
4+
from typing import List, Dict, Union, Optional
55

66
import numpy as np
77
import arff
@@ -247,19 +247,20 @@ def __list_datasets(api_call):
247247

248248
datasets = dict()
249249
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
250-
did = int(dataset_['oml:did'])
251-
dataset = {'did': did,
252-
'name': dataset_['oml:name'],
253-
'format': dataset_['oml:format'],
254-
'status': dataset_['oml:status']}
250+
ignore_attributes = ['oml:file_id', 'oml:quality']
251+
dataset = {k.replace('oml:', ''): v
252+
for (k, v) in dataset_.items()
253+
if k not in ignore_attributes}
254+
dataset['did'] = int(dataset['did'])
255+
dataset['version'] = int(dataset['version'])
255256

256257
# The number of qualities can range from 0 to infinity
257258
for quality in dataset_.get('oml:quality', list()):
258-
quality['#text'] = float(quality['#text'])
259-
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
260-
quality['#text'] = int(quality['#text'])
261-
dataset[quality['@name']] = quality['#text']
262-
datasets[did] = dataset
259+
try:
260+
dataset[quality['@name']] = int(quality['#text'])
261+
except ValueError:
262+
dataset[quality['@name']] = float(quality['#text'])
263+
datasets[dataset['did']] = dataset
263264

264265
return datasets
265266

@@ -298,6 +299,47 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
298299
return active
299300

300301

302+
def _name_to_id(
    dataset_name: str,
    version: Optional[int] = None,
    error_if_multiple: bool = False
) -> int:
    """ Look up the id of the dataset that carries the given name.

    When no version is given only active datasets are considered. When several
    datasets match and ``error_if_multiple`` is ``False``, the one with the
    smallest id (the least recent one) is returned.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset for which to find its id.
    version : int, optional (default=None)
        Version to retrieve. If not specified, the oldest active version is returned.
    error_if_multiple : bool (default=False)
        If `False`, if multiple datasets match, return the least recent active dataset.
        If `True`, if multiple datasets match, raise an error.

    Returns
    -------
    int
        The id of the dataset.

    Raises
    ------
    RuntimeError
        If no dataset matches the name (and version, when one is given).
    ValueError
        If ``error_if_multiple`` is ``True`` and more than one dataset matches.
    """
    # Without an explicit version, restrict the listing to active datasets.
    if version is None:
        status = 'active'
    else:
        status = None
    matches = list_datasets(data_name=dataset_name, status=status, data_version=version)

    if not matches:
        message = "No active datasets exist with name {}".format(dataset_name)
        if version is not None:
            message += " and version {}".format(version)
        raise RuntimeError(message)

    if error_if_multiple and len(matches) > 1:
        raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))

    # Dataset ids are chronological so the smallest id is the least recent dataset
    # (no need to compare versions).
    return min(matches)
341+
342+
301343
def get_datasets(
302344
dataset_ids: List[Union[str, int]],
303345
download_data: bool = True,
@@ -309,7 +351,8 @@ def get_datasets(
309351
Parameters
310352
----------
311353
dataset_ids : iterable
312-
Integers or strings representing dataset ids.
354+
Integers or strings representing dataset ids or dataset names.
355+
If dataset names are specified, the least recent still active dataset version is returned.
313356
download_data : bool, optional
314357
If True, also download the data file. Beware that some datasets are large and it might
315358
make the operation noticeably slower. Metadata is also still retrieved.
@@ -328,13 +371,23 @@ def get_datasets(
328371

329372

330373
@openml.utils.thread_safe_if_oslo_installed
331-
def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> OpenMLDataset:
374+
def get_dataset(
375+
dataset_id: Union[int, str],
376+
download_data: bool = True,
377+
version: int = None,
378+
error_if_multiple: bool = False
379+
) -> OpenMLDataset:
332380
""" Download the OpenML dataset representation, optionally also download actual data file.
333381
334382
This function is thread/multiprocessing safe.
335383
This function uses caching. A check will be performed to determine if the information has
336384
previously been downloaded, and if so be loaded from disk instead of retrieved from the server.
337385
386+
If dataset is retrieved by name, a version may be specified.
387+
If no version is specified and multiple versions of the dataset exist,
388+
the earliest version of the dataset that is still active will be returned.
389+
This scenario will raise an error instead if `error_if_multiple` is `True`.
390+
338391
Parameters
339392
----------
340393
dataset_id : int or str
@@ -344,16 +397,24 @@ def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> Open
344397
make the operation noticeably slower. Metadata is also still retrieved.
345398
If False, create the OpenMLDataset and only populate it with the metadata.
346399
The data may later be retrieved through the `OpenMLDataset.get_data` method.
400+
version : int, optional (default=None)
401+
Specifies the version if `dataset_id` is specified by name.
402+
If no version is specified, retrieve the least recent still active version.
403+
error_if_multiple : bool, optional (default=False)
404+
If `True` raise an error if multiple datasets are found with matching criteria.
347405
348406
Returns
349407
-------
350408
dataset : :class:`openml.OpenMLDataset`
351409
The downloaded dataset."""
352-
try:
353-
dataset_id = int(dataset_id)
354-
except (ValueError, TypeError):
355-
raise ValueError("Dataset ID is neither an Integer nor can be "
356-
"cast to an Integer.")
410+
if isinstance(dataset_id, str):
411+
try:
412+
dataset_id = int(dataset_id)
413+
except ValueError:
414+
dataset_id = _name_to_id(dataset_id, version, error_if_multiple) # type: ignore
415+
elif not isinstance(dataset_id, int):
416+
raise TypeError("`dataset_id` must be one of `str` or `int`, not {}."
417+
.format(type(dataset_id)))
357418

358419
did_cache_dir = _create_cache_directory_for_id(
359420
DATASETS_CACHE_DIR_NAME, dataset_id,

0 commit comments

Comments
 (0)