11import io
22import os
33import re
4- from typing import List , Dict , Union
4+ from typing import List , Dict , Union , Optional
55
66import numpy as np
77import arff
@@ -247,19 +247,20 @@ def __list_datasets(api_call):
247247
248248 datasets = dict ()
249249 for dataset_ in datasets_dict ['oml:data' ]['oml:dataset' ]:
250- did = int (dataset_ ['oml:did' ])
251- dataset = {'did' : did ,
252- 'name' : dataset_ ['oml:name' ],
253- 'format' : dataset_ ['oml:format' ],
254- 'status' : dataset_ ['oml:status' ]}
250+ ignore_attributes = ['oml:file_id' , 'oml:quality' ]
251+ dataset = {k .replace ('oml:' , '' ): v
252+ for (k , v ) in dataset_ .items ()
253+ if k not in ignore_attributes }
254+ dataset ['did' ] = int (dataset ['did' ])
255+ dataset ['version' ] = int (dataset ['version' ])
255256
256257 # The number of qualities can range from 0 to infinity
257258 for quality in dataset_ .get ('oml:quality' , list ()):
258- quality [ '#text' ] = float ( quality [ '#text' ])
259- if abs ( int ( quality ['#text' ]) - quality ['#text' ]) < 0.0000001 :
260- quality [ '#text' ] = int ( quality [ '#text' ])
261- dataset [quality ['@name' ]] = quality ['#text' ]
262- datasets [did ] = dataset
259+ try :
260+ dataset [ quality ['@name' ]] = int ( quality ['#text' ])
261+ except ValueError :
262+ dataset [quality ['@name' ]] = float ( quality ['#text' ])
263+ datasets [dataset [ ' did' ] ] = dataset
263264
264265 return datasets
265266
@@ -298,6 +299,47 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
298299 return active
299300
300301
def _name_to_id(
    dataset_name: str,
    version: Optional[int] = None,
    error_if_multiple: bool = False
) -> int:
    """ Resolve a dataset name to a dataset id.

    The lookup is delegated to ``list_datasets``. When no ``version`` is given,
    only datasets with status ``'active'`` are requested. When a ``version`` is
    given, no status filter is applied and the version is forwarded instead.

    If several datasets match and ``error_if_multiple`` is ``False``, the
    smallest matching id is returned, i.e. the least recent matching dataset.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset for which to find its id.
    version : int, optional (default=None)
        Version to look up. If not specified, the oldest active version is returned.
    error_if_multiple : bool (default=False)
        If `False`, if multiple datasets match, return the least recent one.
        If `True`, if multiple datasets match, raise an error.

    Returns
    -------
    int
        The id of the dataset.

    Raises
    ------
    ValueError
        If ``error_if_multiple`` is ``True`` and more than one dataset matches.
    RuntimeError
        If no dataset matches the given name (and version, if specified).
    """
    # Restrict the query to active datasets only when no explicit version is requested.
    if version is None:
        status_filter = 'active'
    else:
        status_filter = None

    matches = list_datasets(
        data_name=dataset_name,
        status=status_filter,
        data_version=version,
    )
    n_matches = len(matches)

    if n_matches > 1 and error_if_multiple:
        raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))

    if n_matches == 0:
        message = "No active datasets exist with name {}".format(dataset_name)
        if version is not None:
            message += " and version {}".format(version)
        raise RuntimeError(message)

    # Dataset ids are chronological, so the smallest id is the oldest match
    # (no need to compare versions).
    return min(matches)
341+
342+
301343def get_datasets (
302344 dataset_ids : List [Union [str , int ]],
303345 download_data : bool = True ,
@@ -309,7 +351,8 @@ def get_datasets(
309351 Parameters
310352 ----------
311353 dataset_ids : iterable
312- Integers or strings representing dataset ids.
354+ Integers or strings representing dataset ids or dataset names.
355+ If dataset names are specified, the least recent still active dataset version is returned.
313356 download_data : bool, optional
314357 If True, also download the data file. Beware that some datasets are large and it might
315358 make the operation noticeably slower. Metadata is also still retrieved.
@@ -328,13 +371,23 @@ def get_datasets(
328371
329372
330373@openml .utils .thread_safe_if_oslo_installed
331- def get_dataset (dataset_id : Union [int , str ], download_data : bool = True ) -> OpenMLDataset :
374+ def get_dataset (
375+ dataset_id : Union [int , str ],
376+ download_data : bool = True ,
377+ version : int = None ,
378+ error_if_multiple : bool = False
379+ ) -> OpenMLDataset :
332380 """ Download the OpenML dataset representation, optionally also download actual data file.
333381
334382 This function is thread/multiprocessing safe.
335383 This function uses caching. A check will be performed to determine if the information has
336384 previously been downloaded, and if so be loaded from disk instead of retrieved from the server.
337385
386+ If dataset is retrieved by name, a version may be specified.
387+ If no version is specified and multiple versions of the dataset exist,
388+ the earliest version of the dataset that is still active will be returned.
389+ This scenario will raise an error instead if `error_if_multiple` is `True`.
390+
338391 Parameters
339392 ----------
340393 dataset_id : int or str
@@ -344,16 +397,24 @@ def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> Open
344397 make the operation noticeably slower. Metadata is also still retrieved.
345398 If False, create the OpenMLDataset and only populate it with the metadata.
346399 The data may later be retrieved through the `OpenMLDataset.get_data` method.
400+ version : int, optional (default=None)
401+ Specifies the version if `dataset_id` is specified by name.
402+ If no version is specified, retrieve the least recent still active version.
403+ error_if_multiple : bool, optional (default=False)
404+ If `True` raise an error if multiple datasets are found with matching criteria.
347405
348406 Returns
349407 -------
350408 dataset : :class:`openml.OpenMLDataset`
351409 The downloaded dataset."""
352- try :
353- dataset_id = int (dataset_id )
354- except (ValueError , TypeError ):
355- raise ValueError ("Dataset ID is neither an Integer nor can be "
356- "cast to an Integer." )
410+ if isinstance (dataset_id , str ):
411+ try :
412+ dataset_id = int (dataset_id )
413+ except ValueError :
414+ dataset_id = _name_to_id (dataset_id , version , error_if_multiple ) # type: ignore
415+ elif not isinstance (dataset_id , int ):
416+ raise TypeError ("`dataset_id` must be one of `str` or `int`, not {}."
417+ .format (type (dataset_id )))
357418
358419 did_cache_dir = _create_cache_directory_for_id (
359420 DATASETS_CACHE_DIR_NAME , dataset_id ,
0 commit comments