Skip to content

Commit 68f51a9

Browse files
authored
Allow tasks to be downloaded without dataqualities (#1086)
* Allow tasks to be downloaded without data qualities. Previously, ``download_qualities`` would be left at the default of True with no way to override it. * Deprecate the use of strings for identifying tasks.
1 parent 0b786e4 commit 68f51a9

3 files changed

Lines changed: 30 additions & 16 deletions

File tree

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Changelog
1111

1212
* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number of times to attempt to retry server requests.
1313
* ADD #1075: A docker image is now automatically built on a push to develop. It can be used to build docs or run tests in an isolated environment.
14+
* ADD: You can now avoid downloading 'qualities' meta-data when downloading a task with the ``download_qualities`` parameter of ``openml.tasks.get_task[s]`` functions.
1415
* DOC: Fixes a few broken links in the documentation.
1516
* MAINT: Rename `master` branch to `main` branch.
1617
* MAINT/DOC: Automatically check for broken external links when building the documentation.

openml/datasets/functions.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -370,20 +370,22 @@ def get_dataset(
370370
----------
371371
dataset_id : int or str
372372
Dataset ID of the dataset to download
373-
download_data : bool, optional (default=True)
373+
download_data : bool (default=True)
374374
If True, also download the data file. Beware that some datasets are large and it might
375375
make the operation noticeably slower. Metadata is also still retrieved.
376376
If False, create the OpenMLDataset and only populate it with the metadata.
377377
The data may later be retrieved through the `OpenMLDataset.get_data` method.
378378
version : int, optional (default=None)
379379
Specifies the version if `dataset_id` is specified by name.
380380
If no version is specified, retrieve the least recent still active version.
381-
error_if_multiple : bool, optional (default=False)
381+
error_if_multiple : bool (default=False)
382382
If ``True`` raise an error if multiple datasets are found with matching criteria.
383-
cache_format : str, optional (default='pickle')
383+
cache_format : str (default='pickle')
384384
Format for caching the dataset - may be feather or pickle
385385
Note that the default 'pickle' option may load slower than feather when
386386
no.of.rows is very high.
387+
download_qualities : bool (default=True)
388+
Option to download 'qualities' meta-data in addition to the minimal dataset description.
387389
Returns
388390
-------
389391
dataset : :class:`openml.OpenMLDataset`

openml/tasks/functions.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# License: BSD 3-Clause
2-
2+
import warnings
33
from collections import OrderedDict
44
import io
55
import re
66
import os
7-
from typing import Union, Dict, Optional
7+
from typing import Union, Dict, Optional, List
88

99
import pandas as pd
1010
import xmltodict
@@ -297,30 +297,36 @@ def __list_tasks(api_call, output_format="dict"):
297297
return tasks
298298

299299

300-
def get_tasks(task_ids, download_data=True):
300+
def get_tasks(
301+
task_ids: List[int], download_data: bool = True, download_qualities: bool = True
302+
) -> List[OpenMLTask]:
301303
"""Download tasks.
302304
303305
This function iterates :meth:`openml.tasks.get_task`.
304306
305307
Parameters
306308
----------
307-
task_ids : iterable
308-
Integers/Strings representing task ids.
309-
download_data : bool
309+
task_ids : List[int]
310+
A list of task ids to download.
311+
download_data : bool (default=True)
310312
Option to trigger download of data along with the meta data.
313+
download_qualities : bool (default=True)
314+
Option to download 'qualities' meta-data in addition to the minimal dataset description.
311315
312316
Returns
313317
-------
314318
list
315319
"""
316320
tasks = []
317321
for task_id in task_ids:
318-
tasks.append(get_task(task_id, download_data))
322+
tasks.append(get_task(task_id, download_data, download_qualities))
319323
return tasks
320324

321325

322326
@openml.utils.thread_safe_if_oslo_installed
323-
def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
327+
def get_task(
328+
task_id: int, download_data: bool = True, download_qualities: bool = True
329+
) -> OpenMLTask:
324330
"""Download OpenML task for a given task ID.
325331
326332
Downloads the task representation, while the data splits can be
@@ -329,25 +335,30 @@ def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
329335
330336
Parameters
331337
----------
332-
task_id : int or str
333-
The OpenML task id.
334-
download_data : bool
338+
task_id : int
339+
The OpenML task id of the task to download.
340+
download_data : bool (default=True)
335341
Option to trigger download of data along with the meta data.
342+
download_qualities : bool (default=True)
343+
Option to download 'qualities' meta-data in addition to the minimal dataset description.
336344
337345
Returns
338346
-------
339347
task
340348
"""
349+
if not isinstance(task_id, int):
350+
warnings.warn("Task id must be specified as `int` from 0.14.0 onwards.", DeprecationWarning)
351+
341352
try:
342353
task_id = int(task_id)
343354
except (ValueError, TypeError):
344-
raise ValueError("Dataset ID is neither an Integer nor can be " "cast to an Integer.")
355+
raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.")
345356

346357
tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id,)
347358

348359
try:
349360
task = _get_task_description(task_id)
350-
dataset = get_dataset(task.dataset_id, download_data)
361+
dataset = get_dataset(task.dataset_id, download_data, download_qualities=download_qualities)
351362
# List of class labels available in dataset description
352363
# Including class labels as part of task meta data handles
353364
# the case where data download was initially disabled

0 commit comments

Comments
 (0)