Skip to content

Commit e1ecfe9

Browse files
varshneydevansh, pre-commit-ci[bot], PGijsbers, LennartPurucker
authored
fix: carefully replaced minio_url with parquet_url (#1280)
* carefully replaced minio with parquet
* fix: corrected some mistakes
* fix: restored the instances of minio
* fix: updated the documentation
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* Add #1280

  I used a `next` header instead of a specific version since we don't know if it will be 0.15.0 or 0.14.2. We can change it before the next release.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Pieter Gijsbers <p.gijsbers@tue.nl>
Co-authored-by: Lennart Purucker <lennart.purucker@uni-siegen.de>
1 parent 7e69d04 commit e1ecfe9

4 files changed

Lines changed: 24 additions & 17 deletions

File tree

doc/progress.rst

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,11 @@
66
Changelog
77
=========
88

9+
next
10+
~~~~~~
11+
12+
* MAINT #1280: Use the server-provided ``parquet_url`` instead of ``minio_url`` to determine the location of the parquet file.
13+
914
0.14.1
1015
~~~~~~
1116

openml/datasets/dataset.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -96,10 +96,12 @@ class OpenMLDataset(OpenMLBase):
9696
which maps a quality name to a quality value.
9797
dataset: string, optional
9898
Serialized arff dataset string.
99-
minio_url: string, optional
100-
URL to the MinIO bucket with dataset files
99+
parquet_url: string, optional
100+
This is the URL to the storage location where the dataset files are hosted.
101+
This can be a MinIO bucket URL. If specified, the data will be accessed
102+
from this URL when reading the files.
101103
parquet_file: string, optional
102-
Path to the local parquet file.
104+
Path to the local file.
103105
"""
104106

105107
def __init__(
@@ -132,7 +134,7 @@ def __init__(
132134
features_file: Optional[str] = None,
133135
qualities_file: Optional[str] = None,
134136
dataset=None,
135-
minio_url: Optional[str] = None,
137+
parquet_url: Optional[str] = None,
136138
parquet_file: Optional[str] = None,
137139
):
138140
def find_invalid_characters(string, pattern):
@@ -210,7 +212,7 @@ def find_invalid_characters(string, pattern):
210212
self.data_file = data_file
211213
self.parquet_file = parquet_file
212214
self._dataset = dataset
213-
self._minio_url = minio_url
215+
self._parquet_url = parquet_url
214216

215217
self._features = None # type: Optional[Dict[int, OpenMLDataFeature]]
216218
self._qualities = None # type: Optional[Dict[str, float]]
@@ -329,7 +331,7 @@ def _download_data(self) -> None:
329331
from .functions import _get_dataset_arff, _get_dataset_parquet
330332

331333
self.data_file = _get_dataset_arff(self)
332-
if self._minio_url is not None:
334+
if self._parquet_url is not None:
333335
self.parquet_file = _get_dataset_parquet(self)
334336

335337
def _get_arff(self, format: str) -> Dict:

openml/datasets/functions.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -495,7 +495,7 @@ def get_dataset(
495495
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
496496

497497
arff_file = _get_dataset_arff(description) if download_data else None
498-
if "oml:minio_url" in description and download_data:
498+
if "oml:parquet_url" in description and download_data:
499499
try:
500500
parquet_file = _get_dataset_parquet(
501501
description, download_all_files=download_all_files
@@ -1062,18 +1062,18 @@ def _get_dataset_parquet(
10621062
10631063
download_all_files: bool, optional (default=False)
10641064
If `True`, download all data found in the bucket to which the description's
1065-
``minio_url`` points, only download the parquet file otherwise.
1065+
``parquet_url`` points, only download the parquet file otherwise.
10661066
10671067
Returns
10681068
-------
10691069
output_filename : string, optional
10701070
Location of the Parquet file if successfully downloaded, None otherwise.
10711071
"""
10721072
if isinstance(description, dict):
1073-
url = cast(str, description.get("oml:minio_url"))
1073+
url = cast(str, description.get("oml:parquet_url"))
10741074
did = description.get("oml:id")
10751075
elif isinstance(description, OpenMLDataset):
1076-
url = cast(str, description._minio_url)
1076+
url = cast(str, description._parquet_url)
10771077
did = description.dataset_id
10781078
else:
10791079
raise TypeError("`description` should be either OpenMLDataset or Dict.")
@@ -1316,7 +1316,7 @@ def _create_dataset_from_description(
13161316
cache_format=cache_format,
13171317
features_file=features_file,
13181318
qualities_file=qualities_file,
1319-
minio_url=description.get("oml:minio_url"),
1319+
parquet_url=description.get("oml:parquet_url"),
13201320
parquet_file=parquet_file,
13211321
)
13221322

tests/test_datasets/test_dataset_functions.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -439,7 +439,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
439439

440440
def test__get_dataset_parquet_not_cached(self):
441441
description = {
442-
"oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
442+
"oml:parquet_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
443443
"oml:id": "20",
444444
}
445445
path = _get_dataset_parquet(description, cache_directory=self.workdir)
@@ -450,10 +450,10 @@ def test__get_dataset_parquet_not_cached(self):
450450
def test__get_dataset_parquet_is_cached(self, patch):
451451
openml.config.set_root_cache_directory(self.static_cache_dir)
452452
patch.side_effect = RuntimeError(
453-
"_download_minio_file should not be called when loading from cache"
453+
"_download_parquet_url should not be called when loading from cache"
454454
)
455455
description = {
456-
"oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
456+
"oml:parquet_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
457457
"oml:id": "30",
458458
}
459459
path = _get_dataset_parquet(description, cache_directory=None)
@@ -462,7 +462,7 @@ def test__get_dataset_parquet_is_cached(self, patch):
462462

463463
def test__get_dataset_parquet_file_does_not_exist(self):
464464
description = {
465-
"oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
465+
"oml:parquet_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
466466
"oml:id": "20",
467467
}
468468
path = _get_dataset_parquet(description, cache_directory=self.workdir)
@@ -1416,7 +1416,7 @@ def test_get_dataset_cache_format_feather(self):
14161416
# The parquet file on minio with ID 128 is not the iris dataset from the test server.
14171417
dataset = openml.datasets.get_dataset(128, cache_format="feather")
14181418
# Workaround
1419-
dataset._minio_url = None
1419+
dataset._parquet_url = None
14201420
dataset.parquet_file = None
14211421
dataset.get_data()
14221422

@@ -1561,7 +1561,7 @@ def test_get_dataset_parquet(self):
15611561
# There is no parquet-copy of the test server yet.
15621562
openml.config.server = self.production_server
15631563
dataset = openml.datasets.get_dataset(61)
1564-
self.assertIsNotNone(dataset._minio_url)
1564+
self.assertIsNotNone(dataset._parquet_url)
15651565
self.assertIsNotNone(dataset.parquet_file)
15661566
self.assertTrue(os.path.isfile(dataset.parquet_file))
15671567

0 commit comments

Comments
 (0)