Skip to content

Commit cd71051

Browse files
committed
allow parallel testing of datasets.functions
1 parent 67880dd commit cd71051

4 files changed

Lines changed: 44 additions & 21 deletions

File tree

openml/config.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,6 @@
1818
cachedir = ""
1919

2020

21-
22-
23-
2421
def _setup():
2522
"""Setup openml package. Called on first import.
2623
@@ -71,7 +68,7 @@ def set_cache_directory(cachedir):
7168
dataset_cache_dir = os.path.join(cachedir, "datasets")
7269
task_cache_dir = os.path.join(cachedir, "tasks")
7370
run_cache_dir = os.path.join(cachedir, 'runs')
74-
71+
lock_dir = os.path.join(cachedir, 'locks')
7572

7673
for dir_ in [cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir]:
7774
if not os.path.exists(dir_) and not os.path.isdir(dir_):

openml/datasets/functions.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import re
55
import shutil
66

7+
from oslo_concurrency import lockutils
78
import xmltodict
89

910
from .dataset import OpenMLDataset
@@ -259,6 +260,8 @@ def get_dataset(dataset_id):
259260
260261
TODO: explain caching!
261262
263+
This function is thread/multiprocessing safe.
264+
262265
Parameters
263266
----------
264267
dataset_id : int
@@ -274,24 +277,32 @@ def get_dataset(dataset_id):
274277
raise ValueError("Dataset ID is neither an Integer nor can be "
275278
"cast to an Integer.")
276279

277-
did_cache_dir = _create_dataset_cache_directory(dataset_id)
278-
279-
try:
280-
description = _get_dataset_description(did_cache_dir, dataset_id)
281-
arff_file = _get_dataset_arff(did_cache_dir, description)
282-
features = _get_dataset_features(did_cache_dir, dataset_id)
283-
# TODO not used yet, figure out what to do with this...
284-
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
285-
except Exception as e:
286-
_remove_dataset_cache_dir(did_cache_dir)
287-
raise e
280+
with lockutils.external_lock(
281+
name='datasets.functions.get_dataset:%d' % dataset_id,
282+
lock_path=os.path.join(config.get_cache_directory(), 'locks'),
283+
):
284+
did_cache_dir = _create_dataset_cache_directory(dataset_id)
288285

289-
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
286+
try:
287+
description = _get_dataset_description(did_cache_dir, dataset_id)
288+
arff_file = _get_dataset_arff(did_cache_dir, description)
289+
features = _get_dataset_features(did_cache_dir, dataset_id)
290+
# TODO not used yet, figure out what to do with this...
291+
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
292+
except Exception as e:
293+
_remove_dataset_cache_dir(did_cache_dir)
294+
raise e
295+
296+
dataset = _create_dataset_from_description(
297+
description, features, qualities, arff_file
298+
)
290299
return dataset
291300

292301

293302
def _get_dataset_description(did_cache_dir, dataset_id):
294-
"""Get the dataset description as xml dictionary
303+
"""Get the dataset description as xml dictionary.
304+
305+
This function is NOT thread/multiprocessing safe.
295306
296307
Parameters
297308
----------
@@ -337,6 +348,8 @@ def _get_dataset_arff(did_cache_dir, description):
337348
Checks if the file is in the cache, if yes, return the path to the file. If
338349
not, downloads the file and caches it, then returns the file path.
339350
351+
This function is NOT thread/multiprocessing safe.
352+
340353
Parameters
341354
----------
342355
did_cache_dir : str
@@ -377,6 +390,8 @@ def _get_dataset_features(did_cache_dir, dataset_id):
377390
Features are feature descriptions for each column.
378391
(name, index, categorical, ...)
379392
393+
This function is NOT thread/multiprocessing safe.
394+
380395
Parameters
381396
----------
382397
did_cache_dir : str
@@ -412,6 +427,8 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
412427
413428
Qualities are metafeatures (number of features, number of classes, ...)
414429
430+
This function is NOT thread/multiprocessing safe.
431+
415432
Parameters
416433
----------
417434
did_cache_dir : str
@@ -449,6 +466,8 @@ def _create_dataset_cache_directory(dataset_id):
449466
is a directory for each dataset with the dataset ID being the directory
450467
name. This function creates this cache directory.
451468
469+
This function is NOT thread/multiprocessing safe.
470+
452471
Parameters
453472
----------
454473
dataset_id : int
@@ -471,6 +490,8 @@ def _create_dataset_cache_directory(dataset_id):
471490
def _remove_dataset_cache_dir(did_cache_dir):
472491
"""Remove the dataset cache directory
473492
493+
This function is NOT thread/multiprocessing safe.
494+
474495
Parameters
475496
----------
476497
"""

openml/testing.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class TestBase(unittest.TestCase):
1919
Hopefully soon allows using a test server, not the production server.
2020
"""
2121

22-
def setUp(self):
22+
def setUp(self, tmp_dir_name=None):
2323
# This cache directory is checked in to git to simulate a populated
2424
# cache
2525
self.maxDiff = None
@@ -36,7 +36,9 @@ def setUp(self):
3636

3737
self.cwd = os.getcwd()
3838
workdir = os.path.dirname(os.path.abspath(__file__))
39-
self.workdir = os.path.join(workdir, "tmp")
39+
if tmp_dir_name is None:
40+
tmp_dir_name = 'tmp'
41+
self.workdir = os.path.join(workdir, tmp_dir_name)
4042
try:
4143
shutil.rmtree(self.workdir)
4244
except:

tests/test_datasets/test_dataset_functions.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727

2828

2929
class TestOpenMLDataset(TestBase):
30+
_multiprocess_can_split_ = True
3031

31-
def setUp(self):
32-
super(TestOpenMLDataset, self).setUp()
32+
def setUp(self, tmp_dir_name=None):
33+
tmp_dir_name = self.id()
34+
print(tmp_dir_name)
35+
super(TestOpenMLDataset, self).setUp(tmp_dir_name=tmp_dir_name)
3336
self._remove_did1()
3437

3538
def tearDown(self):

0 commit comments

Comments
 (0)