44import re
55import shutil
66
7+ from oslo_concurrency import lockutils
78import xmltodict
89
910from .dataset import OpenMLDataset
@@ -259,6 +260,8 @@ def get_dataset(dataset_id):
259260
260261 TODO: explain caching!
261262
263+ This function is thread/multiprocessing safe.
264+
262265 Parameters
263266 ----------
264267 dataset_id : int
@@ -274,24 +277,32 @@ def get_dataset(dataset_id):
274277 raise ValueError ("Dataset ID is neither an Integer nor can be "
275278 "cast to an Integer." )
276279
277- did_cache_dir = _create_dataset_cache_directory (dataset_id )
278-
279- try :
280- description = _get_dataset_description (did_cache_dir , dataset_id )
281- arff_file = _get_dataset_arff (did_cache_dir , description )
282- features = _get_dataset_features (did_cache_dir , dataset_id )
283- # TODO not used yet, figure out what to do with this...
284- qualities = _get_dataset_qualities (did_cache_dir , dataset_id )
285- except Exception as e :
286- _remove_dataset_cache_dir (did_cache_dir )
287- raise e
280+ with lockutils .external_lock (
281+ name = 'datasets.functions.get_dataset:%d' % dataset_id ,
282+ lock_path = os .path .join (config .get_cache_directory (), 'locks' ),
283+ ):
284+ did_cache_dir = _create_dataset_cache_directory (dataset_id )
288285
289- dataset = _create_dataset_from_description (description , features , qualities , arff_file )
286+ try :
287+ description = _get_dataset_description (did_cache_dir , dataset_id )
288+ arff_file = _get_dataset_arff (did_cache_dir , description )
289+ features = _get_dataset_features (did_cache_dir , dataset_id )
290+ # TODO not used yet, figure out what to do with this...
291+ qualities = _get_dataset_qualities (did_cache_dir , dataset_id )
292+ except Exception as e :
293+ _remove_dataset_cache_dir (did_cache_dir )
294+ raise e
295+
296+ dataset = _create_dataset_from_description (
297+ description , features , qualities , arff_file
298+ )
290299 return dataset
291300
292301
293302def _get_dataset_description (did_cache_dir , dataset_id ):
294- """Get the dataset description as xml dictionary
303+ """Get the dataset description as xml dictionary.
304+
305+ This function is NOT thread/multiprocessing safe.
295306
296307 Parameters
297308 ----------
@@ -337,6 +348,8 @@ def _get_dataset_arff(did_cache_dir, description):
337348 Checks if the file is in the cache, if yes, return the path to the file. If
338349 not, downloads the file and caches it, then returns the file path.
339350
351+ This function is NOT thread/multiprocessing safe.
352+
340353 Parameters
341354 ----------
342355 did_cache_dir : str
@@ -377,6 +390,8 @@ def _get_dataset_features(did_cache_dir, dataset_id):
377390 Features are feature descriptions for each column.
378391 (name, index, categorical, ...)
379392
393+ This function is NOT thread/multiprocessing safe.
394+
380395 Parameters
381396 ----------
382397 did_cache_dir : str
@@ -412,6 +427,8 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
412427
413428 Qualities are metafeatures (number of features, number of classes, ...)
414429
430+ This function is NOT thread/multiprocessing safe.
431+
415432 Parameters
416433 ----------
417434 did_cache_dir : str
@@ -449,6 +466,8 @@ def _create_dataset_cache_directory(dataset_id):
449466 is a directory for each dataset with the dataset ID being the directory
450467 name. This function creates this cache directory.
451468
469+ This function is NOT thread/multiprocessing safe.
470+
452471 Parameters
453472 ----------
454473 dataset_id : int
@@ -471,6 +490,8 @@ def _create_dataset_cache_directory(dataset_id):
471490def _remove_dataset_cache_dir (did_cache_dir ):
472491 """Remove the dataset cache directory
473492
493+ This function is NOT thread/multiprocessing safe.
494+
474495 Parameters
475496 ----------
476497 """
0 commit comments