@@ -219,70 +219,120 @@ def test_check_datasets_active(self):
219219 )
220220 openml .config .server = self .test_server
221221
def _datasets_retrieved_successfully(self, dids, metadata_only=True):
    """ Checks that all files for the given dids have been downloaded.

    This includes:
        - description
        - qualities
        - features
        - absence of data arff if metadata_only, else it must be present too.

    Parameters
    ----------
    dids : iterable
        Dataset ids; each one is converted with ``str`` to build the name of
        its directory below ``<cache directory>/datasets``.
    metadata_only : bool, optional (default=True)
        If True, ``dataset.arff`` must NOT exist for any of the dids,
        otherwise it must exist for all of them.
    """
    for did in dids:
        # All files of one dataset live in the same directory: build it once.
        dataset_dir = os.path.join(
            openml.config.get_cache_directory(), "datasets", str(did))

        # Same check order as before: description, qualities, features.
        for filename in ("description.xml", "qualities.xml", "features.xml"):
            path = os.path.join(dataset_dir, filename)
            self.assertTrue(os.path.exists(path),
                            msg="Expected cached file is missing: %s" % path)

        arff_path = os.path.join(dataset_dir, "dataset.arff")
        if metadata_only:
            self.assertFalse(os.path.exists(arff_path),
                             msg="Data file should not be cached: %s" % arff_path)
        else:
            self.assertTrue(os.path.exists(arff_path),
                            msg="Expected cached file is missing: %s" % arff_path)
242+
def test__name_to_id_with_deactivated(self):
    """ Check that an active dataset is returned if an earlier deactivated one exists. """
    openml.config.server = self.production_server
    try:
        # /d/1 was deactivated
        self.assertEqual(openml.datasets.functions._name_to_id('anneal'), 2)
    finally:
        # Switch back to the test server even if the assertion above fails,
        # so the production server setting cannot outlive this test.
        openml.config.server = self.test_server
249+
def test__name_to_id_with_multiple_active(self):
    """ When several active datasets share a name, the least recent active one is returned. """
    dataset_id = openml.datasets.functions._name_to_id('iris')
    self.assertEqual(dataset_id, 128)
253+
def test__name_to_id_with_version(self):
    """ With an explicit version, retrieve the id of the dataset with that version. """
    dataset_id = openml.datasets.functions._name_to_id('iris', version=3)
    self.assertEqual(dataset_id, 151)
257+
def test__name_to_id_with_multiple_active_error(self):
    """ With multiple active datasets and error_if_multiple=True, a ValueError is raised. """
    with self.assertRaisesRegex(
        ValueError,
        "Multiple active datasets exist with name iris",
    ):
        openml.datasets.functions._name_to_id(
            dataset_name='iris',
            error_if_multiple=True,
        )
267+
def test__name_to_id_name_does_not_exist(self):
    """ A name for which no active dataset exists raises a RuntimeError. """
    with self.assertRaisesRegex(
        RuntimeError,
        "No active datasets exist with name does_not_exist",
    ):
        openml.datasets.functions._name_to_id(dataset_name='does_not_exist')
276+
def test__name_to_id_version_does_not_exist(self):
    """ An existing name combined with a version that does not exist raises a RuntimeError. """
    with self.assertRaisesRegex(
        RuntimeError,
        "No active datasets exist with name iris and version 100000",
    ):
        openml.datasets.functions._name_to_id(
            dataset_name='iris',
            version=100000,
        )
286+
def test_get_datasets_by_name(self):
    """ Fetch two datasets by name without their data and check the cached metadata files. """
    # did 1 and 2 on the test server:
    names = ['anneal', 'kr-vs-kp']
    retrieved = openml.datasets.get_datasets(names, download_data=False)
    self.assertEqual(len(retrieved), 2)
    self._datasets_retrieved_successfully([1, 2], metadata_only=True)
293+
def test_get_datasets_by_mixed(self):
    """ Fetch two datasets, one by name and one by id, without their data. """
    # did 1 and 2 on the test server:
    identifiers = ['anneal', 2]
    retrieved = openml.datasets.get_datasets(identifiers, download_data=False)
    self.assertEqual(len(retrieved), 2)
    self._datasets_retrieved_successfully([1, 2], metadata_only=True)
300+
def test_get_datasets(self):
    """ Fetch datasets 1 and 2 with default arguments; metadata and data files must be cached. """
    dataset_ids = [1, 2]
    retrieved = openml.datasets.get_datasets(dataset_ids)
    self.assertEqual(len(retrieved), 2)
    self._datasets_retrieved_successfully([1, 2], metadata_only=False)
242306
def test_get_datasets_lazy(self):
    """ Fetch datasets 1 and 2 without data, then check get_data() caches the arff files. """
    dataset_ids = [1, 2]
    retrieved = openml.datasets.get_datasets(dataset_ids, download_data=False)
    self.assertEqual(len(retrieved), 2)
    # Right after the lazy fetch only the metadata files may be present.
    self._datasets_retrieved_successfully([1, 2], metadata_only=True)

    # Request the data of both datasets, in order, before re-checking the cache.
    for index in (0, 1):
        retrieved[index].get_data()
    self._datasets_retrieved_successfully([1, 2], metadata_only=False)
316+
def test_get_dataset_by_name(self):
    """ Fetch 'anneal' by name and check its id, cached files, features and qualities. """
    anneal = openml.datasets.get_dataset('anneal')
    self.assertEqual(type(anneal), OpenMLDataset)
    self.assertEqual(anneal.dataset_id, 1)
    self._datasets_retrieved_successfully([1], metadata_only=False)

    self.assertGreater(len(anneal.features), 1)
    self.assertGreater(len(anneal.qualities), 4)

    # Issue324 Properly handle private datasets when trying to access them
    openml.config.server = self.production_server
    with self.assertRaises(OpenMLPrivateDatasetError):
        openml.datasets.get_dataset(45)
272329
273330 def test_get_dataset (self ):
274331 # This is the only non-lazy load to ensure default behaviour works.
275332 dataset = openml .datasets .get_dataset (1 )
276333 self .assertEqual (type (dataset ), OpenMLDataset )
277334 self .assertEqual (dataset .name , 'anneal' )
278- self .assertTrue (os .path .exists (os .path .join (
279- openml .config .get_cache_directory (), "datasets" , "1" , "description.xml" )))
280- self .assertTrue (os .path .exists (os .path .join (
281- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
282- self .assertTrue (os .path .exists (os .path .join (
283- openml .config .get_cache_directory (), "datasets" , "1" , "features.xml" )))
284- self .assertTrue (os .path .exists (os .path .join (
285- openml .config .get_cache_directory (), "datasets" , "1" , "qualities.xml" )))
335+ self ._datasets_retrieved_successfully ([1 ], metadata_only = False )
286336
287337 self .assertGreater (len (dataset .features ), 1 )
288338 self .assertGreater (len (dataset .qualities ), 4 )
@@ -295,22 +345,13 @@ def test_get_dataset_lazy(self):
295345 dataset = openml .datasets .get_dataset (1 , download_data = False )
296346 self .assertEqual (type (dataset ), OpenMLDataset )
297347 self .assertEqual (dataset .name , 'anneal' )
298- self .assertTrue (os .path .exists (os .path .join (
299- openml .config .get_cache_directory (), "datasets" , "1" , "description.xml" )))
300- self .assertTrue (os .path .exists (os .path .join (
301- openml .config .get_cache_directory (), "datasets" , "1" , "features.xml" )))
302- self .assertTrue (os .path .exists (os .path .join (
303- openml .config .get_cache_directory (), "datasets" , "1" , "qualities.xml" )))
304-
305- self .assertFalse (os .path .exists (os .path .join (
306- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
348+ self ._datasets_retrieved_successfully ([1 ], metadata_only = True )
307349
308350 self .assertGreater (len (dataset .features ), 1 )
309351 self .assertGreater (len (dataset .qualities ), 4 )
310352
311353 dataset .get_data ()
312- self .assertTrue (os .path .exists (os .path .join (
313- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
354+ self ._datasets_retrieved_successfully ([1 ], metadata_only = False )
314355
315356 # Issue324 Properly handle private datasets when trying to access them
316357 openml .config .server = self .production_server
@@ -321,27 +362,26 @@ def test_get_dataset_lazy_all_functions(self):
321362 dataset = openml .datasets .get_dataset (1 , download_data = False )
322363 # We only tests functions as general integrity is tested by test_get_dataset_lazy
323364
365+ def ensure_absence_of_real_data ():
366+ self .assertFalse (os .path .exists (os .path .join (
367+ openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
368+
324369 tag = 'test_lazy_tag_%d' % random .randint (1 , 1000000 )
325370 dataset .push_tag (tag )
326- self .assertFalse (os .path .exists (os .path .join (
327- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
371+ ensure_absence_of_real_data ()
328372
329373 dataset .remove_tag (tag )
330- self .assertFalse (os .path .exists (os .path .join (
331- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
374+ ensure_absence_of_real_data ()
332375
333376 nominal_indices = dataset .get_features_by_type ('nominal' )
334- self .assertFalse (os .path .exists (os .path .join (
335- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
336377 correct = [0 , 1 , 2 , 5 , 6 , 7 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 ,
337378 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 35 , 36 , 37 , 38 ]
338379 self .assertEqual (nominal_indices , correct )
380+ ensure_absence_of_real_data ()
339381
340382 classes = dataset .retrieve_class_labels ()
341383 self .assertEqual (classes , ['1' , '2' , '3' , '4' , '5' , 'U' ])
342-
343- self .assertFalse (os .path .exists (os .path .join (
344- openml .config .get_cache_directory (), "datasets" , "1" , "dataset.arff" )))
384+ ensure_absence_of_real_data ()
345385
346386 def test_get_dataset_sparse (self ):
347387 dataset = openml .datasets .get_dataset (102 , download_data = False )
0 commit comments