@@ -54,6 +54,10 @@ class AuthentificationError(PyOpenMLError):
5454 def __init__ (self , message ):
5555 super (AuthentificationError , self ).__init__ (message )
5656
class OpenMLCacheException(PyOpenMLError):
    """Error signalling that a requested object is not in the local cache.

    Parameters
    ----------
    message : str
        Human-readable description, forwarded to ``PyOpenMLError``.
    """

    def __init__(self, message):
        # Explicit two-argument super() keeps the Python 2 compatible style
        # used by the other exception classes in this module.
        super(OpenMLCacheException, self).__init__(message)
60+
5761
5862class APIConnector (object ):
5963 """
@@ -99,6 +103,10 @@ class APIConnector(object):
99103 `here <http://pieces.openpolitics.com
100104 /2012/04/python-logging-best-practices/>`_.
101105
106+ private_directory : str, optional (default=None)
107+ A local directory which can be accessed through the OpenML package.
108+ Useful to access private datasets through the same interface.
109+
102110 Raises
103111 ------
104112 ValueError
@@ -119,7 +127,7 @@ class APIConnector(object):
119127 """
120128 def __init__ (self , cache_directory = None , username = None , password = None ,
121129 server = None , verbosity = None , configure_logger = True ,
122- authenticate = True ):
130+ authenticate = True , private_directory = None ):
123131 # The .openml directory is necessary, just try to create it (EAFP)
124132 try :
125133 os .mkdir (os .path .expanduser ('~/.openml' ))
@@ -139,6 +147,8 @@ def __init__(self, cache_directory=None, username=None, password=None,
139147 self .config .set ('FAKE_SECTION' , 'server' , server )
140148 if verbosity is not None :
141149 self .config .set ('FAKE_SECTION' , 'verbosity' , verbosity )
150+ if private_directory is not None :
151+ self .config .set ('FAKE_SECTION' , 'private_directory' , private_directory )
142152
143153 if configure_logger :
144154 verbosity = self .config .getint ('FAKE_SECTION' , 'verbosity' )
@@ -162,8 +172,18 @@ def __init__(self, cache_directory=None, username=None, password=None,
162172 self .dataset_cache_dir = os .path .join (self .cache_dir , "datasets" )
163173 self .task_cache_dir = os .path .join (self .cache_dir , "tasks" )
164174
175+ # Set up the private directory
176+ self .private_directory = self .config .get ('FAKE_SECTION' ,
177+ 'private_directory' )
178+ self ._private_directory_datasets = os .path .join (
179+ self .private_directory , "datasets" )
180+ self ._private_directory_tasks = os .path .join (
181+ self .private_directory , "tasks" )
182+
165183 for dir_ in [self .cache_dir , self .dataset_cache_dir ,
166- self .task_cache_dir ]:
184+ self .task_cache_dir , self .private_directory ,
185+ self ._private_directory_datasets ,
186+ self ._private_directory_tasks ]:
167187 if not os .path .exists (dir_ ) and not os .path .isdir (dir_ ):
168188 os .mkdir (dir_ )
169189
@@ -213,7 +233,8 @@ def _parse_config(self):
213233 'password' : '' ,
214234 'server' : OPENML_URL ,
215235 'verbosity' : 0 ,
216- 'cachedir' : os .path .expanduser ('~/.openml/cache' )}
236+ 'cachedir' : os .path .expanduser ('~/.openml/cache' ),
237+ 'private_directory' : os .path .expanduser ('~/.openml/private' )}
217238
218239 config_file = os .path .expanduser ('~/.openml/config' )
219240 config = configparser .RawConfigParser (defaults = defaults )
@@ -243,26 +264,30 @@ def _parse_config(self):
243264 # Local getters/accessors to the cache directory
def get_list_of_cached_datasets(self):
    """Return a sorted list with the ids of all cached datasets.

    Both ``self.dataset_cache_dir`` and ``self._private_directory_datasets``
    are searched. A dataset counts as cached when its directory (named
    after the numeric dataset id) contains both ``dataset.arff`` and
    ``description.xml``.

    Returns
    -------
    list of int
        Dataset ids in ascending order; an id that is present in both
        directories is reported once.
    """
    datasets = set()

    for dataset_cache_dir in [self.dataset_cache_dir,
                              self._private_directory_datasets]:
        for directory_name in os.listdir(dataset_cache_dir):
            # Only all-digit names can be dataset ids. The previous pattern
            # "[0-9]*" matched every string (empty match), so int() raised
            # ValueError on any unrelated entry in the directory.
            if not re.match(r"[0-9]+$", directory_name):
                continue

            dataset_directory = os.path.join(dataset_cache_dir,
                                             directory_name)
            # A stray file with a numeric name must not break os.listdir.
            if not os.path.isdir(dataset_directory):
                continue

            dataset_directory_content = os.listdir(dataset_directory)
            if "dataset.arff" in dataset_directory_content and \
                    "description.xml" in dataset_directory_content:
                datasets.add(int(directory_name))

    return sorted(datasets)
@@ -288,82 +313,110 @@ def get_cached_dataset(self, did):
288313 return dataset
289314
def _get_cached_dataset_description(self, did):
    """Load the cached description of dataset ``did``.

    The regular dataset cache is consulted first, then the private
    dataset directory; the first readable ``description.xml`` wins.

    Parameters
    ----------
    did : int or str
        Dataset id; it is converted with ``str()`` to build the path.

    Returns
    -------
    The ``oml:data_set_description`` entry of the parsed XML document.

    Raises
    ------
    OpenMLCacheException
        If neither directory contains a readable description file.
    """
    for dataset_cache_dir in [self.dataset_cache_dir,
                              self._private_directory_datasets]:
        description_file = os.path.join(dataset_cache_dir, str(did),
                                        "description.xml")

        try:
            with open(description_file) as fh:
                dataset_xml = fh.read()
        except (IOError, OSError):
            # Not in this directory; try the next candidate.
            continue

        return xmltodict.parse(dataset_xml)["oml:data_set_description"]

    # "%s" instead of "%d": the id may be a string (the path above is built
    # with str(did)), and "%d" would raise TypeError instead of the
    # intended cache exception.
    raise OpenMLCacheException("Dataset description for did %s not "
                               "cached" % did)
308332
309333 def _get_cached_dataset_arff (self , did ):
310- did_cache_dir = os .path .join (self .dataset_cache_dir , str (did ))
311- output_file = os .path .join (did_cache_dir , "dataset.arff" )
334+ for dataset_cache_dir in [self .dataset_cache_dir ,
335+ self ._private_directory_datasets ]:
336+ did_cache_dir = os .path .join (dataset_cache_dir , str (did ))
337+ output_file = os .path .join (did_cache_dir , "dataset.arff" )
312338
313- try :
314- with open (output_file ):
315- pass
316- return output_file
317- except (OSError , IOError ) as e :
318- # TODO create NOTCACHEDEXCEPTION
319- print (e )
320- raise e
339+ try :
340+ with open (output_file ):
341+ pass
342+ return output_file
343+ except (OSError , IOError ) as e :
344+ # TODO create NOTCACHEDEXCEPTION
345+ continue
321346
322- def get_cached_tasks (self ):
323- directory_content = os .listdir (self .task_cache_dir )
324- directory_content .sort ()
347+ print ("Dataset ID" , did )
348+ raise Exception ()
325349
326- # Find all dataset ids for which we have downloaded the dataset
327- # description
350+
def get_cached_tasks(self):
    """Return all cached tasks, keyed by task id.

    Scans ``self.task_cache_dir`` and then ``self._private_directory_tasks``
    for files named ``tid_<number>.xml`` and loads each one through
    ``self.get_cached_task``.

    Returns
    -------
    OrderedDict
        Maps task id (int) to the object returned by ``get_cached_task``.
        Insertion order follows the sorted file names of the regular cache
        first, then of the private directory.
    """
    tasks = OrderedDict()
    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        for filename in sorted(os.listdir(task_cache_dir)):
            # Require at least one digit and anchor the end: the previous
            # pattern accepted "tid_.xml" (int('') raises ValueError) and
            # names such as "tid_1.xml.bak".
            match = re.match(r"tid_([0-9]+)\.xml$", filename)
            if match is None:
                continue

            tid = int(match.group(1))
            # get_cached_task searches both directories itself, so a task
            # that was already loaded does not need to be parsed again.
            if tid not in tasks:
                tasks[tid] = self.get_cached_task(tid)

    return tasks
338371
def get_cached_task(self, tid):
    """Load the cached task ``tid`` from disk.

    The regular task cache is consulted first, then the private task
    directory; the first readable ``tid_<tid>.xml`` is parsed with
    ``self._create_task_from_xml``.

    Parameters
    ----------
    tid : int or str
        Task id; it is converted with ``int()`` to build the file name.

    Returns
    -------
    The object produced by ``self._create_task_from_xml``.

    Raises
    ------
    OpenMLCacheException
        If neither directory contains a readable task file.
    """
    task_filename = "tid_%d.xml" % int(tid)

    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        task_file = os.path.join(task_cache_dir, task_filename)

        try:
            with open(task_file) as fh:
                task_xml = fh.read()
        except (OSError, IOError):
            # Not in this directory; try the next candidate.
            continue

        # Parsing happens outside the try block so that an I/O error raised
        # while building the task is not mistaken for a cache miss.
        return self._create_task_from_xml(xml=task_xml)

    # Raise the module's cache exception with a message instead of printing
    # to stdout and raising a bare, message-less Exception.
    raise OpenMLCacheException("Task file for tid %d not cached" % int(tid))
350387
def get_cached_splits(self):
    """Return all cached data splits, keyed by task id.

    Scans ``self.task_cache_dir`` and then ``self._private_directory_tasks``
    for files named ``tid_<number>.arff`` and loads each one through
    ``self.get_cached_split``.

    Returns
    -------
    OrderedDict
        Maps task id (int) to the object returned by ``get_cached_split``.
        Insertion order follows the sorted file names of the regular cache
        first, then of the private directory.
    """
    splits = OrderedDict()
    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        for filename in sorted(os.listdir(task_cache_dir)):
            # Require at least one digit and anchor the end so that names
            # like "tid_.arff" or "tid_1.arff.bak" are ignored.
            match = re.match(r"tid_([0-9]+)\.arff$", filename)
            if match is None:
                continue

            tid = int(match.group(1))
            if tid not in splits:
                # Load the split itself; the previous code called
                # get_cached_task here and therefore returned tasks.
                splits[tid] = self.get_cached_split(tid)

    return splits
361405
def get_cached_split(self, tid):
    """Load the cached data split of task ``tid`` from disk.

    The regular task cache is consulted first, then the private task
    directory; the first ``tid_<tid>.arff`` that
    ``OpenMLSplit.from_arff_file`` can read is returned.

    Parameters
    ----------
    tid : int or str
        Task id; it is converted with ``int()`` to build the file name.

    Returns
    -------
    The object produced by ``OpenMLSplit.from_arff_file``.

    Raises
    ------
    OpenMLCacheException
        If the split file cannot be read from either directory.
    """
    split_filename = "tid_%d.arff" % int(tid)

    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        split_file = os.path.join(task_cache_dir, split_filename)
        try:
            return OpenMLSplit.from_arff_file(split_file)
        except (OSError, IOError):
            # Not readable from this directory; try the next candidate.
            continue

    # Raise the module's cache exception with a message instead of printing
    # to stdout and raising a bare, message-less Exception.
    raise OpenMLCacheException("Split file for tid %d not cached" % int(tid))
367420
368421 ############################################################################
369422 # Remote getters/API calls to OpenML
@@ -495,7 +548,7 @@ def download_dataset_description(self, did):
495548
496549 try :
497550 return self ._get_cached_dataset_description (did )
498- except (IOError , OSError ):
551+ except (OpenMLCacheException ):
499552 try :
500553 return_code , dataset_xml = self ._perform_api_call (
501554 "openml.data.description" , data_id = did )
0 commit comments