
 logger = logging.getLogger(__name__)

-OPENML_URL = "http://openml.org"
+OPENML_URL = "http://www.openml.org"


 class OpenMLStatusChange(Warning):
@@ -54,6 +54,10 @@ class AuthentificationError(PyOpenMLError):
     def __init__(self, message):
         super(AuthentificationError, self).__init__(message)

+class OpenMLCacheException(PyOpenMLError):
+    def __init__(self, message):
+        super(OpenMLCacheException, self).__init__(message)
+

 class APIConnector(object):
     """
@@ -99,6 +103,10 @@ class APIConnector(object):
         `here <http://pieces.openpolitics.com
         /2012/04/python-logging-best-practices/>`_.

+    private_directory : str, optional (default=None)
+        A local directory which can be accessed through the OpenML package.
+        Useful to access private datasets through the same interface.
+
     Raises
     ------
     ValueError
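
The new `private_directory` keyword is threaded through `__init__` below. A minimal usage sketch (the import path `openml.apiconnector` is an assumption from the class name, not something this diff confirms):

```python
# Hypothetical usage of the new private_directory argument; the import
# path is assumed, not confirmed by this diff.
from openml.apiconnector import APIConnector

# Datasets placed under /data/openml-private/datasets/<did>/ are then
# found by the same cache lookups as publicly downloaded ones.
connector = APIConnector(private_directory="/data/openml-private")
```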
@@ -119,7 +127,7 @@ """
     """
     def __init__(self, cache_directory=None, username=None, password=None,
                  server=None, verbosity=None, configure_logger=True,
-                 authenticate=True):
+                 authenticate=True, private_directory=None):
         # The .openml directory is necessary, just try to create it (EAFP)
         try:
             os.mkdir(os.path.expanduser('~/.openml'))
@@ -139,6 +147,8 @@ def __init__(self, cache_directory=None, username=None, password=None,
             self.config.set('FAKE_SECTION', 'server', server)
         if verbosity is not None:
             self.config.set('FAKE_SECTION', 'verbosity', verbosity)
+        if private_directory is not None:
+            self.config.set('FAKE_SECTION', 'private_directory', private_directory)

         if configure_logger:
             verbosity = self.config.getint('FAKE_SECTION', 'verbosity')
@@ -162,8 +172,18 @@ def __init__(self, cache_directory=None, username=None, password=None,
         self.dataset_cache_dir = os.path.join(self.cache_dir, "datasets")
         self.task_cache_dir = os.path.join(self.cache_dir, "tasks")

+        # Set up the private directory
+        self.private_directory = self.config.get('FAKE_SECTION',
+                                                 'private_directory')
+        self._private_directory_datasets = os.path.join(
+            self.private_directory, "datasets")
+        self._private_directory_tasks = os.path.join(
+            self.private_directory, "tasks")
+
         for dir_ in [self.cache_dir, self.dataset_cache_dir,
-                     self.task_cache_dir]:
+                     self.task_cache_dir, self.private_directory,
+                     self._private_directory_datasets,
+                     self._private_directory_tasks]:
             if not os.path.exists(dir_) and not os.path.isdir(dir_):
                 os.mkdir(dir_)

@@ -213,7 +233,8 @@ def _parse_config(self):
                     'password': '',
                     'server': OPENML_URL,
                     'verbosity': 0,
-                    'cachedir': os.path.expanduser('~/.openml/cache')}
+                    'cachedir': os.path.expanduser('~/.openml/cache'),
+                    'private_directory': os.path.expanduser('~/.openml/private')}

         config_file = os.path.expanduser('~/.openml/config')
         config = configparser.RawConfigParser(defaults=defaults)
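
A side note on the `FAKE_SECTION` idiom visible above: `RawConfigParser` refuses files without a section header, so a sectionless `~/.openml/config` can be parsed by prepending a synthetic section. A sketch of the idea, assuming simple `key = value` lines (the project's actual read code is outside this diff):

```python
import io

try:
    import configparser                   # Python 3
except ImportError:
    import ConfigParser as configparser   # Python 2

# Wrap a sectionless key=value file in a synthetic section so that
# RawConfigParser accepts it.
raw = u"verbosity = 1\nprivate_directory = /data/openml-private\n"
parser = configparser.RawConfigParser()
wrapped = io.StringIO(u"[FAKE_SECTION]\n" + raw)
try:
    parser.read_file(wrapped)             # Python 3
except AttributeError:
    parser.readfp(wrapped)                # Python 2
print(parser.get("FAKE_SECTION", "private_directory"))
```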
@@ -243,26 +264,30 @@ def _parse_config(self):
     # Local getters/accessors to the cache directory
     def get_list_of_cached_datasets(self):
         """Return list with ids of all cached datasets"""
-        directory_content = os.listdir(self.dataset_cache_dir)
-        directory_content.sort()
-
-        # Find all dataset ids for which we have downloaded the dataset
-        # description
         datasets = []
-        for directory_name in directory_content:
-            # First check if the directory name could be an OpenML dataset id
-            if not re.match(r"[0-9]*", directory_name):
-                continue

-            did = int(directory_name)
+        for dataset_cache_dir in [self.dataset_cache_dir,
+                                  self._private_directory_datasets]:
+            directory_content = os.listdir(dataset_cache_dir)
+            directory_content.sort()

-            directory_name = os.path.join(self.dataset_cache_dir,
-                                          directory_name)
-            dataset_directory_content = os.listdir(directory_name)
+            # Find all dataset ids for which we have downloaded the dataset
+            # description

-            if "dataset.arff" in dataset_directory_content and \
-                    "description.xml" in dataset_directory_content:
-                datasets.append(did)
+            for directory_name in directory_content:
+                # First check if the directory name could be an OpenML dataset id
+                if not re.match(r"[0-9]+$", directory_name):
+                    continue
+
+                did = int(directory_name)
+
+                directory_name = os.path.join(dataset_cache_dir,
+                                              directory_name)
+                dataset_directory_content = os.listdir(directory_name)
+
+                if "dataset.arff" in dataset_directory_content and \
+                        "description.xml" in dataset_directory_content:
+                    datasets.append(did)

         datasets.sort()
         return datasets
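
For reference, the directory layout the rewritten scan expects, now checked in both the public cache and the private directory (paths hypothetical):

```python
# Hypothetical layout; get_list_of_cached_datasets() would return [2, 1001]:
#
#   ~/.openml/cache/datasets/2/description.xml
#   ~/.openml/cache/datasets/2/dataset.arff
#   ~/.openml/private/datasets/1001/description.xml
#   ~/.openml/private/datasets/1001/dataset.arff
#
# Directories missing either file, or with non-numeric names, are skipped.
```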
@@ -281,57 +306,117 @@ def get_cached_datasets(self):

     def get_cached_dataset(self, did):
         # This code is slow...replace it with new API calls
-        description = self.download_dataset_description(did)
-        arff_file = self.download_dataset_arff(did, description=description)
+        description = self._get_cached_dataset_description(did)
+        arff_file = self._get_cached_dataset_arff(did)
         dataset = self._create_dataset_from_description(description, arff_file)

         return dataset

-    def get_cached_tasks(self):
-        directory_content = os.listdir(self.task_cache_dir)
-        directory_content.sort()
+    def _get_cached_dataset_description(self, did):
+        for dataset_cache_dir in [self.dataset_cache_dir,
+                                  self._private_directory_datasets]:
+            did_cache_dir = os.path.join(dataset_cache_dir, str(did))
+            description_file = os.path.join(did_cache_dir, "description.xml")
+
+            try:
+                with open(description_file) as fh:
+                    dataset_xml = fh.read()
+            except (IOError, OSError):
+                continue
+
+            return xmltodict.parse(dataset_xml)["oml:data_set_description"]
+
+        raise OpenMLCacheException("Dataset description for did %d not "
+                                   "cached" % did)
+
+
+    def _get_cached_dataset_arff(self, did):
+        for dataset_cache_dir in [self.dataset_cache_dir,
+                                  self._private_directory_datasets]:
+            did_cache_dir = os.path.join(dataset_cache_dir, str(did))
+            output_file = os.path.join(did_cache_dir, "dataset.arff")
+
+            try:
+                with open(output_file):
+                    pass
+                return output_file
+            except (OSError, IOError):
+                continue
+
+        raise OpenMLCacheException("ARFF file for did %d not cached" % did)
+

-        # Find all dataset ids for which we have downloaded the dataset
-        # description
+    def get_cached_tasks(self):
         tasks = OrderedDict()
-        for filename in directory_content:
-            match = re.match(r"(tid)_([0-9]*)\.xml", filename)
-            if match:
-                tid = match.group(2)
-                tid = int(tid)
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+
+            directory_content = os.listdir(task_cache_dir)
+            directory_content.sort()
+
+            # Find all task ids for which we have downloaded the task
+            # description

-            tasks[tid] = self.get_cached_task(tid)
+            for filename in directory_content:
+                match = re.match(r"(tid)_([0-9]+)\.xml", filename)
+                if match:
+                    tid = match.group(2)
+                    tid = int(tid)
+
+                    tasks[tid] = self.get_cached_task(tid)

         return tasks

     def get_cached_task(self, tid):
-        task_file = os.path.join(self.task_cache_dir,
-                                 "tid_%d.xml" % int(tid))
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+            task_file = os.path.join(task_cache_dir,
+                                     "tid_%d.xml" % int(tid))

-        with open(task_file) as fh:
-            task = self._create_task_from_xml(xml=fh.read())
-        return task
+            try:
+                with open(task_file) as fh:
+                    task = self._create_task_from_xml(xml=fh.read())
+                return task
+            except (OSError, IOError):
+                continue

-    def get_cached_splits(self):
-        directory_content = os.listdir(self.task_cache_dir)
-        directory_content.sort()
+        raise OpenMLCacheException("Task file for tid %d not cached" % tid)

+    def get_cached_splits(self):
         splits = OrderedDict()
-        for filename in directory_content:
-            match = re.match(r"(tid)_([0-9]*)\.arff", filename)
-            if match:
-                tid = match.group(2)
-                tid = int(tid)
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+            directory_content = os.listdir(task_cache_dir)
+            directory_content.sort()

-            splits[tid] = self.get_cached_task(tid)
+
+            for filename in directory_content:
+                match = re.match(r"(tid)_([0-9]+)\.arff", filename)
+                if match:
+                    tid = match.group(2)
+                    tid = int(tid)
+
+                    splits[tid] = self.get_cached_split(tid)

         return splits

     def get_cached_split(self, tid):
-        split_file = os.path.join(self.task_cache_dir,
-                                  "tid_%d.arff" % int(tid))
-        split = OpenMLSplit.from_arff_file(split_file)
-        return split
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+            try:
+                split_file = os.path.join(task_cache_dir,
+                                          "tid_%d.arff" % int(tid))
+                split = OpenMLSplit.from_arff_file(split_file)
+                return split
+
+            except (OSError, IOError):
+                continue
+
+        raise OpenMLCacheException("Split file for tid %d not cached" % tid)

     ############################################################################
     # Remote getters/API calls to OpenML
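
All of the caching getters above now share one shape: probe the public cache directory, then the private one, and raise only when both miss. A hedged refactoring sketch of that pattern (the helper name `_lookup_in_cache_dirs` is hypothetical and not part of this patch):

```python
def _lookup_in_cache_dirs(self, cache_dirs, filename, loader):
    # Try each cache directory in order; the first readable hit wins.
    for cache_dir in cache_dirs:
        try:
            return loader(os.path.join(cache_dir, filename))
        except (OSError, IOError):
            continue
    raise OpenMLCacheException("%s not cached" % filename)

# get_cached_split, for example, would reduce to:
#     return self._lookup_in_cache_dirs(
#         [self.task_cache_dir, self._private_directory_tasks],
#         "tid_%d.arff" % int(tid), OpenMLSplit.from_arff_file)
```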
@@ -462,14 +547,14 @@ def download_dataset_description(self, did):
         description_file = os.path.join(did_cache_dir, "description.xml")

         try:
-            with open(description_file) as fh:
-                dataset_xml = fh.read()
-        except (IOError, OSError):
+            return self._get_cached_dataset_description(did)
+        except OpenMLCacheException:
             try:
                 return_code, dataset_xml = self._perform_api_call(
                     "openml.data.description", data_id=did)
-            except URLError as e:
+            except (URLError, UnicodeEncodeError) as e:
                 # TODO logger.debug
+                self._remove_dataset_cache_dir(did)
                 print(e)
                 raise e

@@ -481,6 +566,7 @@ def download_dataset_description(self, did):
                 "oml:data_set_description"]
         except Exception as e:
             # TODO logger.debug
+            self._remove_dataset_cache_dir(did)
             print("Dataset ID", did)
             raise e

@@ -526,7 +612,7 @@ def download_dataset_features(self, did):
         try:
             return_code, features_xml = self._perform_api_call(
                 "openml.data.features", data_id=did)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             # TODO logger.debug
             print(e)
             raise e
@@ -550,7 +636,7 @@ def download_dataset_qualities(self, did):
         try:
             return_code, qualities_xml = self._perform_api_call(
                 "openml.data.qualities", data_id=did)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             # TODO logger.debug
             print(e)
             raise e
@@ -575,6 +661,14 @@ def _create_dataset_cache_dir(self, did):
             pass
         return dataset_cache_dir

+    def _remove_dataset_cache_dir(self, did):
+        dataset_cache_dir = os.path.join(self.dataset_cache_dir, str(did))
+        try:
+            os.rmdir(dataset_cache_dir)
+        except (OSError, IOError):
+            # TODO add debug information
+            pass
+
     def _create_dataset_from_description(self, description, arff_file):
         dataset = OpenMLDataset(
             description["oml:id"],
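
One caveat on the cleanup helper added above: `os.rmdir` removes only empty directories, so a cache directory still holding a half-written `description.xml` is silently left behind. A sketch of a more thorough variant using the standard library's `shutil` (an alternative, not what this patch does):

```python
import shutil

def _remove_dataset_cache_dir(self, did):
    dataset_cache_dir = os.path.join(self.dataset_cache_dir, str(did))
    # Unlike os.rmdir, shutil.rmtree also deletes any partially written
    # files inside the directory; ignore_errors mirrors the permissive
    # behaviour above (a missing directory is not an error).
    shutil.rmtree(dataset_cache_dir, ignore_errors=True)
```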
@@ -680,7 +774,7 @@ def download_task(self, task_id):
         try:
             return_code, task_xml = self._perform_api_call(
                 "openml.task.search", task_id=task_id)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             print(e)
             raise e

@@ -764,7 +858,7 @@ def _download_split(self, task, cache_file):
         split_url = task.estimation_procedure["data_splits_url"]
         try:
             return_code, split_arff = self._read_url(split_url)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             print(e, split_url)
             raise e
