Skip to content

Commit 8aea41c

Browse files
committed
Add private test directory
1 parent 91e88d5 commit 8aea41c

1 file changed

Lines changed: 127 additions & 74 deletions

File tree

openml/apiconnector.py

Lines changed: 127 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ class AuthentificationError(PyOpenMLError):
5454
def __init__(self, message):
5555
super(AuthentificationError, self).__init__(message)
5656

57+
class OpenMLCacheException(PyOpenMLError):
    """Error signalling that requested data is not in the local cache.

    Parameters
    ----------
    message : str
        Human-readable description of what could not be found.
    """

    def __init__(self, message):
        # Explicit two-argument super() keeps the class usable on Python 2.
        super(OpenMLCacheException, self).__init__(message)
60+
5761

5862
class APIConnector(object):
5963
"""
@@ -99,6 +103,10 @@ class APIConnector(object):
99103
`here <http://pieces.openpolitics.com
100104
/2012/04/python-logging-best-practices/>`_.
101105
106+
private_directory : str, optional (default=None)
107+
A local directory which can be accessed through the OpenML package.
108+
Useful to access private datasets through the same interface.
109+
102110
Raises
103111
------
104112
ValueError
@@ -119,7 +127,7 @@ class APIConnector(object):
119127
"""
120128
def __init__(self, cache_directory=None, username=None, password=None,
121129
server=None, verbosity=None, configure_logger=True,
122-
authenticate=True):
130+
authenticate=True, private_directory=None):
123131
# The .openml directory is necessary, just try to create it (EAFP)
124132
try:
125133
os.mkdir(os.path.expanduser('~/.openml'))
@@ -139,6 +147,8 @@ def __init__(self, cache_directory=None, username=None, password=None,
139147
self.config.set('FAKE_SECTION', 'server', server)
140148
if verbosity is not None:
141149
self.config.set('FAKE_SECTION', 'verbosity', verbosity)
150+
if private_directory is not None:
151+
self.config.set('FAKE_SECTION', 'private_directory', private_directory)
142152

143153
if configure_logger:
144154
verbosity = self.config.getint('FAKE_SECTION', 'verbosity')
@@ -162,8 +172,18 @@ def __init__(self, cache_directory=None, username=None, password=None,
162172
self.dataset_cache_dir = os.path.join(self.cache_dir, "datasets")
163173
self.task_cache_dir = os.path.join(self.cache_dir, "tasks")
164174

175+
# Set up the private directory
176+
self.private_directory = self.config.get('FAKE_SECTION',
177+
'private_directory')
178+
self._private_directory_datasets = os.path.join(
179+
self.private_directory, "datasets")
180+
self._private_directory_tasks = os.path.join(
181+
self.private_directory, "tasks")
182+
165183
for dir_ in [self.cache_dir, self.dataset_cache_dir,
166-
self.task_cache_dir]:
184+
self.task_cache_dir, self.private_directory,
185+
self._private_directory_datasets,
186+
self._private_directory_tasks]:
167187
if not os.path.exists(dir_) and not os.path.isdir(dir_):
168188
os.mkdir(dir_)
169189

@@ -213,7 +233,8 @@ def _parse_config(self):
213233
'password': '',
214234
'server': OPENML_URL,
215235
'verbosity': 0,
216-
'cachedir': os.path.expanduser('~/.openml/cache')}
236+
'cachedir': os.path.expanduser('~/.openml/cache'),
237+
'private_directory': os.path.expanduser('~/.openml/private')}
217238

218239
config_file = os.path.expanduser('~/.openml/config')
219240
config = configparser.RawConfigParser(defaults=defaults)
@@ -243,26 +264,30 @@ def _parse_config(self):
243264
# Local getters/accessors to the cache directory
244265
def get_list_of_cached_datasets(self):
    """Return a sorted list with the ids of all cached datasets.

    Both the regular dataset cache directory and the private dataset
    directory are searched. A dataset counts as cached when its directory
    contains both ``dataset.arff`` and ``description.xml``.

    Returns
    -------
    list of int
        Dataset ids in ascending order.
    """
    datasets = []

    for dataset_cache_dir in [self.dataset_cache_dir,
                              self._private_directory_datasets]:
        for directory_name in sorted(os.listdir(dataset_cache_dir)):
            # Only purely numeric names can be OpenML dataset ids. The
            # pattern must require at least one digit and be anchored,
            # otherwise it matches every name and int() below fails.
            if not re.match(r"[0-9]+$", directory_name):
                continue

            dataset_directory = os.path.join(dataset_cache_dir,
                                             directory_name)
            # A stray file with a numeric name is not a dataset directory.
            if not os.path.isdir(dataset_directory):
                continue

            dataset_directory_content = os.listdir(dataset_directory)
            if "dataset.arff" in dataset_directory_content and \
                    "description.xml" in dataset_directory_content:
                datasets.append(int(directory_name))

    datasets.sort()
    return datasets
@@ -288,82 +313,110 @@ def get_cached_dataset(self, did):
288313
return dataset
289314

290315
def _get_cached_dataset_description(self, did):
    """Load the parsed description of dataset ``did`` from the local cache.

    The regular dataset cache is consulted first, then the private dataset
    directory; the first readable ``description.xml`` wins.

    Parameters
    ----------
    did : int
        Dataset id.

    Returns
    -------
    The ``oml:data_set_description`` entry of the parsed XML document.

    Raises
    ------
    OpenMLCacheException
        If neither directory holds a readable description file.
    """
    search_dirs = [self.dataset_cache_dir, self._private_directory_datasets]

    for base_dir in search_dirs:
        xml_path = os.path.join(base_dir, str(did), "description.xml")

        try:
            with open(xml_path) as handle:
                xml_content = handle.read()
        except (IOError, OSError):
            # Not available here; fall through to the next directory.
            continue

        parsed = xmltodict.parse(xml_content)
        return parsed["oml:data_set_description"]

    raise OpenMLCacheException("Dataset description for did %d not "
                               "cached" % did)
308332

309333
def _get_cached_dataset_arff(self, did):
    """Return the path of the cached ARFF file of dataset ``did``.

    The regular dataset cache is consulted first, then the private dataset
    directory; the first ``dataset.arff`` that can be opened wins.

    Parameters
    ----------
    did : int
        Dataset id.

    Returns
    -------
    str
        Path to the cached ``dataset.arff`` file.

    Raises
    ------
    OpenMLCacheException
        If the ARFF file cannot be opened in either directory.
    """
    for dataset_cache_dir in [self.dataset_cache_dir,
                              self._private_directory_datasets]:
        output_file = os.path.join(dataset_cache_dir, str(did),
                                   "dataset.arff")

        try:
            # Opening (rather than os.path.exists) also verifies that the
            # file is actually readable.
            with open(output_file):
                pass
            return output_file
        except (OSError, IOError):
            continue

    # Raise the dedicated cache exception, consistent with
    # _get_cached_dataset_description, instead of printing to stdout and
    # raising an anonymous Exception.
    raise OpenMLCacheException("ARFF file for did %s not cached" % did)
325349

326-
# Find all dataset ids for which we have downloaded the dataset
327-
# description
350+
351+
def get_cached_tasks(self):
    """Return all cached tasks, keyed by task id.

    Both the regular task cache directory and the private task directory
    are scanned for files named ``tid_<number>.xml``; each hit is loaded
    through ``get_cached_task``.

    Returns
    -------
    OrderedDict
        Maps task id (int) to the loaded task, in the order the files were
        found (regular cache first, each directory sorted by file name).
    """
    tasks = OrderedDict()
    # Require at least one digit and anchor the end so that names such as
    # "tid_.xml" or "tid_1.xml.bak" are ignored instead of crashing later.
    task_file_pattern = re.compile(r"(tid)_([0-9]+)\.xml$")

    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        # Find all task ids for which the task description was downloaded
        for filename in sorted(os.listdir(task_cache_dir)):
            match = task_file_pattern.match(filename)
            if match:
                tid = int(match.group(2))
                tasks[tid] = self.get_cached_task(tid)

    return tasks
338371

339372
def get_cached_task(self, tid):
    """Load task ``tid`` from the local cache.

    The regular task cache is consulted first, then the private task
    directory; the first readable ``tid_<tid>.xml`` wins.

    Parameters
    ----------
    tid : int
        Task id (anything accepted by ``int()``).

    Returns
    -------
    The object built by ``self._create_task_from_xml`` from the file
    content.

    Raises
    ------
    OpenMLCacheException
        If the task file cannot be read from either directory.
    """
    task_filename = "tid_%d.xml" % int(tid)

    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        task_file = os.path.join(task_cache_dir, task_filename)

        try:
            with open(task_file) as fh:
                task = self._create_task_from_xml(xml=fh.read())
            return task
        except (OSError, IOError):
            continue

    # Raise the dedicated cache exception instead of printing to stdout and
    # raising an anonymous Exception, so callers can handle a cache miss.
    raise OpenMLCacheException("Task file for tid %d not cached"
                               % int(tid))
350387

388+
def get_cached_splits(self):
    """Return all cached data splits, keyed by task id.

    Both the regular task cache directory and the private task directory
    are scanned for files named ``tid_<number>.arff``; each hit is loaded
    through ``get_cached_split``.

    Returns
    -------
    OrderedDict
        Maps task id (int) to the loaded split, in the order the files were
        found (regular cache first, each directory sorted by file name).
    """
    splits = OrderedDict()
    # Require at least one digit and anchor the end so that names such as
    # "tid_.arff" or "tid_1.arff.bak" are ignored instead of crashing later.
    split_file_pattern = re.compile(r"(tid)_([0-9]+)\.arff$")

    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        for filename in sorted(os.listdir(task_cache_dir)):
            match = split_file_pattern.match(filename)
            if match:
                tid = int(match.group(2))
                # Load the split itself; the previous code loaded the task
                # here, which made this method return tasks, not splits.
                splits[tid] = self.get_cached_split(tid)

    return splits
361405

362406
def get_cached_split(self, tid):
    """Load the data split of task ``tid`` from the local cache.

    The regular task cache is consulted first, then the private task
    directory; the first ``tid_<tid>.arff`` that can be loaded wins.

    Parameters
    ----------
    tid : int
        Task id (anything accepted by ``int()``).

    Returns
    -------
    The object returned by ``OpenMLSplit.from_arff_file``.

    Raises
    ------
    OpenMLCacheException
        If the split file cannot be loaded from either directory.
    """
    split_filename = "tid_%d.arff" % int(tid)

    for task_cache_dir in [self.task_cache_dir,
                           self._private_directory_tasks]:
        try:
            split_file = os.path.join(task_cache_dir, split_filename)
            split = OpenMLSplit.from_arff_file(split_file)
            return split
        except (OSError, IOError):
            continue

    # Raise the dedicated cache exception instead of printing to stdout and
    # raising an anonymous Exception, so callers can handle a cache miss.
    raise OpenMLCacheException("Split file for tid %d not cached"
                               % int(tid))
367420

368421
############################################################################
369422
# Remote getters/API calls to OpenML
@@ -495,7 +548,7 @@ def download_dataset_description(self, did):
495548

496549
try:
497550
return self._get_cached_dataset_description(did)
498-
except (IOError, OSError):
551+
except (OpenMLCacheException):
499552
try:
500553
return_code, dataset_xml = self._perform_api_call(
501554
"openml.data.description", data_id=did)

0 commit comments

Comments
 (0)