Commit 44c4e3b

Merge pull request #8 from mfeurer/master
Merge changes
2 parents 52d3260 + 23ab3c2 commit 44c4e3b

18 files changed

Lines changed: 20477 additions & 164 deletions

openml/apiconnector.py

Lines changed: 153 additions & 59 deletions
@@ -27,7 +27,7 @@
 
 logger = logging.getLogger(__name__)
 
-OPENML_URL = "http://openml.org"
+OPENML_URL = "http://www.openml.org"
 
 
 class OpenMLStatusChange(Warning):
@@ -54,6 +54,10 @@ class AuthentificationError(PyOpenMLError):
     def __init__(self, message):
         super(AuthentificationError, self).__init__(message)
 
+class OpenMLCacheException(PyOpenMLError):
+    def __init__(self, message):
+        super(OpenMLCacheException, self).__init__(message)
+
 
 class APIConnector(object):
     """
@@ -99,6 +103,10 @@ class APIConnector(object):
         `here <http://pieces.openpolitics.com
         /2012/04/python-logging-best-practices/>`_.
 
+    private_directory : str, optional (default=None)
+        A local directory which can be accessed through the OpenML package.
+        Useful to access private datasets through the same interface.
+
     Raises
     ------
     ValueError
@@ -119,7 +127,7 @@ class APIConnector(object):
     """
     def __init__(self, cache_directory=None, username=None, password=None,
                  server=None, verbosity=None, configure_logger=True,
-                 authenticate=True):
+                 authenticate=True, private_directory=None):
         # The .openml directory is necessary, just try to create it (EAFP)
         try:
             os.mkdir(os.path.expanduser('~/.openml'))
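For illustration, the new keyword argument could be used as below; the path is a placeholder, and the default of ~/.openml/private (see the _parse_config hunk further down) applies when it is omitted:

from openml.apiconnector import APIConnector

# Hypothetical path; any readable local directory works.
connector = APIConnector(private_directory="/data/openml_private")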
@@ -139,6 +147,8 @@ def __init__(self, cache_directory=None, username=None, password=None,
             self.config.set('FAKE_SECTION', 'server', server)
         if verbosity is not None:
             self.config.set('FAKE_SECTION', 'verbosity', verbosity)
+        if private_directory is not None:
+            self.config.set('FAKE_SECTION', 'private_directory', private_directory)
 
         if configure_logger:
             verbosity = self.config.getint('FAKE_SECTION', 'verbosity')
@@ -162,8 +172,18 @@ def __init__(self, cache_directory=None, username=None, password=None,
         self.dataset_cache_dir = os.path.join(self.cache_dir, "datasets")
         self.task_cache_dir = os.path.join(self.cache_dir, "tasks")
 
+        # Set up the private directory
+        self.private_directory = self.config.get('FAKE_SECTION',
+                                                 'private_directory')
+        self._private_directory_datasets = os.path.join(
+            self.private_directory, "datasets")
+        self._private_directory_tasks = os.path.join(
+            self.private_directory, "tasks")
+
         for dir_ in [self.cache_dir, self.dataset_cache_dir,
-                     self.task_cache_dir]:
+                     self.task_cache_dir, self.private_directory,
+                     self._private_directory_datasets,
+                     self._private_directory_tasks]:
             if not os.path.exists(dir_) and not os.path.isdir(dir_):
                 os.mkdir(dir_)
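Since the constructor now creates the private directory and its two sub-directories next to the cache, the following sketch should hold after construction (assuming the default config and a writable home directory):

import os
from openml.apiconnector import APIConnector

connector = APIConnector()
# The private directory mirrors the cache layout: datasets/ and tasks/.
assert os.path.isdir(connector._private_directory_datasets)
assert os.path.isdir(connector._private_directory_tasks)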

@@ -213,7 +233,8 @@ def _parse_config(self):
                     'password': '',
                     'server': OPENML_URL,
                     'verbosity': 0,
-                    'cachedir': os.path.expanduser('~/.openml/cache')}
+                    'cachedir': os.path.expanduser('~/.openml/cache'),
+                    'private_directory': os.path.expanduser('~/.openml/private')}
 
         config_file = os.path.expanduser('~/.openml/config')
         config = configparser.RawConfigParser(defaults=defaults)
@@ -243,26 +264,30 @@ def _parse_config(self):
     # Local getters/accessors to the cache directory
     def get_list_of_cached_datasets(self):
         """Return list with ids of all cached datasets"""
-        directory_content = os.listdir(self.dataset_cache_dir)
-        directory_content.sort()
-
-        # Find all dataset ids for which we have downloaded the dataset
-        # description
         datasets = []
-        for directory_name in directory_content:
-            # First check if the directory name could be an OpenML dataset id
-            if not re.match(r"[0-9]*", directory_name):
-                continue
 
-            did = int(directory_name)
+        for dataset_cache_dir in [self.dataset_cache_dir,
+                                  self._private_directory_datasets]:
+            directory_content = os.listdir(dataset_cache_dir)
+            directory_content.sort()
 
-            directory_name = os.path.join(self.dataset_cache_dir,
-                                          directory_name)
-            dataset_directory_content = os.listdir(directory_name)
+            # Find all dataset ids for which we have downloaded the dataset
+            # description
 
-            if "dataset.arff" in dataset_directory_content and \
-                    "description.xml" in dataset_directory_content:
-                datasets.append(did)
+            for directory_name in directory_content:
+                # First check if the directory name could be an OpenML dataset id
+                if not re.match(r"[0-9]*", directory_name):
+                    continue
+
+                did = int(directory_name)
+
+                directory_name = os.path.join(dataset_cache_dir,
+                                              directory_name)
+                dataset_directory_content = os.listdir(directory_name)
+
+                if "dataset.arff" in dataset_directory_content and \
+                        "description.xml" in dataset_directory_content:
+                    datasets.append(did)
 
         datasets.sort()
         return datasets
@@ -281,57 +306,117 @@ def get_cached_datasets(self):
 
     def get_cached_dataset(self, did):
         # This code is slow...replace it with new API calls
-        description = self.download_dataset_description(did)
-        arff_file = self.download_dataset_arff(did, description=description)
+        description = self._get_cached_dataset_description(did)
+        arff_file = self._get_cached_dataset_arff(did)
         dataset = self._create_dataset_from_description(description, arff_file)
 
         return dataset
 
-    def get_cached_tasks(self):
-        directory_content = os.listdir(self.task_cache_dir)
-        directory_content.sort()
+    def _get_cached_dataset_description(self, did):
+        for dataset_cache_dir in [self.dataset_cache_dir,
+                                  self._private_directory_datasets]:
+            did_cache_dir = os.path.join(dataset_cache_dir, str(did))
+            description_file = os.path.join(did_cache_dir, "description.xml")
+
+            try:
+                with open(description_file) as fh:
+                    dataset_xml = fh.read()
+            except (IOError, OSError) as e:
+                continue
+
+            return xmltodict.parse(dataset_xml)["oml:data_set_description"]
+
+        raise OpenMLCacheException("Dataset description for did %d not "
+                                   "cached" % did)
+
+
+    def _get_cached_dataset_arff(self, did):
+        for dataset_cache_dir in [self.dataset_cache_dir,
+                                  self._private_directory_datasets]:
+            did_cache_dir = os.path.join(dataset_cache_dir, str(did))
+            output_file = os.path.join(did_cache_dir, "dataset.arff")
+
+            try:
+                with open(output_file):
+                    pass
+                return output_file
+            except (OSError, IOError) as e:
+                # TODO create NOTCACHEDEXCEPTION
+                continue
+
+        print("Dataset ID", did)
+        raise Exception()
+
 
-        # Find all dataset ids for which we have downloaded the dataset
-        # description
+    def get_cached_tasks(self):
         tasks = OrderedDict()
-        for filename in directory_content:
-            match = re.match(r"(tid)_([0-9]*)\.xml", filename)
-            if match:
-                tid = match.group(2)
-                tid = int(tid)
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+
+            directory_content = os.listdir(task_cache_dir)
+            directory_content.sort()
+
+            # Find all dataset ids for which we have downloaded the dataset
+            # description
 
-                tasks[tid] = self.get_cached_task(tid)
+            for filename in directory_content:
+                match = re.match(r"(tid)_([0-9]*)\.xml", filename)
+                if match:
+                    tid = match.group(2)
+                    tid = int(tid)
+
+                    tasks[tid] = self.get_cached_task(tid)
 
         return tasks
 
     def get_cached_task(self, tid):
-        task_file = os.path.join(self.task_cache_dir,
-                                 "tid_%d.xml" % int(tid))
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+            task_file = os.path.join(task_cache_dir,
+                                     "tid_%d.xml" % int(tid))
 
-        with open(task_file) as fh:
-            task = self._create_task_from_xml(xml=fh.read())
-        return task
+            try:
+                with open(task_file) as fh:
+                    task = self._create_task_from_xml(xml=fh.read())
+                return task
+            except (OSError, IOError) as e:
+                continue
 
-    def get_cached_splits(self):
-        directory_content = os.listdir(self.task_cache_dir)
-        directory_content.sort()
+        print("Task ID", tid)
+        raise Exception()
 
+    def get_cached_splits(self):
         splits = OrderedDict()
-        for filename in directory_content:
-            match = re.match(r"(tid)_([0-9]*)\.arff", filename)
-            if match:
-                tid = match.group(2)
-                tid = int(tid)
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+            directory_content = os.listdir(task_cache_dir)
+            directory_content.sort()
 
-                splits[tid] = self.get_cached_task(tid)
+
+            for filename in directory_content:
+                match = re.match(r"(tid)_([0-9]*)\.arff", filename)
+                if match:
+                    tid = match.group(2)
+                    tid = int(tid)
+
+                    splits[tid] = self.get_cached_task(tid)
 
         return splits
 
     def get_cached_split(self, tid):
-        split_file = os.path.join(self.task_cache_dir,
-                                  "tid_%d.arff" % int(tid))
-        split = OpenMLSplit.from_arff_file(split_file)
-        return split
+        for task_cache_dir in [self.task_cache_dir,
+                               self._private_directory_tasks]:
+            try:
+                split_file = os.path.join(task_cache_dir,
+                                          "tid_%d.arff" % int(tid))
+                split = OpenMLSplit.from_arff_file(split_file)
+                return split
+
+            except (OSError, IOError) as e:
+                continue
+
+        print("Task ID", tid)
+        raise Exception()
 
     ############################################################################
     # Remote getters/API calls to OpenML
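Every reworked getter above repeats the same probe order: look in the public cache directory first, then in the private one, and fail only when neither holds the file. A shared helper could factor this out (a refactoring sketch, not part of this commit; the function name is hypothetical):

import os

def _find_in_cache_dirs(filename, cache_dirs):
    # Return the first existing path among the candidate cache directories.
    for cache_dir in cache_dirs:
        candidate = os.path.join(cache_dir, filename)
        if os.path.exists(candidate):
            return candidate
    raise OpenMLCacheException("%s not found in %s" % (filename, cache_dirs))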
@@ -462,14 +547,14 @@ def download_dataset_description(self, did):
         description_file = os.path.join(did_cache_dir, "description.xml")
 
         try:
-            with open(description_file) as fh:
-                dataset_xml = fh.read()
-        except (IOError, OSError):
+            return self._get_cached_dataset_description(did)
+        except (OpenMLCacheException):
             try:
                 return_code, dataset_xml = self._perform_api_call(
                     "openml.data.description", data_id=did)
-            except URLError as e:
+            except (URLError, UnicodeEncodeError) as e:
                 # TODO logger.debug
+                self._remove_dataset_chache_dir(did)
                 print(e)
                 raise e
 
@@ -481,6 +566,7 @@ def download_dataset_description(self, did):
                     "oml:data_set_description"]
         except Exception as e:
             # TODO logger.debug
+            self._remove_dataset_chache_dir(did)
             print("Dataset ID", did)
             raise e
 
@@ -526,7 +612,7 @@ def download_dataset_features(self, did):
         try:
             return_code, features_xml = self._perform_api_call(
                 "openml.data.features", data_id=did)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             # TODO logger.debug
             print(e)
             raise e
@@ -550,7 +636,7 @@ def download_dataset_qualities(self, did):
         try:
             return_code, qualities_xml = self._perform_api_call(
                 "openml.data.qualities", data_id=did)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             # TODO logger.debug
             print(e)
             raise e
@@ -575,6 +661,14 @@ def _create_dataset_cache_dir(self, did):
             pass
         return dataset_cache_dir
 
+    def _remove_dataset_chache_dir(self, did):
+        dataset_cache_dir = os.path.join(self.dataset_cache_dir, str(did))
+        try:
+            os.rmdir(dataset_cache_dir)
+        except (OSError, IOError):
+            # TODO add debug information
+            pass
+
     def _create_dataset_from_description(self, description, arff_file):
         dataset = OpenMLDataset(
             description["oml:id"],
@@ -680,7 +774,7 @@ def download_task(self, task_id):
         try:
             return_code, task_xml = self._perform_api_call(
                 "openml.task.search", task_id=task_id)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             print(e)
             raise e
 
@@ -764,7 +858,7 @@ def _download_split(self, task, cache_file):
         split_url = task.estimation_procedure["data_splits_url"]
         try:
             return_code, split_arff = self._read_url(split_url)
-        except URLError as e:
+        except (URLError, UnicodeEncodeError) as e:
             print(e, split_url)
             raise e
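The broadened except (URLError, UnicodeEncodeError) clause now appears in every remote call. A shared wrapper could centralise it and route messages through the module-level logger instead of print (sketch only; the wrapper name is hypothetical, while _perform_api_call exists in this file):

def _checked_api_call(self, api_call, **kwargs):
    # Delegate to the existing low-level call; log and re-raise on failure.
    try:
        return self._perform_api_call(api_call, **kwargs)
    except (URLError, UnicodeEncodeError) as e:
        logger.debug(e)
        raise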
