Skip to content

Commit d49c2ac

Browse files
committed
make get_dataset_qualities, get_dataset_description, get_dataset_features private, document dataset functions
1 parent 500f80f commit d49c2ac

4 files changed

Lines changed: 104 additions & 18 deletions

File tree

openml/datasets/__init__.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
11
from .functions import (list_datasets, list_datasets_by_tag,
2-
check_datasets_active, get_datasets, get_dataset,
3-
_get_dataset_description,
4-
_get_dataset_features, _get_dataset_qualities)
2+
check_datasets_active, get_datasets, get_dataset)
53
from .dataset import OpenMLDataset
64

75
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
8-
'get_datasets_arf', '_get_dataset_features',
9-
'_get_dataset_qualities', 'OpenMLDataset', 'list_datasets',
10-
'list_datasets_by_tag',
11-
'_get_dataset_description', 'list_datasets']
6+
'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
7+
'list_datasets']

openml/datasets/dataset.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,17 @@ def __eq__(self, other):
110110
return False
111111

112112
def _get_arff(self):
113+
"""Read ARFF file and return decoded arff.
114+
115+
Reads the file referenced in self.data_file.
116+
117+
Returns
118+
-------
119+
arff_string :
120+
Decoded arff.
121+
122+
"""
123+
113124
# TODO: add a partial read method which only returns the attribute
114125
# headers of the corresponding .arff file!
115126

@@ -226,7 +237,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
226237
else:
227238
return rval
228239

229-
def retrieve_class_labels(self):
240+
def _retrieve_class_labels(self):
230241
"""Reads the datasets arff to determine the class-labels, and returns those.
231242
If the task has no class labels (for example a regression problem) it returns None."""
232243
# TODO improve performance, currently reads the whole file
@@ -248,16 +259,28 @@ def publish(self):
248259
249260
Returns
250261
-------
262+
return_code : int
263+
Return code from server
264+
265+
return_value : string
266+
XML returned from server
251267
"""
252-
data = {'description': self.to_xml()}
268+
data = {'description': self._to_xml()}
253269
if self.data_file is not None:
254270
return_code, return_value = _perform_api_call(
255271
"/data/", data=data, file_dictionary={'dataset': self.data_file})
256272
else:
257273
return_code, return_value = _perform_api_call("/data/", data=data)
258274
return return_code, return_value
259275

260-
def to_xml(self):
276+
def _to_xml(self):
277+
"""Serialize object to xml for upload
278+
279+
Returns
280+
-------
281+
xml_dataset : string
282+
XML description of the data.
283+
"""
261284
xml_dataset = ('<oml:data_set_description '
262285
'xmlns:oml="http://openml.org/openml">')
263286
props = ['id', 'name', 'version', 'description', 'format', 'creator',

openml/datasets/functions.py

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def list_datasets():
110110
111111
Returns
112112
-------
113-
list
113+
datasets : list of dicts
114114
A list of all datasets. Every dataset is represented by a
115115
dictionary containing the following information: dataset id,
116116
and status. If qualities are calculated for the dataset, some of
@@ -124,7 +124,7 @@ def list_datasets_by_tag(tag):
124124
125125
Returns
126126
-------
127-
list
127+
datasets : list of dicts
128128
A list of all datasets having the given tag. Every dataset is
129129
represented by a dictionary containing the following information:
130130
dataset id, and status. If qualities are calculated for the dataset,
@@ -174,7 +174,7 @@ def check_datasets_active(dids):
174174
A list of integers representing dataset ids.
175175
176176
Returns
177-
-------
177+
active : dict of int to boolean
178178
dict
179179
A dictionary with items {did: active}, where active is a boolean. It
180180
is set to True if the dataset is active.
@@ -202,7 +202,7 @@ def get_datasets(dids):
202202
203203
Returns
204204
-------
205-
list
205+
datasets : list of datasets
206206
A list of dataset objects.
207207
208208
Notes
@@ -282,6 +282,24 @@ def _get_dataset_description(did):
282282

283283

284284
def _get_dataset_arff(did, description=None):
285+
"""Load dataset arff (from cache or download).
286+
287+
Tries to load did from cache. If that fails, uses
288+
``description`` (fetched if none) to download arff.
289+
290+
Parameters
291+
----------
292+
did : int
293+
Dataset ID
294+
295+
description : dictionary
296+
Dataset description dict.
297+
298+
Returns
299+
-------
300+
output_filename : string
301+
Location of arff file.
302+
"""
285303
did_cache_dir = _create_dataset_cache_directory(did)
286304
output_file = os.path.join(did_cache_dir, "dataset.arff")
287305

@@ -308,6 +326,21 @@ def _get_dataset_arff(did, description=None):
308326

309327

310328
def _get_dataset_features(did):
329+
"""API call to get dataset features (cached)
330+
331+
Features are feature descriptions for each column.
332+
(name, index, categorical, ...)
333+
334+
Parameters
335+
----------
336+
did : int
337+
Dataset ID
338+
339+
Returns
340+
-------
341+
features : dict
342+
Dictionary containing dataset feature descriptions.
343+
"""
311344
did_cache_dir = _create_dataset_cache_directory(did)
312345
features_file = os.path.join(did_cache_dir, "features.xml")
313346

@@ -338,6 +371,20 @@ def _get_dataset_features(did):
338371

339372

340373
def _get_dataset_qualities(did):
374+
"""API call to get dataset qualities (cached)
375+
376+
Qualities are metafeatures (number of features, number of classes, ...)
377+
378+
Parameters
379+
----------
380+
did : int
381+
Dataset ID
382+
383+
Returns
384+
-------
385+
qualities : dict
386+
Dictionary containing dataset qualities.
387+
"""
341388
# Dataset qualities are subject to change and must be fetched every time
342389
did_cache_dir = _create_dataset_cache_directory(did)
343390
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
@@ -362,6 +409,7 @@ def _get_dataset_qualities(did):
362409

363410

364411
def _create_dataset_cache_directory(did):
412+
"""Create a dataset cache directory"""
365413
dataset_cache_dir = os.path.join(config.get_cache_directory(), "datasets", str(did))
366414
try:
367415
os.makedirs(dataset_cache_dir)
@@ -372,6 +420,7 @@ def _create_dataset_cache_directory(did):
372420

373421

374422
def _remove_dataset_chache_dir(did):
423+
"""Remove the dataset cache directory"""
375424
dataset_cache_dir = os.path.join(config.get_cache_directory(), "datasets", str(did))
376425
try:
377426
os.rmdir(dataset_cache_dir)
@@ -381,6 +430,20 @@ def _remove_dataset_chache_dir(did):
381430

382431

383432
def _create_dataset_from_description(description, arff_file):
433+
"""Create a dataset object from a description dict.
434+
435+
Parameters
436+
----------
437+
description : dict
438+
Description of a dataset as a dict keyed by XML tag names (e.g. ``oml:id``).
439+
arff_file : string
440+
Path of dataset arff file.
441+
442+
Returns
443+
-------
444+
dataset : dataset object
445+
Dataset object from dict and arff.
446+
"""
384447
dataset = OpenMLDataset(
385448
description["oml:id"],
386449
description["oml:name"],

tests/test_datasets.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@
1212
from openml.util import is_string
1313
from openml.testing import TestBase
1414

15-
from openml.datasets.functions import _get_cached_dataset, _get_cached_datasets
15+
from openml.datasets.functions import (_get_cached_dataset,
16+
_get_cached_datasets,
17+
_get_dataset_description,
18+
_get_dataset_features,
19+
_get_dataset_qualities)
1620

1721

1822
class TestOpenMLDataset(TestBase):
@@ -112,17 +116,17 @@ def test_download_rowid(self):
112116
def test__get_dataset_description(self):
113117
# Only a smoke test, I don't know exactly how to test the URL
114118
# retrieval and "caching"
115-
description = openml.datasets._get_dataset_description(2)
119+
description = _get_dataset_description(2)
116120
self.assertIsInstance(description, dict)
117121

118122
def test__get_dataset_features(self):
119123
# Only a smoke check
120-
features = openml.datasets._get_dataset_features(2)
124+
features = _get_dataset_features(2)
121125
self.assertIsInstance(features, dict)
122126

123127
def test__get_dataset_qualities(self):
124128
# Only a smoke check
125-
qualities = openml.datasets._get_dataset_qualities(2)
129+
qualities = _get_dataset_qualities(2)
126130
self.assertIsInstance(qualities, dict)
127131

128132
def test_publish_dataset(self):

0 commit comments

Comments
 (0)