Skip to content

Commit 500f80f

Browse files
committed
usability improvements
Better error on task XML parsing; rename OpenMLDataset.get_dataset to OpenMLDataset.get_data; some docstrings.
1 parent d148173 commit 500f80f

7 files changed

Lines changed: 186 additions & 159 deletions

File tree

openml/_api_calls.py

Lines changed: 0 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,63 +5,6 @@
55
from . import config
66

77

8-
"""
9-
Provides an interface to the OpenML server.
10-
11-
All parameters of the APIConnector can be either specified in a config
12-
file or when creating this object. The config file must be placed in a
13-
directory ``.openml`` inside the users home directory and have the name
14-
``config``. If one of the parameters is specified by passing it to the
15-
constructor of this class, it will override the value specified in the
16-
configuration file.
17-
18-
Parameters
19-
----------
20-
cache_directory : string, optional (default=None)
21-
A local directory which will be used for caching. If this is not set, a
22-
directory '.openml/cache' in the users home directory will be used.
23-
If either directory does not exist, it will be created.
24-
25-
apikey : string, optional (default=None)
26-
Your OpenML API key which will be used to authenticate you at the OpenML
27-
server.
28-
29-
server : string, optional (default=None)
30-
The OpenML server to connect to.
31-
32-
verbosity : int, optional (default=None)
33-
34-
configure_logger : bool (default=True)
35-
Whether the python logging module should be configured by the openml
36-
package. If set to true, this is a very basic configuration,
37-
which only prints to the standard output. This is only recommended
38-
for testing or small problems. It is set to True to adhere to the
39-
`specifications of the OpenML client API
40-
<https://github.com/openml/OpenML/wiki/Client-API>`_.
41-
When the openml module is used as a library, it is recommended that
42-
the main application controls the logging level, e.g. see
43-
`here <http://pieces.openpolitics.com
44-
/2012/04/python-logging-best-practices/>`_.
45-
46-
private_directory : str, optional (default=None)
47-
A local directory which can be accessed through the OpenML package.
48-
Useful to access private datasets through the same interface.
49-
50-
Raises
51-
------
52-
ValueError
53-
If apikey is neither specified in the config nor given as an argument.
54-
OpenMLServerError
55-
If the OpenML server returns an unexpected response.
56-
57-
Notes
58-
-----
59-
Testing the API calls in Firefox is possible with the Firefox AddOn
60-
HTTPRequestor.
61-
62-
"""
63-
64-
658
def _perform_api_call(call, data=None, file_dictionary=None,
669
file_elements=None, add_authentication=True):
6710
"""

openml/config.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Stores module-level information like the API key, cache directory, private
3+
directory and the server.
4+
"""
15
import os
26
import sys
37
import logging
@@ -21,6 +25,16 @@
2125

2226

2327
def _setup():
28+
"""Setup openml package. Called on first import.
29+
30+
Reads the config file and sets up apikey, server, cache appropriately.
31+
key and server can be set by the user simply using
32+
openml.config.apikey = THEIRKEY
33+
openml.config.server = SOMESERVER
34+
The cache dir needs to be set up by calling set_cache_directory
35+
because it needs some setup.
36+
We could also make it a property but that's less clear.
37+
"""
2438
global apikey
2539
global server
2640
# read config file, create cache directory
@@ -38,6 +52,24 @@ def _setup():
3852

3953

4054
def set_cache_directory(cachedir, privatedir):
55+
"""Set module-wide cache directory.
56+
57+
Sets the cache directory into which to download datasets, tasks etc.
58+
Also sets the private directory for storing local datasets.
59+
60+
Parameters
61+
----------
62+
cachedir : string
63+
Path to use as cache directory.
64+
65+
privatedir : string
66+
Path containing private datasets, tasks, etc.
67+
68+
See also
69+
--------
70+
get_cache_directory
71+
get_private_directory
72+
"""
4173
global _cachedir
4274
global _privatedir
4375
_cachedir = cachedir
@@ -67,6 +99,8 @@ def set_cache_directory(cachedir, privatedir):
6799

68100

69101
def _parse_config():
102+
"""Parse the config file, set up defaults.
103+
"""
70104
defaults = {'apikey': apikey,
71105
'server': server,
72106
'verbosity': 0,
@@ -99,10 +133,34 @@ def _parse_config():
99133

100134

101135
def get_cache_directory():
136+
"""Get the current cache directory.
137+
138+
Returns
139+
-------
140+
cachedir : string
141+
The current cache directory.
142+
143+
See also
144+
--------
145+
set_cache_directory
146+
get_private_directory
147+
"""
102148
return _cachedir
103149

104150

105151
def get_private_directory():
152+
"""Get the current private directory.
153+
154+
Returns
155+
-------
156+
privatedir : string
157+
The current private directory.
158+
159+
See also
160+
--------
161+
set_cache_directory
162+
get_cache_directory
163+
"""
106164
return _privatedir
107165

108166
__all__ = ["set_cache_directory", 'get_cache_directory', 'get_private_directory']

openml/datasets/dataset.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,18 @@
2222

2323

2424
class OpenMLDataset(object):
25-
25+
"""Dataset object.
26+
27+
Allows fetching and uploading datasets to OpenML.
28+
29+
Parameters
30+
----------
31+
name : string
32+
Name of the dataset
33+
description : string
34+
Description of the dataset
35+
FIXME : which of these do we actually need?
36+
"""
2637
def __init__(self, id=None, name=None, version=None, description=None,
2738
format=None, creator=None, contributor=None,
2839
collection_date=None, upload_date=None, language=None,
@@ -63,7 +74,7 @@ def __init__(self, id=None, name=None, version=None, description=None,
6374
logger.debug("Data pickle file already exists.")
6475
else:
6576
try:
66-
data = self.get_arff()
77+
data = self._get_arff()
6778
except OSError as e:
6879
logger.critical("Please check that the data file %s is there "
6980
"and can be read.", self.data_file)
@@ -98,9 +109,7 @@ def __eq__(self, other):
98109
else:
99110
return False
100111

101-
##########################################################################
102-
# ARFF related stuff
103-
def get_arff(self):
112+
def _get_arff(self):
104113
# TODO: add a partial read method which only returns the attribute
105114
# headers of the corresponding .arff file!
106115

@@ -124,11 +133,20 @@ def decode_arff(fh):
124133
with open(filename) as fh:
125134
return decode_arff(fh)
126135

127-
##########################################################################
128-
def get_dataset(self, target=None, target_dtype=int, include_row_id=False,
129-
include_ignore_attributes=False,
130-
return_categorical_indicator=False,
131-
return_attribute_names=False):
136+
def get_data(self, target=None, target_dtype=int, include_row_id=False,
137+
include_ignore_attributes=False,
138+
return_categorical_indicator=False,
139+
return_attribute_names=False):
140+
"""Returns dataset content as numpy arrays / sparse matrices.
141+
142+
Parameters
143+
----------
144+
145+
146+
Returns
147+
-------
148+
149+
"""
132150
rval = []
133151

134152
path = self.data_pickle_file
@@ -224,6 +242,13 @@ def retrieve_class_labels(self):
224242
return None
225243

226244
def publish(self):
245+
"""Publish the dataset on the OpenML server.
246+
247+
Upload the dataset description and dataset content to openml.
248+
249+
Returns
250+
-------
251+
"""
227252
data = {'description': self.to_xml()}
228253
if self.data_file is not None:
229254
return_code, return_value = _perform_api_call(

openml/runs/run.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ def publish(self):
7777
description_xml = self.create_description_xml()
7878
data = {'predictions': predictions, 'description':
7979
description_xml}
80-
return_code, dataset_xml = _perform_api_call(
80+
return_code, return_value = _perform_api_call(
8181
"/run/", file_elements=data)
82-
return return_code, dataset_xml
82+
return return_code, return_value
8383

8484
def create_description_xml(self):
8585
run_environment = _get_version_information()
@@ -311,7 +311,7 @@ def _create_run_from_xml(xml):
311311
raise ValueError('No URL to download predictions for run %d in run '
312312
'description XML' % run_id)
313313
evaluations = dict()
314-
detailed_evaluations = defaultdict(lambda : defaultdict(dict))
314+
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
315315
evaluation_flows = dict()
316316
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
317317
key = evaluation_dict['oml:name']

openml/tasks/task_functions.py

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -154,33 +154,35 @@ def _list_tasks(api_call):
154154
'"oml:runs"/@xmlns:oml is not '
155155
'"http://openml.org/openml": %s'
156156
% str(tasks_dict))
157-
158-
tasks = []
159-
procs = get_estimation_procedure_list()
160-
proc_dict = dict((x['id'], x) for x in procs)
161-
for task_ in tasks_dict['oml:tasks']['oml:task']:
162-
task = {'tid': int(task_['oml:task_id']),
163-
'did': int(task_['oml:did']),
164-
'name': task_['oml:name'],
165-
'task_type': task_['oml:task_type'],
166-
'status': task_['oml:status']}
167-
168-
# Other task inputs
169-
for input in task_.get('oml:input', list()):
170-
if input['@name'] == 'estimation_procedure':
171-
task[input['@name']] = proc_dict[int(input['#text'])]['name']
172-
else:
173-
value = input.get('#text')
174-
task[input['@name']] = value
175-
176-
task[input['@name']] = input['#text']
177-
178-
# The number of qualities can range from 0 to infinity
179-
for quality in task_.get('oml:quality', list()):
180-
quality['#text'] = float(quality['#text'])
181-
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
182-
quality['#text'] = int(quality['#text'])
183-
task[quality['@name']] = quality['#text']
157+
try:
158+
tasks = []
159+
procs = get_estimation_procedure_list()
160+
proc_dict = dict((x['id'], x) for x in procs)
161+
for task_ in tasks_dict['oml:tasks']['oml:task']:
162+
task = {'tid': int(task_['oml:task_id']),
163+
'did': int(task_['oml:did']),
164+
'name': task_['oml:name'],
165+
'task_type': task_['oml:task_type'],
166+
'status': task_['oml:status']}
167+
168+
# Other task inputs
169+
for input in task_.get('oml:input', list()):
170+
if input['@name'] == 'estimation_procedure':
171+
task[input['@name']] = proc_dict[int(input['#text'])]['name']
172+
else:
173+
value = input.get('#text')
174+
task[input['@name']] = value
175+
176+
task[input['@name']] = input['#text']
177+
178+
# The number of qualities can range from 0 to infinity
179+
for quality in task_.get('oml:quality', list()):
180+
quality['#text'] = float(quality['#text'])
181+
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
182+
quality['#text'] = int(quality['#text'])
183+
task[quality['@name']] = quality['#text']
184+
except KeyError as e:
185+
raise KeyError("Invalid xml for task: %s" % e)
184186

185187
tasks.append(task)
186188
tasks.sort(key=lambda t: t['tid'])

openml/testing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ class TestBase(unittest.TestCase):
99
1010
Note
1111
----
12-
A config file with the username and password must be present to test the
13-
API calls.
12+
Currently hard-codes a read-write key.
13+
Hopefully soon allows using a test server, not the production server.
1414
"""
1515

1616
def setUp(self):

0 commit comments

Comments
 (0)