Merge pull request #98 from openml/features/finish_api

mfeurer · mfeurer · commit 755c52b887e1 · 2016-03-17T19:22:12.000+01:00
Features/finish api
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -16,16 +16,16 @@ API call                                        implemented tested properly test
 /data/features/{id}                             yes         yes
 /data/qualities/{id}                            yes         yes
 /data/list/                                     yes         yes
-/data/list/tag/{tag}
+/data/list/tag/{tag}                            yes         yes
 /data/upload/                                   yes         yes
 /data/tag
 /data/untag
 /data/delete/                                   X
 
 /task/{task}                                    yes         yes
 /task/list                                      yes         yes
-/task/list/type/{id}
-/task/list/tag/{tag}
+/task/list/type/{id}                            yes         yes
+/task/list/tag/{tag}                            yes         yes
 /task {POST}
 /task/tag
 /task/untag
diff --git a/openml/config.py b/openml/config.py
@@ -35,7 +35,6 @@ def _setup():
     private_dir = config.get('FAKE_SECTION', 'private_directory')
     cache_dir = config.get('FAKE_SECTION', 'cachedir')
     set_cache_directory(cache_dir, private_dir)
-    print(config)
 
 
 def set_cache_directory(cachedir, privatedir):
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -1,10 +1,11 @@
-from .functions import (list_datasets, check_datasets_active,
-                        get_datasets, get_dataset,
+from .functions import (list_datasets, list_datasets_by_tag,
+                        check_datasets_active, get_datasets, get_dataset,
                         get_dataset_description,
                         get_dataset_features, get_dataset_qualities)
 from .dataset import OpenMLDataset
 
 __all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
            'get_datasets_arf', 'get_dataset_features',
            'get_dataset_qualities', 'OpenMLDataset', 'list_datasets',
+           'list_datasets_by_tag',
            'get_dataset_description', 'list_datasets']
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -110,14 +110,33 @@ def list_datasets():
 
     Returns
     -------
-    datasets : list
+    list
         A list of all datasets. Every dataset is represented by a
         dictionary containing the following information: dataset id,
         and status. If qualities are calculated for the dataset, some of
         these are also returned.
     """
+    return _list_datasets("data/list")
+
+
+def list_datasets_by_tag(tag):
+    """Return all datasets having the given tag.
+
+    Returns
+    -------
+    list
+        A list of all datasets having the given tag. Every dataset is
+        represented by a dictionary containing the following information:
+        dataset id, and status. If qualities are calculated for the dataset,
+        some of these are also returned.
+
+    """
+    return _list_datasets("data/list/tag/%s" % tag)
+
+
+def _list_datasets(api_call):
     # TODO add proper error handling here!
-    return_code, xml_string = _perform_api_call("data/list/")
+    return_code, xml_string = _perform_api_call(api_call)
     datasets_dict = xmltodict.parse(xml_string)
 
     # Minimalistic check if the XML is useful
diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py
@@ -1,5 +1,7 @@
 from .task import OpenMLTask
 from .split import OpenMLSplit
-from .task_functions import get_task, list_tasks
+from .task_functions import get_task, list_tasks, list_tasks_by_type, \
+    list_tasks_by_tag
 
-__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'OpenMLSplit']
+__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'list_tasks_by_type',
+           'list_tasks_by_tag', 'OpenMLSplit']
diff --git a/openml/tasks/task_functions.py b/openml/tasks/task_functions.py
@@ -81,20 +81,20 @@ def get_estimation_procedure_list():
     return procs
 
 
-def list_tasks(task_type_id=1):
-    """Return a list of all tasks which are on OpenML.
+def list_tasks_by_type(task_type_id):
+    """Return a list of all tasks for a given task type which are on OpenML.
 
     Parameters
     ----------
     task_type_id : int
         ID of the task type as detailed
-        `here <http://openml.org/api/?f=openml.task.types>`_.
+        `here <http://www.openml.org/search?type=task_type>`_.
 
     Returns
     -------
-    tasks : list
-        A list of all tasks. Every task is represented by a
-        dictionary containing the following information: task id,
+    list
+        A list of all tasks of the given task type. Every task is represented by
+        a dictionary containing the following information: task id,
         dataset id, task_type and status. If qualities are calculated for
         the associated dataset, some of these are also returned.
     """
@@ -103,14 +103,57 @@ def list_tasks(task_type_id=1):
     except:
         raise ValueError("Task Type ID is neither an Integer nor can be "
                          "cast to an Integer.")
+    return _list_tasks("task/list/type/%d" % task_type_id)
 
-    return_code, xml_string = _perform_api_call(
-        "task/list/type/%d" % task_type_id)
+
+def list_tasks_by_tag(tag):
+    """Return all tasks having the given tag
+
+    Parameters
+    ----------
+    tag : str
+
+    Returns
+    -------
+    list
+        A list of all tasks having a given tag. Every task is represented by
+        a dictionary containing the following information: task id,
+        dataset id, task_type and status. If qualities are calculated for
+        the associated dataset, some of these are also returned.
+    """
+    return _list_tasks("task/list/tag/%s" % tag)
+
+
+def list_tasks():
+    """Return a list of all tasks which are on OpenML.
+
+    Returns
+    -------
+    list
+        A list of all tasks. Every task is represented by a
+        dictionary containing the following information: task id,
+        dataset id, task_type and status. If qualities are calculated for
+        the associated dataset, some of these are also returned.
+    """
+    return _list_tasks('task/list')
+
+
+def _list_tasks(api_call):
+    return_code, xml_string = _perform_api_call(api_call)
     tasks_dict = xmltodict.parse(xml_string)
     # Minimalistic check if the XML is useful
-    assert tasks_dict['oml:tasks']['@xmlns:oml'] == \
-        'http://openml.org/openml'
-    assert type(tasks_dict['oml:tasks']['oml:task']) == list
+    if 'oml:tasks' not in tasks_dict:
+        raise ValueError('Error in return XML, does not contain "oml:tasks": '
+                         '%s' % str(tasks_dict))
+    elif '@xmlns:oml' not in tasks_dict['oml:tasks']:
+        raise ValueError('Error in return XML, does not contain '
+                         '"oml:tasks"/@xmlns:oml: %s'
+                         % str(tasks_dict))
+    elif tasks_dict['oml:tasks']['@xmlns:oml'] != 'http://openml.org/openml':
+        raise ValueError('Error in return XML, value of '
+                         '"oml:tasks"/@xmlns:oml is not '
+                         '"http://openml.org/openml": %s'
+                         % str(tasks_dict))
 
     tasks = []
     procs = get_estimation_procedure_list()
@@ -127,7 +170,8 @@ def list_tasks(task_type_id=1):
             if input['@name'] == 'estimation_procedure':
                 task[input['@name']] = proc_dict[int(input['#text'])]['name']
             else:
-                task[input['@name']] = input['#text']
+                value = input.get('#text')
+                task[input['@name']] = value
 
         task[input['@name']] = input['#text']
 
diff --git a/tests/examples/test_OpenMLDemo.py b/tests/examples/test_OpenMLDemo.py
@@ -45,7 +45,7 @@ def test_notebook(self):
                 msg = 'Error executing the notebook "%s". ' % notebook_filename
                 msg += 'See notebook "%s" for the traceback.\n\n' % notebook_filename_out
                 msg += e.traceback
-                self.fail(msg)
+                self.fail(msg)
             finally:
                 with open(notebook_filename_out, mode='wt') as f:
                     nbformat.write(nb, f)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -53,7 +53,20 @@ def test_list_datasets(self):
         # data from the internet...
         datasets = openml.datasets.list_datasets()
         # 1087 as the number of datasets on openml.org
-        self.assertTrue(len(datasets) >= 1087)
+        self.assertGreaterEqual(len(datasets), 1087)
+        for dataset in datasets:
+            self.assertEqual(type(dataset), dict)
+            self.assertGreaterEqual(len(dataset), 2)
+            self.assertIn('did', dataset)
+            self.assertIsInstance(dataset['did'], int)
+            self.assertIn('status', dataset)
+            self.assertTrue(is_string(dataset['status']))
+            self.assertIn(dataset['status'], ['in_preparation', 'active',
+                                              'deactivated'])
+
+    def test_list_datasets_by_tag(self):
+        datasets = openml.datasets.list_datasets_by_tag('uci')
+        self.assertGreaterEqual(len(datasets), 5)
         for dataset in datasets:
             self.assertEqual(type(dataset), dict)
             self.assertGreaterEqual(len(dataset), 2)
diff --git a/tests/test_task.py b/tests/test_task.py
@@ -7,29 +7,36 @@
 
 
 class TestTask(TestBase):
+    def _check_task(self, task):
+        self.assertEqual(type(task), dict)
+        self.assertGreaterEqual(len(task), 2)
+        self.assertIn('did', task)
+        self.assertIsInstance(task['did'], int)
+        self.assertIn('status', task)
+        self.assertTrue(is_string(task['status']))
+        self.assertIn(task['status'],
+                      ['in_preparation', 'active', 'deactivated'])
+
     def test_list_tasks(self):
-        # We can only perform a smoke test here because we test on dynamic
-        # data from the internet...
-        def check_task(task):
-            self.assertEqual(type(task), dict)
-            self.assertGreaterEqual(len(task), 2)
-            self.assertIn('did', task)
-            self.assertIsInstance(task['did'], int)
-            self.assertIn('status', task)
-            self.assertTrue(is_string(task['status']))
-            self.assertIn(task['status'],
-                          ['in_preparation', 'active', 'deactivated'])
-
-        # use a small task type as we cant limit tasks.
-        # TODO inspect the tasks maybe?
-        tasks = openml.tasks.list_tasks(task_type_id=3)
+        tasks = openml.tasks.list_tasks()
+        self.assertGreaterEqual(len(tasks), 2000)
+        for task in tasks:
+            self._check_task(task)
+
+    def test_list_tasks_by_type(self):
+        tasks = openml.tasks.list_tasks_by_type(task_type_id=3)
         self.assertGreaterEqual(len(tasks), 300)
         for task in tasks:
-            check_task(task)
+            self._check_task(task)
+
+    def test_list_tasks_by_tag(self):
+        tasks = openml.tasks.list_tasks_by_tag('basic')
+        self.assertGreaterEqual(len(tasks), 57)
+        for task in tasks:
+            self._check_task(task)
 
     def test_get_task(self):
         task = openml.tasks.get_task(1)
-        print(task)
         self.assertTrue(os.path.exists(
             os.path.join(os.getcwd(), "tasks", "1", "task.xml")))
         self.assertTrue(os.path.exists(