Merge pull request #160 from openml/paginationsupport

mfeurer · web-flow · commit 53e8331e78ec · 2016-09-02T09:51:38.000+02:00
dataset and task pagination, issue #142
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -1,7 +1,6 @@
-from .functions import (list_datasets, list_datasets_by_tag,
-                        check_datasets_active, get_datasets, get_dataset)
+from .functions import (list_datasets, check_datasets_active,
+                        get_datasets, get_dataset)
 from .dataset import OpenMLDataset
 
 __all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
-           'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
-           'list_datasets']
+           'OpenMLDataset', 'list_datasets']
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -42,7 +42,7 @@ def _list_cached_datasets():
             dataset_directory_content = os.listdir(directory_name)
 
             if "dataset.arff" in dataset_directory_content and \
-                    "description.xml" in dataset_directory_content:
+                            "description.xml" in dataset_directory_content:
                 if dataset_id not in datasets:
                     datasets.append(dataset_id)
 
@@ -111,13 +111,22 @@ def _get_cached_dataset_arff(dataset_id):
                                "cached" % dataset_id)
 
 
-def list_datasets():
+def list_datasets(offset=None, size=None, tag=None):
     """Return a list of all dataset which are on OpenML.
 
+    Parameters
+    ----------
+    offset : int, optional
+        the number of datasets to skip, starting from the first
+    size : int, optional
+        the maximum number of datasets to show
+    tag : str, optional
+        the tag to include
+
     Returns
     -------
     datasets : list of dicts
-        A list of all datasets.
+        A list of datasets having the given tag (if applicable).
 
         Every dataset is represented by a dictionary containing
         the following information:
@@ -127,22 +136,17 @@ def list_datasets():
         If qualities are calculated for the dataset, some of
         these are also returned.
     """
-    return _list_datasets("data/list")
-
+    api_call = "data/list"
+    if offset is not None:
+        api_call += "/offset/%d" % int(offset)
 
-def list_datasets_by_tag(tag):
-    """Return all datasets having the given tag.
+    if size is not None:
+       api_call += "/limit/%d" % int(size)
 
-    Returns
-    -------
-    datasets : list of dicts
-        A list of all datasets having the given tag. Every dataset is
-        represented by a dictionary containing the following information:
-        dataset id, and status. If qualities are calculated for the dataset,
-        some of these are also returned.
+    if tag is not None:
+        api_call += "/tag/%s" % tag
 
-    """
-    return _list_datasets("data/list/%s" % tag)
+    return _list_datasets(api_call)
 
 
 def _list_datasets(api_call):
@@ -154,7 +158,7 @@ def _list_datasets(api_call):
     assert type(datasets_dict['oml:data']['oml:dataset']) == list, \
         type(datasets_dict['oml:data'])
     assert datasets_dict['oml:data']['@xmlns:oml'] == \
-        'http://openml.org/openml'
+           'http://openml.org/openml'
 
     datasets = []
     for dataset_ in datasets_dict['oml:data']['oml:dataset']:
diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py
@@ -1,7 +1,5 @@
 from .task import OpenMLTask
 from .split import OpenMLSplit
-from .functions import (get_task, list_tasks, list_tasks_by_type,
-                        list_tasks_by_tag)
+from .functions import (get_task, list_tasks)
 
-__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'list_tasks_by_type',
-           'list_tasks_by_tag', 'OpenMLSplit']
+__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'OpenMLSplit']
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -89,61 +89,44 @@ def _get_estimation_procedure_list():
     return procs
 
 
-def list_tasks_by_type(task_type_id):
-    """Return a list of all tasks for a given tasks type which are on OpenML.
+def list_tasks(task_type_id=None, offset=None, size=None, tag=None):
+    """Return a number of tasks having the given tag and task_type_id
 
     Parameters
     ----------
-    task_type_id : int
+    task_type_id : int, optional
         ID of the task type as detailed
         `here <http://www.openml.org/search?type=task_type>`_.
+    offset : int, optional
+        the number of tasks to skip, starting from the first
+    size : int, optional
+        the maximum number of tasks to show
+    tag : str, optional
+        the tag to include
 
     Returns
     -------
     list
-        A list of all tasks of the given task type. Every task is represented by
-        a dictionary containing the following information: task id,
-        dataset id, task_type and status. If qualities are calculated for
-        the associated dataset, some of these are also returned.
+        A list of all tasks having the given task_type_id and the given tag.
+        Every task is represented by a dictionary containing the following
+        information: task id, dataset id, task_type and status. If qualities
+        are calculated for the associated dataset, some of these are also
+        returned.
     """
-    try:
-        task_type_id = int(task_type_id)
-    except:
-        raise ValueError("Task Type ID is neither an Integer nor can be "
-                         "cast to an Integer.")
-    return _list_tasks("task/list/type/%d" % task_type_id)
-
-
-def list_tasks_by_tag(tag):
-    """Return all tasks having the given tag
-
-    Parameters
-    ----------
-    tag : str
+    api_call = "task/list"
+    if task_type_id is not None:
+        api_call += "/task_type_id/%d" % int(task_type_id)
 
-    Returns
-    -------
-    list
-        A list of all tasks having a give tag. Every task is represented by
-        a dictionary containing the following information: task id,
-        dataset id, task_type and status. If qualities are calculated for
-        the associated dataset, some of these are also returned.
-    """
-    return _list_tasks("task/list/tag/%s" % tag)
+    if offset is not None:
+        api_call += "/offset/%d" % int(offset)
 
+    if size is not None:
+        api_call += "/limit/%d" % int(size)
 
-def list_tasks():
-    """Return a list of all tasks which are on OpenML.
+    if tag is not None:
+        api_call += "/tag/%s" % tag
 
-    Returns
-    -------
-    list
-        A list of all tasks. Every task is represented by a
-        dictionary containing the following information: task id,
-        dataset id, task_type and status. If qualities are calculated for
-        the associated dataset, some of these are also returned.
-    """
-    return _list_tasks('task/list')
+    return _list_tasks(api_call)
 
 
 def _list_tasks(api_call):
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
@@ -88,7 +88,7 @@ def test_list_datasets(self):
                                               'deactivated'])
 
     def test_list_datasets_by_tag(self):
-        datasets = openml.datasets.list_datasets_by_tag('uci')
+        datasets = openml.datasets.list_datasets(tag='uci')
         self.assertGreaterEqual(len(datasets), 5)
         for dataset in datasets:
             self.assertEqual(type(dataset), dict)
@@ -100,6 +100,22 @@ def test_list_datasets_by_tag(self):
             self.assertIn(dataset['status'], ['in_preparation', 'active',
                                               'deactivated'])
 
+    def test_list_datasets_paginate(self):
+        size = 10
+        max = 100
+        for i in range(0, max, size):
+            data = openml.datasets.list_datasets(offset=i, size=size)
+            self.assertGreaterEqual(size, len(data))
+            for dataset in data:
+                self.assertEqual(type(dataset), dict)
+                self.assertGreaterEqual(len(dataset), 2)
+                self.assertIn('did', dataset)
+                self.assertIsInstance(dataset['did'], int)
+                self.assertIn('status', dataset)
+                self.assertTrue(is_string(dataset['status']))
+                self.assertIn(dataset['status'], ['in_preparation',
+                                                  'active', 'deactivated'])
+
     @unittest.skip('See https://github.com/openml/openml-python/issues/149')
     def test_check_datasets_active(self):
         active = openml.datasets.check_datasets_active([1, 17])
diff --git a/tests/tasks/test_task_functions.py b/tests/tasks/test_task_functions.py
@@ -51,13 +51,13 @@ def _check_task(self, task):
                       ['in_preparation', 'active', 'deactivated'])
 
     def test_list_tasks_by_type(self):
-        tasks = openml.tasks.list_tasks_by_type(task_type_id=3)
+        tasks = openml.tasks.list_tasks(task_type_id=3)
         self.assertGreaterEqual(len(tasks), 300)
         for task in tasks:
             self._check_task(task)
 
     def test_list_tasks_by_tag(self):
-        tasks = openml.tasks.list_tasks_by_tag('basic')
+        tasks = openml.tasks.list_tasks(tag='basic')
         self.assertGreaterEqual(len(tasks), 57)
         for task in tasks:
             self._check_task(task)
@@ -68,6 +68,42 @@ def test_list_tasks(self):
         for task in tasks:
             self._check_task(task)
 
+    def test_list_tasks_paginate(self):
+        size = 10
+        max = 100
+        for i in range(0, max, size):
+            tasks = openml.tasks.list_tasks(offset=i, size=size)
+            self.assertGreaterEqual(size, len(tasks))
+            for task in tasks:
+                self.assertEqual(type(task), dict)
+                self.assertGreaterEqual(len(task), 4)
+                self.assertIn('tid', task)
+                self.assertIsInstance(task['tid'], int)
+                self.assertIn('did', task)
+                self.assertIsInstance(task['did'], int)
+                self.assertIn('status', task)
+                self.assertTrue(is_string(task['status']))
+                self.assertIn(task['status'], ['in_preparation', 'active', 'deactivated'])
+
+    def test_list_tasks_per_type_paginate(self):
+        size = 10
+        max = 100
+        task_types = 5
+        for j in range(1,task_types):
+            for i in range(0, max, size):
+                tasks = openml.tasks.list_tasks(task_type_id=j, offset=i, size=size)
+                self.assertGreaterEqual(size, len(tasks))
+                for task in tasks:
+                    self.assertEqual(type(task), dict)
+                    self.assertGreaterEqual(len(task), 4)
+                    self.assertIn('tid', task)
+                    self.assertIsInstance(task['tid'], int)
+                    self.assertIn('did', task)
+                    self.assertIsInstance(task['did'], int)
+                    self.assertIn('status', task)
+                    self.assertTrue(is_string(task['status']))
+                    self.assertIn(task['status'], ['in_preparation', 'active', 'deactivated'])
+
     def test__get_task(self):
         openml.config.set_cache_directory(self.static_cache_dir)
         task = openml.tasks.get_task(1882)