Skip to content

Commit 6a1660b

Browse files
committed
Merge branch 'develop' into feature/upload-flow
2 parents beabca0 + a407b75 commit 6a1660b

12 files changed

Lines changed: 182 additions & 166 deletions

File tree

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Changelog
1414

1515
* Add this changelog (Matthias Feurer)
1616
* 2nd example notebook PyOpenML.ipynb (Joaquin Vanschoren)
17+
* Pagination support for list datasets and list tasks
1718

1819
API calls
1920
=========

examples/OpenMLDemo.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@
136136
"source": [
137137
"datasets = openml.datasets.list_datasets()\n",
138138
"\n",
139-
"data = pd.DataFrame(datasets)\n",
139+
"data = pd.DataFrame(datasets).transpose()\n",
140140
"print(\"First 10 of %s datasets...\" % len(datasets))\n",
141141
"print(data[:10][['did','name','NumberOfInstances','NumberOfFeatures']])"
142142
]
@@ -569,7 +569,7 @@
569569
"source": [
570570
"task_list = openml.tasks.list_tasks()\n",
571571
"\n",
572-
"tasks = pd.DataFrame(task_list)\n",
572+
"tasks = pd.DataFrame(task_list).transpose()\n",
573573
"print(\"First 5 of %s tasks:\" % len(tasks))\n",
574574
"print(tasks[:5][['tid','did','name','task_type','estimation_procedure']])"
575575
]
@@ -688,14 +688,14 @@
688688
"language_info": {
689689
"codemirror_mode": {
690690
"name": "ipython",
691-
"version": 3
691+
"version": 3.0
692692
},
693693
"file_extension": ".py",
694694
"mimetype": "text/x-python",
695695
"name": "python",
696696
"nbconvert_exporter": "python",
697697
"pygments_lexer": "ipython3",
698-
"version": "3.5.1"
698+
"version": "3.4.3"
699699
}
700700
},
701701
"nbformat": 4,

openml/datasets/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from .functions import (list_datasets, list_datasets_by_tag,
2-
check_datasets_active, get_datasets, get_dataset)
1+
from .functions import (list_datasets, check_datasets_active,
2+
get_datasets, get_dataset)
33
from .dataset import OpenMLDataset
44

55
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
6-
'OpenMLDataset', 'list_datasets', 'list_datasets_by_tag',
7-
'list_datasets']
6+
'OpenMLDataset', 'list_datasets']

openml/datasets/functions.py

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def _list_cached_datasets():
4242
dataset_directory_content = os.listdir(directory_name)
4343

4444
if "dataset.arff" in dataset_directory_content and \
45-
"description.xml" in dataset_directory_content:
45+
"description.xml" in dataset_directory_content:
4646
if dataset_id not in datasets:
4747
datasets.append(dataset_id)
4848

@@ -111,13 +111,22 @@ def _get_cached_dataset_arff(dataset_id):
111111
"cached" % dataset_id)
112112

113113

114-
def list_datasets():
114+
def list_datasets(offset=None, size=None, tag=None):
115115
"""Return a list of all dataset which are on OpenML.
116116
117+
Parameters
118+
----------
119+
offset : int, optional
120+
the number of datasets to skip, starting from the first
121+
size : int, optional
122+
the maximum number of datasets to show
123+
tag : str, optional
124+
the tag to include
125+
117126
Returns
118127
-------
119128
datasets : list of dicts
120-
A list of all datasets.
129+
A list of datasets having the given tag (if applicable).
121130
122131
Every dataset is represented by a dictionary containing
123132
the following information:
@@ -127,22 +136,17 @@ def list_datasets():
127136
If qualities are calculated for the dataset, some of
128137
these are also returned.
129138
"""
130-
return _list_datasets("data/list")
139+
api_call = "data/list"
140+
if offset is not None:
141+
api_call += "/offset/%d" % int(offset)
131142

143+
if size is not None:
144+
api_call += "/limit/%d" % int(size)
132145

133-
def list_datasets_by_tag(tag):
134-
"""Return all datasets having the given tag.
135-
136-
Returns
137-
-------
138-
datasets : list of dicts
139-
A list of all datasets having the given tag. Every dataset is
140-
represented by a dictionary containing the following information:
141-
dataset id, and status. If qualities are calculated for the dataset,
142-
some of these are also returned.
146+
if tag is not None:
147+
api_call += "/tag/%s" % tag
143148

144-
"""
145-
return _list_datasets("data/list/%s" % tag)
149+
return _list_datasets(api_call)
146150

147151

148152
def _list_datasets(api_call):
@@ -154,11 +158,12 @@ def _list_datasets(api_call):
154158
assert type(datasets_dict['oml:data']['oml:dataset']) == list, \
155159
type(datasets_dict['oml:data'])
156160
assert datasets_dict['oml:data']['@xmlns:oml'] == \
157-
'http://openml.org/openml'
161+
'http://openml.org/openml'
158162

159-
datasets = []
163+
datasets = dict()
160164
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
161-
dataset = {'did': int(dataset_['oml:did']),
165+
did = int(dataset_['oml:did'])
166+
dataset = {'did': did,
162167
'name': dataset_['oml:name'],
163168
'format': dataset_['oml:format'],
164169
'status': dataset_['oml:status']}
@@ -169,9 +174,7 @@ def _list_datasets(api_call):
169174
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
170175
quality['#text'] = int(quality['#text'])
171176
dataset[quality['@name']] = quality['#text']
172-
173-
datasets.append(dataset)
174-
datasets.sort(key=lambda t: t['did'])
177+
datasets[did] = dataset
175178

176179
return datasets
177180

openml/tasks/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from .task import OpenMLTask
22
from .split import OpenMLSplit
3-
from .functions import (get_task, list_tasks, list_tasks_by_type,
4-
list_tasks_by_tag)
3+
from .functions import (get_task, list_tasks)
54

6-
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'list_tasks_by_type',
7-
'list_tasks_by_tag', 'OpenMLSplit']
5+
__all__ = ['OpenMLTask', 'get_task', 'list_tasks', 'OpenMLSplit']

openml/tasks/functions.py

Lines changed: 31 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -89,61 +89,44 @@ def _get_estimation_procedure_list():
8989
return procs
9090

9191

92-
def list_tasks_by_type(task_type_id):
93-
"""Return a list of all tasks for a given tasks type which are on OpenML.
92+
def list_tasks(task_type_id=None, offset=None, size=None, tag=None):
93+
"""Return a number of tasks having the given tag and task_type_id
9494
9595
Parameters
9696
----------
97-
task_type_id : int
97+
task_type_id : int, optional
9898
ID of the task type as detailed
9999
`here <http://www.openml.org/search?type=task_type>`_.
100+
offset : int, optional
101+
the number of tasks to skip, starting from the first
102+
size : int, optional
103+
the maximum number of tasks to show
104+
tag : str, optional
105+
the tag to include
100106
101107
Returns
102108
-------
103109
list
104-
A list of all tasks of the given task type. Every task is represented by
105-
a dictionary containing the following information: task id,
106-
dataset id, task_type and status. If qualities are calculated for
107-
the associated dataset, some of these are also returned.
110+
A list of all tasks having the given task_type_id and the given tag.
111+
Every task is represented by a dictionary containing the following
112+
information: task id, dataset id, task_type and status. If qualities
113+
are calculated for the associated dataset, some of these are also
114+
returned.
108115
"""
109-
try:
110-
task_type_id = int(task_type_id)
111-
except:
112-
raise ValueError("Task Type ID is neither an Integer nor can be "
113-
"cast to an Integer.")
114-
return _list_tasks("task/list/type/%d" % task_type_id)
115-
116-
117-
def list_tasks_by_tag(tag):
118-
"""Return all tasks having the given tag
116+
api_call = "task/list"
117+
if task_type_id is not None:
118+
api_call += "/type/%d" % int(task_type_id)
119119

120-
Parameters
121-
----------
122-
tag : str
123-
124-
Returns
125-
-------
126-
list
127-
A list of all tasks having a give tag. Every task is represented by
128-
a dictionary containing the following information: task id,
129-
dataset id, task_type and status. If qualities are calculated for
130-
the associated dataset, some of these are also returned.
131-
"""
132-
return _list_tasks("task/list/tag/%s" % tag)
120+
if offset is not None:
121+
api_call += "/offset/%d" % int(offset)
133122

123+
if size is not None:
124+
api_call += "/limit/%d" % int(size)
134125

135-
def list_tasks():
136-
"""Return a list of all tasks which are on OpenML.
126+
if tag is not None:
127+
api_call += "/tag/%s" % tag
137128

138-
Returns
139-
-------
140-
list
141-
A list of all tasks. Every task is represented by a
142-
dictionary containing the following information: task id,
143-
dataset id, task_type and status. If qualities are calculated for
144-
the associated dataset, some of these are also returned.
145-
"""
146-
return _list_tasks('task/list')
129+
return _list_tasks(api_call)
147130

148131

149132
def _list_tasks(api_call):
@@ -162,12 +145,15 @@ def _list_tasks(api_call):
162145
'"oml:runs"/@xmlns:oml is not '
163146
'"http://openml.org/openml": %s'
164147
% str(tasks_dict))
148+
165149
try:
166-
tasks = []
150+
tasks = dict();
167151
procs = _get_estimation_procedure_list()
168152
proc_dict = dict((x['id'], x) for x in procs)
169153
for task_ in tasks_dict['oml:tasks']['oml:task']:
170-
task = {'tid': int(task_['oml:task_id']),
154+
tid = int(task_['oml:task_id'])
155+
task = {'tid': tid,
156+
'ttid': int(task_['oml:task_type_id']),
171157
'did': int(task_['oml:did']),
172158
'name': task_['oml:name'],
173159
'task_type': task_['oml:task_type'],
@@ -187,12 +173,10 @@ def _list_tasks(api_call):
187173
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
188174
quality['#text'] = int(quality['#text'])
189175
task[quality['@name']] = quality['#text']
190-
tasks.append(task)
176+
tasks[tid] = task
191177
except KeyError as e:
192178
raise KeyError("Invalid xml for task: %s" % e)
193179

194-
tasks.sort(key=lambda t: t['tid'])
195-
196180
return tasks
197181

198182

@@ -262,7 +246,7 @@ def _create_task_from_xml(xml):
262246
estimation_parameters[name] = text
263247

264248
return OpenMLTask(
265-
dic["oml:task_id"], dic["oml:task_type"],
249+
dic["oml:task_id"], dic['oml:task_type_id'], dic["oml:task_type"],
266250
inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
267251
inputs["source_data"]["oml:data_set"]["oml:target_feature"],
268252
inputs["estimation_procedure"]["oml:estimation_procedure"][

openml/tasks/task.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010

1111
class OpenMLTask(object):
12-
def __init__(self, task_id, task_type, data_set_id, target_name,
13-
estimation_procedure_type, data_splits_url,
12+
def __init__(self, task_id, task_type_id, task_type, data_set_id,
13+
target_name, estimation_procedure_type, data_splits_url,
1414
estimation_parameters, evaluation_measure, cost_matrix,
1515
class_labels=None):
1616
self.task_id = int(task_id)

tests/datasets/test_datasets.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,7 @@ def test_get_cached_dataset_arff_not_cached(self):
7171
openml.datasets.functions._get_cached_dataset_arff,
7272
3)
7373

74-
def test_list_datasets(self):
75-
# We can only perform a smoke test here because we test on dynamic
76-
# data from the internet...
77-
datasets = openml.datasets.list_datasets()
78-
# 1087 as the number of datasets on openml.org
79-
self.assertGreaterEqual(len(datasets), 1087)
80-
for dataset in datasets:
74+
def _check_dataset(self, dataset):
8175
self.assertEqual(type(dataset), dict)
8276
self.assertGreaterEqual(len(dataset), 2)
8377
self.assertIn('did', dataset)
@@ -87,18 +81,29 @@ def test_list_datasets(self):
8781
self.assertIn(dataset['status'], ['in_preparation', 'active',
8882
'deactivated'])
8983

84+
def test_list_datasets(self):
85+
# We can only perform a smoke test here because we test on dynamic
86+
# data from the internet...
87+
datasets = openml.datasets.list_datasets()
88+
# 1087 as the number of datasets on openml.org
89+
self.assertGreaterEqual(len(datasets), 1087)
90+
for did in datasets:
91+
self._check_dataset(datasets[did])
92+
9093
def test_list_datasets_by_tag(self):
91-
datasets = openml.datasets.list_datasets_by_tag('uci')
94+
datasets = openml.datasets.list_datasets(tag='uci')
9295
self.assertGreaterEqual(len(datasets), 5)
93-
for dataset in datasets:
94-
self.assertEqual(type(dataset), dict)
95-
self.assertGreaterEqual(len(dataset), 2)
96-
self.assertIn('did', dataset)
97-
self.assertIsInstance(dataset['did'], int)
98-
self.assertIn('status', dataset)
99-
self.assertTrue(is_string(dataset['status']))
100-
self.assertIn(dataset['status'], ['in_preparation', 'active',
101-
'deactivated'])
96+
for did in datasets:
97+
self._check_dataset(datasets[did])
98+
99+
def test_list_datasets_paginate(self):
100+
size = 10
101+
max = 100
102+
for i in range(0, max, size):
103+
datasets = openml.datasets.list_datasets(offset=i, size=size)
104+
self.assertGreaterEqual(size, len(datasets))
105+
for did in datasets:
106+
self._check_dataset(datasets[did])
102107

103108
@unittest.skip('See https://github.com/openml/openml-python/issues/149')
104109
def test_check_datasets_active(self):

0 commit comments

Comments
 (0)