Lazy download of data splits (#659)

Neeratyoy · mfeurer · commit 6b5dfe626b1f · 2019-04-09T17:27:46.000+02:00
* Added comments in examples for dataset 68 belonging to only test server
* Added comment in flow and run example for dataset 68 belonging to only test server
* Making download of datasplits optional and adding a relevant unit test
* Adding error handling for task ID type
* Changes suggested by Matthias on PR #659
* Removing inappropriate dataset check from test case
* Fixing docstring
* Fixing whitespace issue for PEP8
diff --git a/examples/datasets_tutorial.py b/examples/datasets_tutorial.py
@@ -45,6 +45,7 @@
 
 # This is done based on the dataset ID ('did').
 dataset = openml.datasets.get_dataset(68)
+# NOTE: Dataset 68 exists on the test server https://test.openml.org/d/68
 
 # Print a summary
 print("This is dataset '%s', the target feature is '%s'" %
@@ -84,7 +85,7 @@
 # Whenever you use any functionality that requires the data,
 # such as `get_data`, the data will be downloaded.
 dataset = openml.datasets.get_dataset(68, download_data=False)
-
+# NOTE: Dataset 68 exists on the test server https://test.openml.org/d/68
 
 ############################################################################
 # Exercise 2
diff --git a/examples/flows_and_runs_tutorial.py b/examples/flows_and_runs_tutorial.py
@@ -15,6 +15,7 @@
 #
 # Train a scikit-learn model on the data manually.
 
+# NOTE: Dataset 68 exists on the test server https://test.openml.org/d/68
 dataset = openml.datasets.get_dataset(68)
 X, y = dataset.get_data(
     dataset_format='array',
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -277,35 +277,51 @@ def __list_tasks(api_call):
     return tasks
 
 
-def get_tasks(task_ids):
+def get_tasks(task_ids, download_data=True):
     """Download tasks.
 
     This function iterates :meth:`openml.tasks.get_task`.
 
     Parameters
     ----------
     task_ids : iterable
-        Integers representing task ids.
+        Integers/Strings representing task ids.
+    download_data : bool
+        Option to trigger download of data along with the meta data.
 
     Returns
     -------
     list
     """
     tasks = []
     for task_id in task_ids:
-        tasks.append(get_task(task_id))
+        tasks.append(get_task(task_id, download_data))
     return tasks
 
 
-def get_task(task_id):
-    """Download the OpenML task for a given task ID.
+def get_task(task_id, download_data=True):
+    """Download OpenML task for a given task ID.
+
+    Downloads the task representation. The data splits are downloaded
+    along with it only if `download_data` is True; otherwise they are
+    downloaded later, when the task is used.
 
     Parameters
     ----------
-    task_id : int
+    task_id : int or str
         The OpenML task id.
+    download_data : bool
+        Option to trigger download of data along with the meta data.
+
+    Returns
+    -------
+    task
     """
-    task_id = int(task_id)
+    try:
+        task_id = int(task_id)
+    except (ValueError, TypeError):
+        raise ValueError("Dataset ID is neither an Integer nor can be "
+                         "cast to an Integer.")
 
     with lockutils.external_lock(
             name='task.functions.get_task:%d' % task_id,
@@ -317,14 +333,18 @@ def get_task(task_id):
 
         try:
             task = _get_task_description(task_id)
-            dataset = get_dataset(task.dataset_id)
+            dataset = get_dataset(task.dataset_id, download_data)
+            # List of class labels available in dataset description
+            # Including class labels as part of task meta data handles
+            #   the case where data download was initially disabled
+            if isinstance(task, OpenMLClassificationTask):
+                task.class_labels = \
+                    dataset.retrieve_class_labels(task.target_name)
             # Clustering tasks do not have class labels
             # and do not offer download_split
-            if isinstance(task, OpenMLSupervisedTask):
-                task.download_split()
-                if isinstance(task, OpenMLClassificationTask):
-                    task.class_labels = \
-                        dataset.retrieve_class_labels(task.target_name)
+            if download_data:
+                if isinstance(task, OpenMLSupervisedTask):
+                    task.download_split()
         except Exception as e:
             openml.utils._remove_cache_dir_for_id(
                 TASKS_CACHE_DIR_NAME,
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
@@ -129,6 +129,27 @@ def test_get_task(self):
             self.workdir, 'org', 'openml', 'test', "datasets", "1", "dataset.arff"
         )))
 
+    def test_get_task_lazy(self):
+        task = openml.tasks.get_task(2, download_data=False)
+        self.assertIsInstance(task, OpenMLTask)
+        self.assertTrue(os.path.exists(os.path.join(
+            self.workdir, 'org', 'openml', 'test', "tasks", "2", "task.xml",
+        )))
+        self.assertEqual(task.class_labels, ['1', '2', '3', '4', '5', 'U'])
+
+        self.assertFalse(os.path.exists(os.path.join(
+            self.workdir, 'org', 'openml', 'test', "tasks", "2", "datasplits.arff"
+        )))
+        # Since the download_data=False is propagated to get_dataset
+        self.assertFalse(os.path.exists(os.path.join(
+            self.workdir, 'org', 'openml', 'test', "datasets", "2", "dataset.arff"
+        )))
+
+        task.download_split()
+        self.assertTrue(os.path.exists(os.path.join(
+            self.workdir, 'org', 'openml', 'test', "tasks", "2", "datasplits.arff"
+        )))
+
     @mock.patch('openml.tasks.functions.get_dataset')
     def test_removal_upon_download_failure(self, get_dataset):
         class WeirdException(Exception):

Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`	`#`
`16`	`16`	`# Train a scikit-learn model on the data manually.`
`17`	`17`
	`18`	`+# NOTE: Dataset 68 exists on the test server https://test.openml.org/d/68`
`18`	`19`	`dataset = openml.datasets.get_dataset(68)`
`19`	`20`	`X, y = dataset.get_data(`
`20`	`21`	`dataset_format='array',`