FIX #286, add test, simplify code, cover corner case

mfeurer · mfeurer · commit 67880dd3e979 · 2017-10-05T14:21:44.000+02:00
diff --git a/openml/__init__.py b/openml/__init__.py
@@ -24,6 +24,7 @@
 from . import setups
 from . import study
 from . import evaluations
+from . import utils
 from .runs import OpenMLRun
 from .tasks import OpenMLTask, OpenMLSplit
 from .flows import OpenMLFlow
diff --git a/openml/testing.py b/openml/testing.py
@@ -5,6 +5,8 @@
 import time
 import unittest
 
+import six
+
 import openml
 
 
@@ -78,5 +80,15 @@ def _add_sentinel_to_flow_name(self, flow, sentinel=None):
 
         return flow, sentinel
 
+    def _check_dataset(self, dataset):
+        """Assert ``dataset`` is a dict with a valid 'did' and 'status'."""
+        self.assertEqual(type(dataset), dict)
+        self.assertGreaterEqual(len(dataset), 2)
+        for key, expected_type in (('did', int), ('status', six.string_types)):
+            self.assertIn(key, dataset)
+            self.assertIsInstance(dataset[key], expected_type)
+        known_states = ['in_preparation', 'active', 'deactivated']
+        self.assertIn(dataset['status'], known_states)
+
 
 __all__ = ['TestBase']
diff --git a/openml/utils.py b/openml/utils.py
@@ -1,5 +1,7 @@
 import six
 
+from openml.exceptions import OpenMLServerException
+
 
 def extract_xml_tags(xml_tag_name, node, allow_none=True):
     """Helper to extract xml tags from xmltodict.
@@ -39,31 +41,48 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True):
             raise ValueError("Could not find tag '%s' in node '%s'" %
                              (xml_tag_name, str(node)))
             
-def list_all(listing_call, *args, **filters):
+def list_all(listing_call, *args, **filters):
     """Helper to handle paged listing requests.
-    Example usage: evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)
-    Note: I wanted to make this a generator, but this is not possible since all listing calls return dicts
+
+    Example usage:
+
+    ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)``
+
+    Note: I wanted to make this a generator, but this is not possible since all
+    listing calls return dicts
     
     Parameters
     ----------
-    listing_call : object
-        Name of the listing call, e.g. list_evaluations
+    listing_call : callable
+        Call listing, e.g. list_evaluations.
     *args : Variable length argument list
-        Any required arguments for the listing call
+        Any required arguments for the listing call.
     **filters : Arbitrary keyword arguments
-        Any filters that need to be applied
+        Any filters that can be applied to the listing function. The key
+        ``batch_size`` (int, default: 10000) is popped and used as page size.
         
     Returns
     -------
-    object
+    dict
+
+    Raises
+    ------
+    OpenMLServerException
+        If the first request fails, or a later one is not 'No results'.
     """
-    batch_size = 10000
+    batch_size = filters.pop('batch_size', 10000)
     page = 0
-    has_more = 1
     result = {}
-    while has_more:
-        new_batch = listing_call(*args, size=batch_size, offset=batch_size*page, **filters)
+
+    while True:
+        try:
+            new_batch = listing_call(
+                *args, size=batch_size, offset=batch_size * page, **filters)
+        except OpenMLServerException as e:
+            # 'No results' after the first page only means paging is done.
+            if page == 0 or e.args[0] != 'No results':
+                raise
+            break
         result.update(new_batch)
         page += 1
-        has_more = (len(new_batch) == batch_size)
     return result
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -95,16 +95,6 @@ def test_get_cached_dataset_arff_not_cached(self):
                                 openml.datasets.functions._get_cached_dataset_arff,
                                 3)
 
-    def _check_dataset(self, dataset):
-            self.assertEqual(type(dataset), dict)
-            self.assertGreaterEqual(len(dataset), 2)
-            self.assertIn('did', dataset)
-            self.assertIsInstance(dataset['did'], int)
-            self.assertIn('status', dataset)
-            self.assertIsInstance(dataset['status'], six.string_types)
-            self.assertIn(dataset['status'], ['in_preparation', 'active',
-                                              'deactivated'])
-
     def test_list_datasets(self):
         # We can only perform a smoke test here because we test on dynamic
         # data from the internet...