Merge pull request #286 from openml/joaquinvanschoren-patch-listall

mfeurer · web-flow · commit c86b9e57752c · 2017-10-05T19:20:23.000+02:00
Added a function to automatically loop over paged calls
diff --git a/openml/__init__.py b/openml/__init__.py
@@ -24,6 +24,7 @@
 from . import setups
 from . import study
 from . import evaluations
+from . import utils
 from .runs import OpenMLRun
 from .tasks import OpenMLTask, OpenMLSplit
 from .flows import OpenMLFlow
diff --git a/openml/testing.py b/openml/testing.py
@@ -5,6 +5,8 @@
 import time
 import unittest
 
+import six
+
 import openml
 
 
@@ -78,5 +80,15 @@ def _add_sentinel_to_flow_name(self, flow, sentinel=None):
 
         return flow, sentinel
 
+    def _check_dataset(self, dataset):
+        """Smoke-check one dataset listing entry (a dict with did/status)."""
+        valid_status = ['in_preparation', 'active', 'deactivated']
+        self.assertEqual(type(dataset), dict)
+        self.assertGreaterEqual(len(dataset), 2)
+        for key, kind in (('did', int), ('status', six.string_types)):
+            self.assertIn(key, dataset)
+            self.assertIsInstance(dataset[key], kind)
+        self.assertIn(dataset['status'], valid_status)
+
 
 __all__ = ['TestBase']
diff --git a/openml/utils.py b/openml/utils.py
@@ -1,5 +1,7 @@
 import six
 
+from openml.exceptions import OpenMLServerException
+
 
 def extract_xml_tags(xml_tag_name, node, allow_none=True):
     """Helper to extract xml tags from xmltodict.
@@ -37,4 +39,50 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True):
             return None
         else:
             raise ValueError("Could not find tag '%s' in node '%s'" %
-                             (xml_tag_name, str(node)))
+                             (xml_tag_name, str(node)))
+            
+def list_all(listing_call, batch_size=10000, *args, **filters):
+    """Collect every page of a paged listing call into a single dict.
+
+    Example usage (``batch_size`` is the second positional parameter, so it
+    has to be supplied before any positional argument of the listing call):
+
+    ``list_all(list_evaluations, 10000, "predictive_accuracy", task=mytask)``
+
+    This is not a generator because all listing calls return dicts.
+
+    Parameters
+    ----------
+    listing_call : callable
+        Listing function, e.g. list_evaluations.
+    batch_size : int (default: 10000)
+        Number of entries requested per page.
+    *args : Variable length argument list
+        Any required arguments for the listing call.
+    **filters : Arbitrary keyword arguments
+        Any filters that can be applied to the listing function.
+
+    Returns
+    -------
+    dict
+        Union of all pages; later pages overwrite equal keys.
+
+    Raises
+    ------
+    OpenMLServerException
+        If the very first page fails with the message 'No results'.
+    """
+    page, result = 0, {}
+    while True:
+        try:
+            new_batch = listing_call(*args, size=batch_size,
+                                     offset=batch_size * page, **filters)
+        except OpenMLServerException as e:
+            if page == 0 and e.args[0] == 'No results':
+                raise  # bare raise keeps the original traceback
+            break  # any other server exception ends the paging
+        result.update(new_batch)
+        if not new_batch:
+            break  # empty page: nothing left, avoid looping forever
+        page += 1
+    return result
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -95,16 +95,6 @@ def test_get_cached_dataset_arff_not_cached(self):
                                 openml.datasets.functions._get_cached_dataset_arff,
                                 3)
 
-    def _check_dataset(self, dataset):
-            self.assertEqual(type(dataset), dict)
-            self.assertGreaterEqual(len(dataset), 2)
-            self.assertIn('did', dataset)
-            self.assertIsInstance(dataset['did'], int)
-            self.assertIn('status', dataset)
-            self.assertIsInstance(dataset['status'], six.string_types)
-            self.assertIn(dataset['status'], ['in_preparation', 'active',
-                                              'deactivated'])
-
     def test_list_datasets(self):
         # We can only perform a smoke test here because we test on dynamic
         # data from the internet...