Add /data/list/tag/{tag} API call

mfeurer · mfeurer · commit c3a3766db45d · 2016-03-17T18:07:46.000+01:00
diff --git a/openml/config.py b/openml/config.py
@@ -35,7 +35,6 @@ def _setup():
     private_dir = config.get('FAKE_SECTION', 'private_directory')
     cache_dir = config.get('FAKE_SECTION', 'cachedir')
     set_cache_directory(cache_dir, private_dir)
-    print(config)
 
 
 def set_cache_directory(cachedir, privatedir):
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -1,10 +1,11 @@
-from .functions import (list_datasets, check_datasets_active,
-                        get_datasets, get_dataset,
+from .functions import (list_datasets, list_datasets_by_tag,
+                        check_datasets_active, get_datasets, get_dataset,
                         get_dataset_description,
                         get_dataset_features, get_dataset_qualities)
 from .dataset import OpenMLDataset
 
 __all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
            'get_datasets_arf', 'get_dataset_features',
            'get_dataset_qualities', 'OpenMLDataset', 'list_datasets',
+           'list_datasets_by_tag',
            'get_dataset_description', 'list_datasets']
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -110,14 +110,33 @@ def list_datasets():
 
     Returns
     -------
-    datasets : list
+    list
         A list of all datasets. Every dataset is represented by a
         dictionary containing the following information: dataset id,
         and status. If qualities are calculated for the dataset, some of
         these are also returned.
     """
+    return _list_datasets("data/list")
+
+
+def list_datasets_by_tag(tag):
+    """Return all datasets having the given tag.
+
+    Parameters
+    ----------
+    tag : str
+
+    Returns
+    -------
+    list
+        One dict per dataset: id, status and, if calculated, some qualities.
+    """
+    return _list_datasets("data/list/tag/%s" % tag)
+
+
+def _list_datasets(api_call):
     # TODO add proper error handling here!
-    return_code, xml_string = _perform_api_call("data/list/")
+    return_code, xml_string = _perform_api_call(api_call)
     datasets_dict = xmltodict.parse(xml_string)
 
     # Minimalistic check if the XML is useful
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -53,7 +53,20 @@ def test_list_datasets(self):
         # data from the internet...
         datasets = openml.datasets.list_datasets()
         # 1087 as the number of datasets on openml.org
-        self.assertTrue(len(datasets) >= 1087)
+        self.assertGreaterEqual(len(datasets), 1087)
+        for dataset in datasets:
+            self.assertEqual(type(dataset), dict)
+            self.assertGreaterEqual(len(dataset), 2)
+            self.assertIn('did', dataset)
+            self.assertIsInstance(dataset['did'], int)
+            self.assertIn('status', dataset)
+            self.assertTrue(is_string(dataset['status']))
+            self.assertIn(dataset['status'], ['in_preparation', 'active',
+                                              'deactivated'])
+
+    def test_list_datasets_by_tag(self):
+        datasets = openml.datasets.list_datasets_by_tag('uci')
+        self.assertGreaterEqual(len(datasets), 5)
         for dataset in datasets:
             self.assertEqual(type(dataset), dict)
             self.assertGreaterEqual(len(dataset), 2)