Skip to content

Commit 2f625fa

Browse files
committed
Merge pull request #14 from mfeurer/feature/download_run
Add API calls for /run/list/ and /run/{run_id}/
2 parents f83307f + 6b95a17 commit 2f625fa

4 files changed

Lines changed: 211 additions & 15 deletions

File tree

openml/apiconnector.py

Lines changed: 163 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from collections import OrderedDict
2-
import hashlib
32
import logging
43
import os
54
import re
@@ -25,8 +24,11 @@
2524
from .entities.dataset import OpenMLDataset
2625
from .entities.task import OpenMLTask
2726
from .entities.split import OpenMLSplit
27+
from .entities.run import OpenMLRun
2828
from .util import is_string
2929

30+
import numpy as np
31+
3032
logger = logging.getLogger(__name__)
3133

3234
OPENML_URL = "http://api_new.openml.org/v1/"
@@ -806,16 +808,163 @@ def _download_split(self, task, cache_file):
806808

807809
def _create_task_cache_dir(self, task_id):
808810
task_cache_dir = os.path.join(self.task_cache_dir, str(task_id))
811+
809812
try:
810813
os.makedirs(task_cache_dir)
811814
except (IOError, OSError):
812815
# TODO add debug information!
813816
pass
814817
return task_cache_dir
815818

816-
def _perform_api_call(self, call, data=None, filePath=None, add_authentication=True):
819+
############################################################################
820+
# Runs
821+
def get_runs_list(self, task_id=None, flow_id=None, setup_id=None):
    """Return a list of all runs for either a task, flow or setup.

    Exactly one of the optional parameters must be given.

    Parameters
    ----------
    task_id : int, optional
    flow_id : int, optional
    setup_id : int, optional

    Returns
    -------
    list of dict
        One dict per run with the keys ``run_id``, ``task_id``,
        ``setup_id``, ``implementation_id`` and ``uploader`` (all ints),
        sorted by ``run_id``.

    Raises
    ------
    ValueError
        If not exactly one of *task_id*, *flow_id* and *setup_id* is
        given.
    """
    # Exactly one filter must be set, i.e. exactly two of the three
    # arguments must still be None.  Plain sum() replaces the former
    # np.nansum() call -- a list of booleans never contains NaN, so
    # numpy was not needed here.
    if sum([task_id is None, flow_id is None, setup_id is None]) != 2:
        raise ValueError("Exactly one of task_id, flow_id and setup_id "
                         "must be given.")

    call = "run/list"
    if task_id is not None:
        call += "?task_id=%d" % task_id
    elif flow_id is not None:
        # The server-side query parameter for a flow is named
        # implementation_id.
        call += "?implementation_id=%d" % flow_id
    elif setup_id is not None:
        call += "?setup_id=%d" % setup_id

    return_code, xml_string = self._perform_api_call(call)
    runs_dict = xmltodict.parse(xml_string)

    # Minimalistic check if the XML is useful
    assert runs_dict['oml:runs']['@xmlns:oml'] == \
        'http://openml.org/openml'

    run_list = runs_dict['oml:runs']['oml:run']
    if isinstance(run_list, dict):
        # xmltodict returns a plain dict when the response holds a single
        # run; normalize to a list so the loop below works for single-run
        # responses as well (previously the wrapped list was discarded
        # and the loop iterated the dict's string keys).
        run_list = [run_list]
    assert type(run_list) == list, type(run_list)

    runs = []
    for run_ in run_list:
        run = {'run_id': int(run_['oml:run_id']),
               'task_id': int(run_['oml:task_id']),
               'setup_id': int(run_['oml:setup_id']),
               'implementation_id': int(run_['oml:implementation_id']),
               'uploader': int(run_['oml:uploader'])}
        runs.append(run)
    runs.sort(key=lambda t: t['run_id'])

    return runs
875+
876+
def download_run(self, run_id):
    """Download the OpenML run for a given run ID.

    The run description XML is cached on disk; a cached copy is used
    when present, otherwise the description is fetched from the server
    and written to the cache.

    Parameters
    ----------
    run_id : int
        The OpenML run id.

    Returns
    -------
    OpenMLRun

    Raises
    ------
    ValueError
        If *run_id* cannot be cast to an int, or if a cached run
        description differs from the one the server returns.
    """
    try:
        run_id = int(run_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except`` (which would also swallow e.g.
        # KeyboardInterrupt) and fixed the message, which previously
        # talked about a task ID.
        raise ValueError("Run ID is neither an Integer nor can be "
                         "cast to an Integer.")

    xml_file = os.path.join(self._create_run_cache_dir(run_id),
                            "run.xml")

    try:
        with open(xml_file) as fh:
            run = self._create_run_from_xml(fh.read())
    except (OSError, IOError):
        # Cache miss: fetch the run description from the server.
        try:
            return_code, run_xml = self._perform_api_call(
                "run/%d" % run_id)
        except (URLError, UnicodeEncodeError) as e:
            print(e)
            raise e

        # Cache the xml run file; an already-cached copy must be
        # identical to what the server returned, otherwise the cache is
        # stale or corrupted.
        if os.path.exists(xml_file):
            with open(xml_file) as fh:
                local_xml = fh.read()

            if run_xml != local_xml:
                raise ValueError("Run description of run %d cached at %s "
                                 "has changed." % (run_id, xml_file))
        else:
            with open(xml_file, "w") as fh:
                fh.write(run_xml)

        run = self._create_run_from_xml(run_xml)

    return run
921+
922+
def _create_run_cache_dir(self, run_id):
    """Return the cache directory for *run_id*, creating it if needed.

    NOTE(review): this joins onto ``self.task_cache_dir``, so run caches
    live inside the task cache tree and a run ID can collide with a task
    ID directory -- presumably a copy/paste leftover from
    ``_create_task_cache_dir``; confirm whether a dedicated run cache
    directory should be used instead.
    """
    cache_dir = os.path.join(self.task_cache_dir, str(run_id))
    try:
        os.makedirs(cache_dir)
    except (IOError, OSError):
        # The directory most likely exists already.
        # TODO add debug information!
        pass
    return cache_dir
931+
932+
def _create_run_from_xml(self, xml):
    """Create an OpenMLRun object from a run description XML string.

    Parameters
    ----------
    xml : str
        Run description XML as returned by the ``/run/{run_id}`` API
        call.

    Returns
    -------
    OpenMLRun
    """
    dic = xmltodict.parse(xml)[u"oml:run"]

    # Collect the dataset IDs of all input data.
    # NOTE(review): assumes every entry under oml:input_data carries an
    # oml:did -- confirm against the server's response schema.
    datasets = []
    for key in dic[u'oml:input_data']:
        dataset = dic[u'oml:input_data'][key]
        did = dataset[u'oml:did']
        datasets.append(did)

    # NOTE(review): with a single tag xmltodict yields a plain string
    # and this loop would iterate its characters -- confirm runs always
    # carry more than one tag, or normalize as done for run lists.
    tags = []
    for tag in dic[u"oml:tag"]:
        tags.append(tag)

    # Map output file names to their download URLs.
    files = dict()
    for file_ in dic[u"oml:output_data"][u"oml:file"]:
        name = file_[u"oml:name"]
        url = file_[u"oml:url"]
        files[name] = url

    # Removed a leftover Python-2-style debug statement
    # (``print dic.keys()``), which is a syntax error under Python 3.
    # Evaluation measures map to (scalar value, array value) pairs; one
    # of the two entries may be None.
    evaluations = dict()
    for evaluation in dic[u"oml:output_data"][u"oml:evaluation"]:
        name = evaluation[u"oml:name"]
        value = evaluation.get(u"oml:value")
        value_array = evaluation.get(u"oml:array_data")
        evaluations[name] = (value, value_array)

    return OpenMLRun(
        dic[u"oml:run_id"], dic[u"oml:uploader"],
        dic[u"oml:task_id"], dic[u"oml:implementation_id"],
        dic[u"oml:setup_string"], dic[u'oml:setup_id'],
        tags, datasets, files, evaluations)
963+
964+
############################################################################
965+
# Internal stuff
966+
def _perform_api_call(self, call, data=None, file_path=None):
967+
"""Perform an API call at the OpenML server.
819968
return self._read_url(url, data=data, filePath=filePath,
820969
def _read_url(self, url, add_authentication=False, data=None, filePath=None):
821970
@@ -840,21 +989,21 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
840989
if not url.endswith("/"):
841990
url += "/"
842991
url += call
843-
return self._read_url(url, data=data,filePath= filePath)
992+
return self._read_url(url, data=data, file_path=file_path)
844993

845-
def _read_url(self, url, data=None, filePath=None):
994+
def _read_url(self, url, data=None, file_path=None):
846995
if data is None:
847996
data = {}
848997
data['session_hash'] = self.config.get('FAKE_SECTION', 'apikey')
849998

850-
if filePath is not None:
851-
if os.path.isabs(filePath):
999+
if file_path is not None:
1000+
if os.path.isabs(file_path):
8521001
try:
8531002
decoder = arff.ArffDecoder()
8541003
except:
8551004
raise "The file you provided is not a valid arff file"
8561005

857-
fileElement={'dataset': open(filePath, 'rb')}
1006+
fileElement={'dataset': open(file_path, 'rb')}
8581007
data['description']= data.get('description')
8591008
data.pop('dataset', None)
8601009

@@ -866,6 +1015,7 @@ def _read_url(self, url, data=None, filePath=None):
8661015
return response.status_code, response
8671016
else:
8681017
raise "File doesn't exists"
1018+
8691019
else:
8701020
data = urlencode(data)
8711021
data = data.encode('utf-8')
@@ -907,10 +1057,11 @@ def _read_url(self, url, data=None, filePath=None):
9071057
string.write(chunk)
9081058
return return_code, string.getvalue()
9091059

910-
def upload_dataset(self, description, filePath=None):
1060+
def upload_dataset(self, description, file_path=None):
9111061
try:
9121062
data = {'description': description}
913-
return_code, dataset_xml = self._perform_api_call("/data/",data=data, filePath=filePath)
1063+
return_code, dataset_xml = self._perform_api_call(
1064+
"/data/", data=data, file_path=file_path)
9141065

9151066
except URLError as e:
9161067
# TODO logger.debug
@@ -921,7 +1072,8 @@ def upload_dataset(self, description, filePath=None):
9211072
def upload_flow(self, description, binary, source):
9221073
try:
9231074
data = {'description': description, 'binary': binary, 'source': source}
924-
return_code, dataset_xml = self._perform_api_call("openml.implementation.upload", data=data)
1075+
return_code, dataset_xml = self._perform_api_call(
1076+
"openml.implementation.upload", data=data)
9251077

9261078
except URLError as e:
9271079
# TODO logger.debug

openml/entities/run.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
class OpenMLRun(object):
    """Container for the description of a single OpenML run.

    Every constructor argument is stored as a same-named attribute;
    *flow_id* holds the run's implementation ID.
    """

    def __init__(self, run_id, uploader, task_id, flow_id, setup_string,
                 setup_id, tags, datasets, files, evaluations):
        # Identifiers of the run and of the entities it links together.
        self.run_id = run_id
        self.task_id = task_id
        self.flow_id = flow_id
        self.setup_id = setup_id
        self.uploader = uploader
        # Textual description of the setup that produced the run.
        self.setup_string = setup_string
        # Free-form metadata and result artefacts.
        self.tags = tags
        self.datasets = datasets
        self.files = files
        self.evaluations = evaluations

source/progress.rst

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ API calls
1313
API call implemented tested properly tested loads json proper error handling
1414
=============================================== =========== ====== =============== ========== =====================
1515
/data/list/ yes yes
16-
/data/list/active/
1716
/data/list/tag/{tag}
1817
/data/{data_id} yes yes
1918
/data/delete/
@@ -26,7 +25,6 @@ API call implemented tested properly test
2625
/data/tag
2726
/data/untag
2827
/task/list yes yes
29-
/task/list/active
3028
/task/list/tag/{tag}
3129
/task/{task_id} yes yes
3230
/task/tag
@@ -40,8 +38,8 @@ API call implemented tested properly test
4038
/flow/
4139
/flow/exists/{name,ext_version}
4240
/flow/owned
43-
/run/list
44-
/run/{run_id}
41+
/run/list yes yes
42+
/run/{run_id} yes yes
4543
/run
4644
/run/tag
4745
/run/untag

tests/test_apiconnector.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,39 @@ def test_download_split(self):
221221
self.assertTrue(os.path.exists(
222222
os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))
223223

224+
############################################################################
225+
# Runs
226+
def test_download_run_list(self):
    """Smoke-test get_runs_list for each of the three filter arguments."""

    def check_run(run):
        # Every entry must be a dict with exactly the five expected keys.
        self.assertIsInstance(run, dict)
        self.assertEqual(len(run), 5)

    # The lower bounds below reflect counts observed on openml.org when
    # this test was written; don't trust the counters on the website as
    # they also include private datasets.
    runs = self.connector.get_runs_list(task_id=1)
    self.assertGreaterEqual(len(runs), 800)
    for run in runs:
        check_run(run)

    runs = self.connector.get_runs_list(flow_id=1)
    self.assertGreaterEqual(len(runs), 1)
    for run in runs:
        check_run(run)

    runs = self.connector.get_runs_list(setup_id=1)
    self.assertGreaterEqual(len(runs), 261)
    for run in runs:
        check_run(run)
248+
249+
def test_download_run(self):
    """Download a known run and sanity-check the parsed fields."""
    run = self.connector.download_run(473350)
    self.assertGreaterEqual(len(run.tags), 2)
    self.assertEqual(len(run.datasets), 1)
    self.assertGreaterEqual(len(run.files), 2)
    self.assertGreaterEqual(len(run.evaluations), 18)
    # Each evaluation entry is a (value, array_data) pair.
    self.assertEqual(len(run.evaluations['f_measure']), 2)
256+
224257
def test_upload_dataset(self):
225258

226259
dataset = self.connector.download_dataset(3)

0 commit comments

Comments
 (0)