implementation of run traces

janvanrijn · janvanrijn · commit 08f8ecbfc47e · 2017-05-05T16:46:15.000+02:00
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -51,6 +51,18 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
     return _read_url(url, data)
 
 
+def fileid_to_url(file_id, filename=None):
+    '''
+     Presents the URL how to download a given file id
+     filename is optional
+    '''
+    openml_url = config.server.split('/api/')
+    url = openml_url[0] + '/data/download/%s' %file_id
+    if filename is not None:
+        url += '/' + filename
+    return url
+
+
 def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
     """do a post request to url with data, file content of
     file_dictionary and sending file_elements as files"""
diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py
@@ -1,4 +1,6 @@
 from .run import OpenMLRun
-from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run)
+from .trace import OpenMLRunTrace, OpenMLTraceIteration
+from .functions import (run_task, get_run, list_runs, get_runs,
+                        initialize_model_from_run, initialize_model_from_trace)
 
 __all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'get_runs']
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -6,6 +6,8 @@
 import warnings
 import sklearn
 import time
+import six
+import json
 
 from ..exceptions import PyOpenMLError
 from .. import config
@@ -15,8 +17,9 @@
 
 from ..exceptions import OpenMLCacheException, OpenMLServerException
 from ..util import URLError, version_complies
-from .._api_calls import _perform_api_call
+from .._api_calls import _perform_api_call, fileid_to_url
 from .run import OpenMLRun, _get_version_information
+from .trace import OpenMLRunTrace, OpenMLTraceIteration
 
 
 # _get_version_info, _get_dict and _create_setup_string are in run.py to avoid
@@ -112,6 +115,45 @@ def initialize_model_from_run(run_id):
     run = get_run(run_id)
     return initialize_model(run.setup_id)
 
+def initialize_model_from_trace(run_id, repeat, fold, iteration=None):
+    '''
+    Initialized a model based on the parameters that were set
+    by an optimization procedure (i.e., using the exact same
+    parameter settings)
+
+    Parameters
+        ----------
+        run_id : int
+            The Openml run_id. Should contain a trace file
+
+        repeat: int
+            The repeat nr (column in trace file)
+
+        fold: int
+            The fold nr (column in trace file)
+
+        iteration: int
+            The iteration nr (column in trace file)
+
+        Returns
+        -------
+        model : sklearn model
+            the scikitlearn model with all parameters initailized
+        '''
+    run = get_run(run_id)
+    if 'trace' not in run.output_files:
+        raise PyOpenMLError('Run does not contain trace file')
+    trace_url = fileid_to_url(run.output_files['trace'], 'trace.arff')
+    #print(trace_url)
+    trace_xml = _perform_api_call('run/trace/%d' %run_id)
+    run_trace = _create_trace_from_description(trace_xml)
+
+    request = (repeat, fold, iteration)
+    if request not in run_trace.trace_iterations:
+        raise ValueError('Combination repeat, fold, iteration not availavle')
+    current = run_trace.trace_iterations[(repeat, fold, iteration)]
+    
+
 def _run_exists(task_id, setup_id):
     '''
     Checks whether a task/setup combination is already present on the server.
@@ -306,7 +348,7 @@ def _extract_arfftrace(model, rep_no, fold_no):
         arff_line = [rep_no, fold_no, itt_no, test_score, selected]
         for key in model.cv_results_:
             if key.startswith("param_"):
-                arff_line.append(str(model.cv_results_[key][itt_no]))
+                arff_line.append(sklearn_to_flow(model.cv_results_[key][itt_no]))
         arff_tracecontent.append(arff_line)
     return arff_tracecontent
 
@@ -326,15 +368,20 @@ def _extract_arfftrace_attributes(model):
 
     # model dependent attributes for trace arff
     for key in model.cv_results_:
-        if key.startswith("param_"):
+        if key.startswith('param_'):
+            # supported types should include all types, including bool, int float
+            supported_types = (bool, int, float, six.string_types)
             if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
                 type = ['True', 'False']
             elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
                 type = 'NUMERIC'
+            elif all(isinstance(i, supported_types) or i is None for i in model.cv_results_[key]):
+                type = 'STRING'
             else:
-                values = list(set(model.cv_results_[key]))  # unique values
-                type = [str(i) for i in values]
+                raise TypeError('Unsupported param type in param grid')
 
+            # we renamed the attribute param to parameter, as this is a required
+            # OpenML convention
             attribute = ("parameter_" + key[6:], type)
             trace_attributes.append(attribute)
     return trace_attributes
@@ -439,45 +486,52 @@ def _create_run_from_xml(xml):
 
     dataset_id = int(run['oml:input_data']['oml:dataset']['oml:did'])
 
-    predictions_url = None
-    if isinstance(run['oml:output_data']['oml:file'], dict):
-        # only one result.. probably due to an upload error
-        file_dict = run['oml:output_data']['oml:file']
-        if file_dict['oml:name'] == 'predictions':
-            predictions_url = file_dict['oml:url']
-    else:
-        # multiple files, the normal case
-        for file_dict in run['oml:output_data']['oml:file']:
-            if file_dict['oml:name'] == 'predictions':
-                predictions_url = file_dict['oml:url']
-    if predictions_url is None:
-        raise ValueError('No URL to download predictions for run %d in run '
-                         'description XML' % run_id)
+    files = dict()
     evaluations = dict()
     detailed_evaluations = defaultdict(lambda: defaultdict(dict))
-    evaluation_flows = dict()
-    if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
-        for evaluation_dict in run['oml:output_data']['oml:evaluation']:
-            key = evaluation_dict['oml:name']
-            if 'oml:value' in evaluation_dict:
-                value = float(evaluation_dict['oml:value'])
-            elif 'oml:array_data' in evaluation_dict:
-                value = evaluation_dict['oml:array_data']
-            else:
-                raise ValueError('Could not find keys "value" or "array_data" '
-                                 'in %s' % str(evaluation_dict.keys()))
-
-            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
-                repeat = int(evaluation_dict['@repeat'])
-                fold = int(evaluation_dict['@fold'])
-                repeat_dict = detailed_evaluations[key]
-                fold_dict = repeat_dict[repeat]
-                fold_dict[fold] = value
-            else:
-                evaluations[key] = value
-                evaluation_flows[key] = flow_id
+    if 'oml:output_data' not in run:
+        raise ValueError('Run does not contain output_data (OpenML server error?)')
+    else:
+        if isinstance(run['oml:output_data']['oml:file'], dict):
+            # only one result.. probably due to an upload error
+            file_dict = run['oml:output_data']['oml:file']
+            files[file_dict['oml:name']] = int(file_dict['oml:file_id'])
+        else:
+            # multiple files, the normal case
+            for file_dict in run['oml:output_data']['oml:file']:
+                files[file_dict['oml:name']] = int(file_dict['oml:file_id'])
+        if 'oml:evaluation' in run['oml:output_data']:
+            # in normal cases there should be evaluations, but in case there
+            # was an error these could be absent
+            for evaluation_dict in run['oml:output_data']['oml:evaluation']:
+                key = evaluation_dict['oml:name']
+                if 'oml:value' in evaluation_dict:
+                    value = float(evaluation_dict['oml:value'])
+                elif 'oml:array_data' in evaluation_dict:
+                    value = evaluation_dict['oml:array_data']
+                else:
+                    raise ValueError('Could not find keys "value" or "array_data" '
+                                     'in %s' % str(evaluation_dict.keys()))
+
+                if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
+                    repeat = int(evaluation_dict['@repeat'])
+                    fold = int(evaluation_dict['@fold'])
+                    repeat_dict = detailed_evaluations[key]
+                    fold_dict = repeat_dict[repeat]
+                    fold_dict[fold] = value
+                else:
+                    evaluations[key] = value
+
+    if 'description' not in files:
+        raise ValueError('No description file for run %d in run '
+                         'description XML' % run_id)
+
+    if 'predictions' not in files:
+        # JvR: actually, I am not sure whether this error should be raised.
+        # a run can consist without predictions. But for now let's keep it
+        raise ValueError('No prediction files for run %d in run '
+                         'description XML' % run_id)
 
-            evaluation_flows[key] = flow_id
     tags = None
     if 'oml:tag' in run:
         if isinstance(run['oml:tag'], str):
@@ -487,18 +541,40 @@ def _create_run_from_xml(xml):
         else:
             raise ValueError('Received not string and non list as tag item')
 
-
     return OpenMLRun(run_id=run_id, uploader=uploader,
                      uploader_name=uploader_name, task_id=task_id,
                      task_type=task_type,
                      task_evaluation_measure=task_evaluation_measure,
                      flow_id=flow_id, flow_name=flow_name,
                      setup_id=setup_id, setup_string=setup_string,
                      parameter_settings=parameters,
-                     dataset_id=dataset_id, predictions_url=predictions_url,
+                     dataset_id=dataset_id, output_files=files,
                      evaluations=evaluations,
                      detailed_evaluations=detailed_evaluations, tags=tags)
 
+def _create_trace_from_description(xml):
+    result_dict = xmltodict.parse(xml)['oml:trace']
+
+    run_id = result_dict['oml:run_id']
+    trace = dict()
+
+    if 'oml:trace_iteration' not in result_dict:
+        raise ValueError('Run does not contain valid trace. ')
+
+    for itt in result_dict['oml:trace_iteration']:
+        repeat = int(itt['oml:repeat'])
+        fold = int(itt['oml:fold'])
+        iteration = int(itt['oml:iteration'])
+        setup_string = json.loads(itt['oml:setup_string'])
+        evaluation = float(itt['oml:evaluation'])
+        selected = bool(itt['oml:selected'])
+
+        current = OpenMLTraceIteration(repeat, fold, iteration,
+                                        setup_string, evaluation,
+                                        selected)
+        trace[(repeat, fold, iteration)] = current
+
+    return OpenMLRunTrace(run_id, trace)
 
 def _get_cached_run(run_id):
     """Load a run from the cache."""
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -4,7 +4,6 @@
 
 import arff
 import xmltodict
-from sklearn.base import BaseEstimator
 
 import openml
 from ..tasks import get_task
@@ -20,7 +19,7 @@ class OpenMLRun(object):
 
     """
     def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
-                 files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
+                 output_files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
                  evaluations=None, detailed_evaluations=None,
                  data_content=None, trace_attributes=None, trace_content=None,
                  model=None, task_type=None, task_evaluation_measure=None, flow_name=None,
@@ -37,10 +36,10 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
         self.setup_string = setup_string
         self.parameter_settings = parameter_settings
         self.dataset_id = dataset_id
-        self.predictions_url = predictions_url
         self.evaluations = evaluations
         self.detailed_evaluations = detailed_evaluations
         self.data_content = data_content
+        self.output_files = output_files
         self.trace_attributes = trace_attributes
         self.trace_content = trace_content
         self.error_message = None
diff --git a/openml/runs/trace.py b/openml/runs/trace.py
@@ -0,0 +1,39 @@
+
+
+class OpenMLRunTrace(object):
+    """OpenML Run Trace: parsed output from Run Trace call
+
+    Parameters
+    ----------
+    FIXME
+
+    """
+
+    def __init__(self, run_id, trace_iterations):
+        self.run_id = run_id
+        self.trace_iterations = trace_iterations
+
+    def __str__(self):
+        return '[Run id: %d, %d trace iterations]' %(self.run_id, len(self.trace_iterations))
+
+
+class OpenMLTraceIteration(object):
+    """OpenML Trace Iteration: parsed output from Run Trace call
+
+    Parameters
+    ----------
+    FIXME
+    """
+
+    def __init__(self, repeat, fold, iteration, setup_string, evaluation, selected):
+        self.repeat = repeat
+        self.fold = fold
+        self.iteration = iteration
+        self.setup_string = setup_string
+        self.evaluation = evaluation
+        self.selected = selected
+
+    def __str__(self):
+        return '[(%d,%d,%d): %f (%r)]' %(self.repeat, self.fold, self.iteration,
+                                          self.evaluation, self.selected)
+