Skip to content

Commit 2975384

Browse files
authored
Merge branch 'develop' into runtests
2 parents 2b2d34c + b4df819 commit 2975384

15 files changed

Lines changed: 61 additions & 116 deletions

File tree

openml/_api_calls.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import io
22
import os
33
import requests
4-
import arff
54
import warnings
5+
6+
import arff
67
import xmltodict
78

89
from . import config

openml/config.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
"""
22
Stores module level information like the API key, cache directory and the server.
33
"""
4-
import os
5-
import sys
64
import logging
5+
import os
6+
7+
from six import StringIO
8+
from six.moves import configparser
9+
710

811
logger = logging.getLogger(__name__)
912
logging.basicConfig(
@@ -15,12 +18,7 @@
1518
cachedir = ""
1619

1720

18-
if sys.version_info[0] < 3:
19-
import ConfigParser as configparser
20-
from StringIO import StringIO
21-
else:
22-
import configparser
23-
from io import StringIO
21+
2422

2523

2624
def _setup():

openml/datasets/dataset.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,11 @@
99

1010
import numpy as np
1111
import scipy.sparse
12+
from six.moves import cPickle as pickle
1213
import xmltodict
1314

1415
from .data_feature import OpenMLDataFeature
1516
from ..exceptions import PyOpenMLError
16-
17-
if sys.version_info[0] >= 3:
18-
import pickle
19-
else:
20-
try:
21-
import cPickle as pickle
22-
except:
23-
import pickle
24-
25-
26-
from ..util import is_string
2717
from .._api_calls import _perform_api_call
2818

2919
logger = logging.getLogger(__name__)
@@ -219,7 +209,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
219209
if not self.row_id_attribute:
220210
pass
221211
else:
222-
if is_string(self.row_id_attribute):
212+
if isinstance(self.row_id_attribute, six.string_types):
223213
to_exclude.append(self.row_id_attribute)
224214
else:
225215
to_exclude.extend(self.row_id_attribute)
@@ -243,7 +233,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
243233
if target is None:
244234
rval.append(data)
245235
else:
246-
if is_string(target):
236+
if isinstance(target, six.string_types):
247237
target = [target]
248238
targets = np.array([True if column in target else False
249239
for column in attribute_names])

openml/datasets/functions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
from collections import OrderedDict
12
import io
23
import os
34
import re
45
import shutil
5-
from collections import OrderedDict
6+
67
import xmltodict
8+
79
from .dataset import OpenMLDataset
810
from ..exceptions import OpenMLCacheException
911
from .. import config

openml/runs/functions.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,21 @@
11
from collections import defaultdict
22
import io
3+
import json
34
import os
4-
import xmltodict
5-
import numpy as np
5+
import sys
6+
import time
67
import warnings
8+
9+
import numpy as np
710
import sklearn
8-
import time
911
import six
10-
import json
12+
import xmltodict
1113

1214
from ..exceptions import PyOpenMLError
1315
from .. import config
14-
1516
from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
1617
from ..setups import setup_exists, initialize_model
17-
1818
from ..exceptions import OpenMLCacheException, OpenMLServerException
19-
from ..util import URLError, version_complies
2019
from .._api_calls import _perform_api_call, _file_id_to_url
2120
from .run import OpenMLRun, _get_version_information
2221
from .trace import OpenMLRunTrace, OpenMLTraceIteration
@@ -26,7 +25,6 @@
2625
# circular imports
2726

2827

29-
3028
def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None):
3129
"""Performs a CV run on the dataset of the given task, using the split.
3230
@@ -303,7 +301,9 @@ def _run_task_get_arffcontent(model, task, class_labels):
303301
user_defined_measures = defaultdict(lambda: defaultdict(dict))
304302

305303
rep_no = 0
306-
can_measure_runtime = version_complies(3, 3) and _check_n_jobs(model)
304+
# sys.version_info returns a tuple, the following line compares the entry of tuples
305+
# https://docs.python.org/3.6/reference/expressions.html#value-comparisons
306+
can_measure_runtime = sys.version_info[:2] >= (3, 3) and _check_n_jobs(model)
307307
# TODO use different iterator to only provide a single iterator (less
308308
# methods, less maintenance, less confusion)
309309
for rep in task.iterate_repeats():
@@ -465,23 +465,13 @@ def get_run(run_id):
465465

466466
try:
467467
return _get_cached_run(run_id)
468-
except (OpenMLCacheException):
469-
try:
470-
run_xml = _perform_api_call("run/%d" % run_id)
471-
except (URLError, UnicodeEncodeError) as e:
472-
# TODO logger.debug
473-
print(e)
474-
raise e
475468

469+
except (OpenMLCacheException):
470+
run_xml = _perform_api_call("run/%d" % run_id)
476471
with io.open(run_file, "w", encoding='utf8') as fh:
477472
fh.write(run_xml)
478473

479-
try:
480-
run = _create_run_from_xml(run_xml)
481-
except Exception as e:
482-
# TODO logger.debug
483-
print("Run ID", run_id)
484-
raise e
474+
run = _create_run_from_xml(run_xml)
485475

486476
with io.open(run_file, "w", encoding='utf8') as fh:
487477
fh.write(run_xml)

openml/setups/functions.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
from collections import OrderedDict
2+
13
import openml
24
import xmltodict
3-
import copy
45

5-
from collections import OrderedDict
66
from .setup import OpenMLSetup, OpenMLParameter
77

8+
89
def setup_exists(downloaded_flow, sklearn_model):
910
'''
1011
Checks whether a flow / hyperparameter configuration already exists on the server

openml/tasks/functions.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1+
from collections import OrderedDict
12
import io
2-
import os
33
import re
4-
from collections import OrderedDict
4+
import os
5+
56
import xmltodict
67

7-
from ..util import URLError
88
from ..exceptions import OpenMLCacheException
99
from .. import datasets
1010
from .task import OpenMLTask, _create_task_cache_dir
@@ -103,12 +103,11 @@ def list_tasks(task_type_id=None, offset=None, size=None, tag=None):
103103
104104
Returns
105105
-------
106-
list
107-
A list of all tasks having the given task_type_id and the given tag.
108-
Every task is represented by a dictionary containing the following
109-
information: task id, dataset id, task_type and status. If qualities
110-
are calculated for the associated dataset, some of these are also
111-
returned.
106+
dict
107+
All tasks having the given task_type_id and the given tag. Every task is
108+
represented by a dictionary containing the following information:
109+
task id, dataset id, task_type and status. If qualities are calculated
110+
for the associated dataset, some of these are also returned.
112111
"""
113112
api_call = "task/list"
114113
if task_type_id is not None:
@@ -146,7 +145,7 @@ def _list_tasks(api_call):
146145
% str(tasks_dict))
147146

148147
try:
149-
tasks = dict();
148+
tasks = dict()
150149
procs = _get_estimation_procedure_list()
151150
proc_dict = dict((x['id'], x) for x in procs)
152151
for task_ in tasks_dict['oml:tasks']['oml:task']:
@@ -199,13 +198,9 @@ def get_task(task_id):
199198
try:
200199
with io.open(xml_file, encoding='utf8') as fh:
201200
task = _create_task_from_xml(fh.read())
202-
except (OSError, IOError):
203201

204-
try:
205-
task_xml = _perform_api_call("task/%d" % task_id)
206-
except (URLError, UnicodeEncodeError) as e:
207-
print(e)
208-
raise e
202+
except (OSError, IOError):
203+
task_xml = _perform_api_call("task/%d" % task_id)
209204

210205
with io.open(xml_file, "w", encoding='utf8') as fh:
211206
fh.write(task_xml)

openml/tasks/split.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,10 @@
11
from collections import namedtuple, OrderedDict
22
import os
33
import sys
4+
45
import numpy as np
56
import scipy.io.arff
6-
7-
if sys.version_info[0] > 3:
8-
import pickle
9-
else:
10-
try:
11-
import cPickle as pickle
12-
except:
13-
import pickle
7+
from six.moves import cPickle as pickle
148

159

1610
Split = namedtuple("Split", ["train", "test"])

openml/tasks/task.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
from .. import config
55
from .. import datasets
6-
from ..util import URLError
76
from .split import OpenMLSplit
87
from .._api_calls import _read_url
98

@@ -70,11 +69,7 @@ def _download_split(self, cache_file):
7069
pass
7170
except (OSError, IOError):
7271
split_url = self.estimation_procedure["data_splits_url"]
73-
try:
74-
split_arff = _read_url(split_url)
75-
except (URLError, UnicodeEncodeError) as e:
76-
print(e, split_url)
77-
raise e
72+
split_arff = _read_url(split_url)
7873

7974
with io.open(cache_file, "w", encoding='utf8') as fh:
8075
fh.write(split_arff)

openml/testing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import hashlib
22
import inspect
33
import os
4-
import time
54
import shutil
5+
import time
66
import unittest
7+
78
import openml
89

910

0 commit comments

Comments
 (0)