Skip to content

Commit 2a468f9

Browse files
authored
Merge pull request #670 from openml/pyproject
Enable pip install from clean
2 parents 5b56127 + 415ee9f commit 2a468f9

9 files changed

Lines changed: 85 additions & 65 deletions

File tree

appveyor.yml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,8 @@ install:
3434

3535
# Install the build and runtime dependencies of the project.
3636
- "cd C:\\projects\\openml-python"
37-
- conda install --quiet --yes scikit-learn=0.20.0 nb_conda nb_conda_kernels numpy scipy requests nbformat python-dateutil nbconvert pandas matplotlib seaborn
38-
- pip install liac-arff xmltodict oslo.concurrency
39-
# Packages for (parallel) unit tests with pytest
40-
- pip install pytest pytest-xdist pytest-timeout
41-
- "pip install .[test]"
37+
- "pip install .[examples,test]"
38+
- conda install --quiet --yes scikit-learn=0.20.0
4239

4340

4441
# Not a .NET project, we build scikit-learn in the install step instead

ci_scripts/install.sh

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,17 @@ popd
2626
# provided versions
2727
conda create -n testenv --yes python=$PYTHON_VERSION pip
2828
source activate testenv
29-
pip install pytest pytest-xdist pytest-timeout numpy scipy cython scikit-learn==$SKLEARN_VERSION \
30-
oslo.concurrency
29+
30+
python --version
31+
pip install -e '.[test]'
32+
python -c "import numpy; print('numpy %s' % numpy.__version__)"
33+
python -c "import scipy; print('scipy %s' % scipy.__version__)"
3134

3235
if [[ "$EXAMPLES" == "true" ]]; then
33-
pip install matplotlib jupyter notebook nbconvert nbformat jupyter_client \
34-
ipython ipykernel pandas seaborn
36+
pip install -e '.[examples]'
3537
fi
3638
if [[ "$DOCTEST" == "true" ]]; then
37-
pip install pandas sphinx_bootstrap_theme
39+
pip install sphinx_bootstrap_theme
3840
fi
3941
if [[ "$COVERAGE" == "true" ]]; then
4042
pip install codecov pytest-cov
@@ -43,7 +45,6 @@ if [[ "$RUN_FLAKE8" == "true" ]]; then
4345
pip install flake8 mypy
4446
fi
4547

46-
python --version
47-
python -c "import numpy; print('numpy %s' % numpy.__version__)"
48-
python -c "import scipy; print('scipy %s' % scipy.__version__)"
49-
pip install -e '.[test]'
48+
# Install scikit-learn last to make sure the openml package installation works
49+
# from a clean environment without scikit-learn.
50+
pip install scikit-learn==$SKLEARN_VERSION

openml/datasets/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -564,7 +564,7 @@ def get_data(self, target: Optional[Union[List[str], str]] = None,
564564
else:
565565
return rval
566566

567-
def retrieve_class_labels(self, target_name='class'):
567+
def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[str]]:
568568
"""Reads the datasets arff to determine the class-labels.
569569
570570
If the task has no class labels (for example a regression problem)

openml/flows/flow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
388388
(flow_id, message))
389389
return self
390390

391-
def get_structure(self, key_item):
391+
def get_structure(self, key_item: str) -> Dict[str, List[str]]:
392392
"""
393393
Returns for each sub-component of the flow the path of identifiers
394394
that should be traversed to reach this component. The resulting dict

openml/runs/run.py

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,13 @@
1212
import openml._api_calls
1313
from ..exceptions import PyOpenMLError
1414
from ..flows import get_flow
15-
from ..tasks import get_task, TaskTypeEnum
15+
from ..tasks import (get_task,
16+
TaskTypeEnum,
17+
OpenMLClassificationTask,
18+
OpenMLLearningCurveTask,
19+
OpenMLClusteringTask,
20+
OpenMLRegressionTask
21+
)
1622
from ..utils import _tag_entity
1723

1824

@@ -69,7 +75,7 @@ def _repr_pretty_(self, pp, cycle):
6975
pp.text(str(self))
7076

7177
@classmethod
72-
def from_filesystem(cls, directory, expect_model=True):
78+
def from_filesystem(cls, directory: str, expect_model: bool = True) -> 'OpenMLRun':
7379
"""
7480
The inverse of the to_filesystem method. Instantiates an OpenMLRun
7581
object based on files stored on the file system.
@@ -109,24 +115,24 @@ def from_filesystem(cls, directory, expect_model=True):
109115
if not os.path.isfile(model_path) and expect_model:
110116
raise ValueError('Could not find model.pkl')
111117

112-
with open(description_path, 'r') as fp:
113-
xml_string = fp.read()
118+
with open(description_path, 'r') as fht:
119+
xml_string = fht.read()
114120
run = openml.runs.functions._create_run_from_xml(xml_string, from_server=False)
115121

116122
if run.flow_id is None:
117123
flow = openml.flows.OpenMLFlow.from_filesystem(directory)
118124
run.flow = flow
119125
run.flow_name = flow.name
120126

121-
with open(predictions_path, 'r') as fp:
122-
predictions = arff.load(fp)
127+
with open(predictions_path, 'r') as fht:
128+
predictions = arff.load(fht)
123129
run.data_content = predictions['data']
124130

125131
if os.path.isfile(model_path):
126132
# note that it will load the model if the file exists, even if
127133
# expect_model is False
128-
with open(model_path, 'rb') as fp:
129-
run.model = pickle.load(fp)
134+
with open(model_path, 'rb') as fhb:
135+
run.model = pickle.load(fhb)
130136

131137
if os.path.isfile(trace_path):
132138
run.trace = openml.runs.OpenMLRunTrace._from_filesystem(trace_path)
@@ -209,7 +215,18 @@ def _generate_arff_dict(self) -> 'OrderedDict[str, Any]':
209215
arff_dict['relation'] =\
210216
'openml_task_{}_predictions'.format(task.task_id)
211217

212-
if task.task_type_id == TaskTypeEnum.SUPERVISED_CLASSIFICATION:
218+
if isinstance(task, OpenMLLearningCurveTask):
219+
class_labels = task.class_labels # type: ignore
220+
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
221+
('fold', 'NUMERIC'),
222+
('sample', 'NUMERIC'),
223+
('row_id', 'NUMERIC')] + \
224+
[('confidence.' + class_labels[i],
225+
'NUMERIC') for i in
226+
range(len(class_labels))] + \
227+
[('prediction', class_labels),
228+
('correct', class_labels)]
229+
elif isinstance(task, OpenMLClassificationTask):
213230
class_labels = task.class_labels
214231
instance_specifications = [('repeat', 'NUMERIC'),
215232
('fold', 'NUMERIC'),
@@ -223,27 +240,14 @@ def _generate_arff_dict(self) -> 'OrderedDict[str, Any]':
223240
arff_dict['attributes'] = (instance_specifications
224241
+ prediction_confidences
225242
+ prediction_and_true)
226-
227-
elif task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
228-
class_labels = task.class_labels
229-
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
230-
('fold', 'NUMERIC'),
231-
('sample', 'NUMERIC'),
232-
('row_id', 'NUMERIC')] + \
233-
[('confidence.' + class_labels[i],
234-
'NUMERIC') for i in
235-
range(len(class_labels))] + \
236-
[('prediction', class_labels),
237-
('correct', class_labels)]
238-
239-
elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
243+
elif isinstance(task, OpenMLRegressionTask):
240244
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
241245
('fold', 'NUMERIC'),
242246
('row_id', 'NUMERIC'),
243247
('prediction', 'NUMERIC'),
244248
('truth', 'NUMERIC')]
245249

246-
elif task.task_type == TaskTypeEnum.CLUSTERING:
250+
elif isinstance(task, OpenMLClusteringTask):
247251
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
248252
('fold', 'NUMERIC'),
249253
('row_id', 'NUMERIC'),
@@ -461,7 +465,7 @@ def _create_description_xml(self):
461465
description_xml = xmltodict.unparse(description, pretty=True)
462466
return description_xml
463467

464-
def push_tag(self, tag):
468+
def push_tag(self, tag: str) -> None:
465469
"""Annotates this run with a tag on the server.
466470
467471
Parameters
@@ -471,7 +475,7 @@ def push_tag(self, tag):
471475
"""
472476
_tag_entity('run', self.run_id, tag)
473477

474-
def remove_tag(self, tag):
478+
def remove_tag(self, tag: str) -> None:
475479
"""Removes a tag from this run on the server.
476480
477481
Parameters

openml/runs/trace.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def __init__(self, run_id, trace_iterations):
3232
self.run_id = run_id
3333
self.trace_iterations = trace_iterations
3434

35-
def get_selected_iteration(self, fold, repeat):
35+
def get_selected_iteration(self, fold: int, repeat: int) -> int:
3636
"""
3737
Returns the trace iteration that was marked as selected. In
3838
case multiple are marked as selected (should not happen) the
@@ -46,7 +46,7 @@ def get_selected_iteration(self, fold, repeat):
4646
4747
Returns
4848
----------
49-
OpenMLTraceIteration
49+
int
5050
The trace iteration from the given fold and repeat that was
5151
selected as the best iteration by the search procedure
5252
"""
@@ -104,7 +104,7 @@ def generate(cls, attributes, content):
104104
)
105105

106106
@classmethod
107-
def _from_filesystem(cls, file_path):
107+
def _from_filesystem(cls, file_path: str) -> 'OpenMLRunTrace':
108108
"""
109109
Logic to deserialize the trace from the filesystem.
110110

openml/tasks/functions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
OpenMLLearningCurveTask,
2020
TaskTypeEnum,
2121
OpenMLRegressionTask,
22-
OpenMLSupervisedTask
22+
OpenMLSupervisedTask,
23+
OpenMLTask
2324
)
2425
import openml.utils
2526
import openml._api_calls
@@ -54,7 +55,7 @@ def _get_cached_tasks():
5455
return tasks
5556

5657

57-
def _get_cached_task(tid):
58+
def _get_cached_task(tid: int) -> OpenMLTask:
5859
"""Return a cached task based on the given id.
5960
6061
Parameters
@@ -299,7 +300,7 @@ def get_tasks(task_ids, download_data=True):
299300
return tasks
300301

301302

302-
def get_task(task_id, download_data=True):
303+
def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
303304
"""Download OpenML task for a given task ID.
304305
305306
Downloads the task representation, while the data splits can be

openml/tasks/split.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def __eq__(self, other):
5858
return True
5959

6060
@classmethod
61-
def _from_arff_file(cls, filename):
61+
def _from_arff_file(cls, filename: str) -> 'OpenMLSplit':
6262

6363
repetitions = None
6464

setup.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,18 @@
66
with open("openml/__version__.py") as fh:
77
version = fh.readlines()[-1].split()[-1].strip("\"'")
88

9-
dependency_links = []
10-
11-
try:
12-
import numpy # noqa: F401
13-
except ImportError:
14-
print('numpy is required during installation')
15-
sys.exit(1)
16-
17-
try:
18-
import scipy # noqa: F401
19-
except ImportError:
20-
print('scipy is required during installation')
9+
# Using Python setup.py install will try to build numpy which is prone to failure and
10+
# very time consuming anyway.
11+
if len(sys.argv) > 1 and sys.argv[1] == 'install':
12+
print('Please install this package with pip: `pip install -e .` '
13+
'Installation requires pip>=10.0.')
2114
sys.exit(1)
2215

16+
if sys.version_info < (3, 5):
17+
raise ValueError(
18+
'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.'
19+
.format(sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
20+
)
2321

2422
setuptools.setup(name="openml",
2523
author="Matthias Feurer, Andreas Müller, Farzan Majdani, "
@@ -30,12 +28,14 @@
3028
description="Python API for OpenML",
3129
license="BSD 3-clause",
3230
url="http://openml.org/",
31+
project_urls={
32+
"Documentation": "https://openml.github.io/openml-python/master/",
33+
"Source Code": "https://github.com/openml/openml-python"
34+
},
3335
version=version,
3436
packages=setuptools.find_packages(),
3537
package_data={'': ['*.txt', '*.md']},
3638
install_requires=[
37-
'numpy>=1.6.2',
38-
'scipy>=0.13.3',
3939
'liac-arff>=2.2.2',
4040
'xmltodict',
4141
'pytest',
@@ -45,12 +45,29 @@
4545
'python-dateutil',
4646
'oslo.concurrency',
4747
'pandas>=0.19.2',
48+
'scipy>=0.13.3',
49+
'numpy>=1.6.2'
4850
],
4951
extras_require={
5052
'test': [
5153
'nbconvert',
5254
'jupyter_client',
53-
'matplotlib'
55+
'matplotlib',
56+
'pytest',
57+
'pytest-xdist',
58+
'pytest-timeout',
59+
60+
],
61+
'examples': [
62+
'matplotlib',
63+
'jupyter',
64+
'notebook',
65+
'nbconvert',
66+
'nbformat',
67+
'jupyter_client',
68+
'ipython',
69+
'ipykernel',
70+
'seaborn'
5471
]
5572
},
5673
test_suite="pytest",
@@ -66,5 +83,5 @@
6683
'Programming Language :: Python :: 3',
6784
'Programming Language :: Python :: 3.4',
6885
'Programming Language :: Python :: 3.5',
69-
'Programming Language :: Python :: 3.6'
86+
'Programming Language :: Python :: 3.6',
7087
'Programming Language :: Python :: 3.7'])

0 commit comments

Comments (0)