pep8 and better docstrings

mfeurer · mfeurer · commit 8abfb23163f1 · 2019-04-17T20:16:10.000+02:00
diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py
@@ -4,7 +4,6 @@
 
 import numpy as np
 import scipy.sparse
-import pandas as pd
 
 # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
 if TYPE_CHECKING:
@@ -151,49 +150,50 @@ def _run_model_on_fold(
         self,
         model: Any,
         task: 'OpenMLTask',
-        X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame],
-        y_train: np.ndarray,
+        X_train: Union[np.ndarray, scipy.sparse.spmatrix],
         rep_no: int,
         fold_no: int,
-        X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
-        n_classes: Optional[int] = None,
-    ) -> Tuple[List[List], List[List], 'OrderedDict[str, float]', Optional['OpenMLRunTrace']]:
+        y_train: Optional[np.ndarray] = None,
+        X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] = None,
+        classes: Optional[List] = None,
+    ) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Any]:
         """Run a model on a repeat,fold,subsample triplet of the task and return prediction information.
 
         Returns the data that is necessary to construct the OpenML Run object. Is used by
-        run_task_get_arff_content.
+        :func:`openml.runs.run_flow_on_task`.
 
         Parameters
         ----------
         model : Any
             The UNTRAINED model to run. The model instance will be copied and not altered.
         task : OpenMLTask
             The task to run the model on.
+        X_train : array-like
+            Training data for the given repetition and fold.
         rep_no : int
             The repeat of the experiment (0-based; in case of 1 time CV, always 0)
         fold_no : int
             The fold nr of the experiment (0-based; in case of holdout, always 0)
-        sample_no : int
-            In case of learning curves, the index of the subsample (0-based; in case of no
-            learning curve, always 0)
-        add_local_measures : bool
-            Determines whether to calculate a set of measures (i.e., predictive accuracy) locally,
-            to later verify server behaviour.
+        y_train : Optional[np.ndarray] (default=None)
+            Target attributes for supervised tasks. In case of classification, these are integer
+            indices to the potential classes specified by the dataset.
+        X_test : Optional, array-like (default=None)
+            Test attributes to test for generalization in supervised tasks.
+        classes : Optional[List] (default=None)
+            List of classes for supervised classification tasks (and supervised data stream
+            classification).
 
         Returns
         -------
-        arff_datacontent : List[List]
-            Arff representation (list of lists) of the predictions that were
-            generated by this fold (required to populate predictions.arff)
-        arff_tracecontent :  List[List]
-            Arff representation (list of lists) of the trace data that was generated by this fold
-            (will be used to populate trace.arff, leave it empty if the model did not perform any
-            hyperparameter optimization).
+        predictions : np.ndarray
+            Model predictions.
+        probabilities : Optional, np.ndarray
+            Predicted probabilities (only applicable for supervised classification tasks).
         user_defined_measures : OrderedDict[str, float]
             User defined measures that were generated on this fold
-        model : Any
-            The model trained on this repeat,fold,subsample triple. Will be used to generate trace
-            information later on (in ``obtain_arff_trace``).
+        trace : Optional, OpenMLRunTrace
+            Hyperparameter optimization trace (only applicable for supervised tasks with
+            hyperparameter optimization).
         """
 
     @abstractmethod
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
@@ -95,7 +95,7 @@ def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = Fal
 
         Parameters
         ----------
-        o : mixed
+        flow : mixed
             the object to deserialize (can be flow object, or any serialized
             parameter value that is accepted by)
 
@@ -470,7 +470,7 @@ def _check_multiple_occurence_of_component_in_flow(
     ) -> None:
         to_visit_stack = []  # type: List[OpenMLFlow]
         to_visit_stack.extend(sub_components.values())
-        known_sub_components = set()  # type: Set[OpenMLFlow]
+        known_sub_components = set()  # type: Set[str]
         while len(to_visit_stack) > 0:
             visitee = to_visit_stack.pop()
             if visitee.name in known_sub_components:
@@ -1103,7 +1103,7 @@ def _run_model_on_fold(
         fold_no: int,
         y_train: Optional[np.ndarray] = None,
         X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
-        classes: Optional[int] = None,
+        classes: Optional[List] = None,
     ) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Any]:
         """Run a model on a repeat,fold,subsample triplet of the task and return prediction
         information.
@@ -1123,17 +1123,12 @@ def _run_model_on_fold(
             The UNTRAINED model to run. The model instance will be copied and not altered.
         task : OpenMLTask
             The task to run the model on.
+        X_train : array-like
+            Training data for the given repetition and fold.
         rep_no : int
             The repeat of the experiment (0-based; in case of 1 time CV, always 0)
         fold_no : int
             The fold nr of the experiment (0-based; in case of holdout, always 0)
-        sample_no : int
-            In case of learning curves, the index of the subsample (0-based; in case of no
-            learning curve, always 0)
-        add_local_measures : bool
-            Determines whether to calculate a set of measures (i.e., predictive accuracy)
-            locally,
-            to later verify server behaviour.
 
         Returns
         -------
@@ -1154,10 +1149,7 @@ def _run_model_on_fold(
             information later on (in ``obtain_arff_trace``).
         """
 
-        def _prediction_to_probabilities(
-                y: np.ndarray,
-                classes: List,
-        ) -> np.ndarray:
+        def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarray:
             """Transforms predicted probabilities to match with OpenML class indices.
 
             Parameters
@@ -1259,6 +1251,9 @@ def _prediction_to_probabilities(
 
         if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
 
+            if classes is None:
+                raise TypeError("Argument classes must not be of type 'None'")
+
             try:
                 proba_y = model_copy.predict_proba(X_test)
             except AttributeError:
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -26,7 +26,7 @@
 
 from sklearn.naive_bayes import GaussianNB
 from sklearn.model_selection._search import BaseSearchCV
-from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.tree import DecisionTreeClassifier
 from sklearn.preprocessing.imputation import Imputer
 from sklearn.dummy import DummyClassifier
 from sklearn.preprocessing import StandardScaler
@@ -38,7 +38,6 @@
 from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, \
     StratifiedKFold
 from sklearn.pipeline import Pipeline
-from sklearn.cluster import KMeans
 
 
 class TestRun(TestBase):
@@ -484,11 +483,6 @@ def test_run_and_upload_logistic_regression(self):
         self._run_and_upload_classification(lr, task_id, n_missing_vals,
                                             n_test_obs, '62501')
 
-    def test_run_and_upload_kmeans(self):
-        kmeans = KMeans()
-        task_id = 126034
-
-
     def test_run_and_upload_linear_regression(self):
         lr = LinearRegression()
         task_id = self.TEST_SERVER_TASK_REGRESSION[0]