remove argument whose value can be inferred from data

mfeurer · mfeurer · commit c6f85b6d20ac · 2017-10-12T18:56:20.000+02:00
diff --git a/examples/OpenML_Tutorial.ipynb b/examples/OpenML_Tutorial.ipynb
@@ -841,7 +841,6 @@
    "source": [
     "X, y, attribute_names = dataset.get_data(\n",
     "    target=dataset.default_target_attribute,\n",
-    "    target_dtype=int,\n",
     "    return_attribute_names=True,\n",
     ")\n",
     "eeg = pd.DataFrame(X, columns=attribute_names)\n",
@@ -932,10 +931,7 @@
     "from sklearn import neighbors\n",
     "\n",
     "dataset = oml.datasets.get_dataset(1471)\n",
-    "X, y = dataset.get_data(\n",
-    "    target=dataset.default_target_attribute,\n",
-    "    target_dtype=int,\n",
-    ")\n",
+    "X, y = dataset.get_data(target=dataset.default_target_attribute)\n",
     "clf = neighbors.KNeighborsClassifier(n_neighbors=1)\n",
     "clf.fit(X, y)"
    ]
@@ -992,8 +988,8 @@
     "dataset = oml.datasets.get_dataset(10)\n",
     "X, y, categorical = dataset.get_data(\n",
     "    target=dataset.default_target_attribute,\n",
-    "    target_dtype=int,\n",
-    "    return_categorical_indicator=True)\n",
+    "    return_categorical_indicator=True,\n",
+    ")\n",
     "print(\"Categorical features: %s\" % categorical)\n",
     "enc = preprocessing.OneHotEncoder(categorical_features=categorical)\n",
     "X = enc.fit_transform(X)\n",
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -184,10 +184,12 @@ def decode_arff(fh):
             with io.open(filename, encoding='utf8') as fh:
                 return decode_arff(fh)
 
-    def get_data(self, target=None, target_dtype=None, include_row_id=False,
+    def get_data(self, target=None,
+                 include_row_id=False,
                  include_ignore_attributes=False,
                  return_categorical_indicator=False,
-                 return_attribute_names=False):
+                 return_attribute_names=False
+    ):
         """Returns dataset content as numpy arrays / sparse matrices.
 
         Parameters
@@ -225,7 +227,10 @@ def get_data(self, target=None, target_dtype=None, include_row_id=False,
             if not self.ignore_attributes:
                 pass
             else:
-                to_exclude.extend(self.ignore_attributes)
+                if isinstance(self.ignore_attributes, six.string_types):
+                    to_exclude.append(self.ignore_attributes)
+                else:
+                    to_exclude.extend(self.ignore_attributes)
 
         if len(to_exclude) > 0:
             logger.info("Going to remove the following attributes:"
@@ -242,14 +247,19 @@ def get_data(self, target=None, target_dtype=None, include_row_id=False,
         else:
             if isinstance(target, six.string_types):
                 target = [target]
-            legal_target_types = (int, float, np.float32, np.float64)
-            if target_dtype not in legal_target_types:
-                raise ValueError(
-                    "%s is not a legal target type. Legal target types are %s" %
-                    (target_dtype, legal_target_types)
-                )
             targets = np.array([True if column in target else False
                                 for column in attribute_names])
+            if np.sum(targets) > 1:
+                raise NotImplementedError(
+                    "Number of requested targets %d is not implemented." %
+                    np.sum(targets)
+                )
+            target_categorical = [
+                cat for cat, column in
+                six.moves.zip(categorical, attribute_names)
+                if column in target
+            ]
+            target_dtype = int if target_categorical[0] else float
 
             try:
                 x = data[:, ~targets]
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
@@ -44,17 +44,9 @@ def get_X_and_y(self):
 
         """
         dataset = self.get_dataset()
-        # Replace with retrieve from cache
-        if self.task_type_id == 1:  # Supervised classification
-            target_dtype = int
-        elif self.task_type_id == 2:  # Supervised regression
-            target_dtype = float
-        elif self.task_type_id == 3:  # Learning curves task for classification
-            target_dtype = int
-        else:
+        if self.task_type_id not in (1, 2, 3):
             raise NotImplementedError(self.task_type)
-        X_and_y = dataset.get_data(target=self.target_name,
-                                   target_dtype=target_dtype)
+        X_and_y = dataset.get_data(target=self.target_name)
         return X_and_y
 
     def get_train_test_split_indices(self, fold=0, repeat=0, sample=0):
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -47,14 +47,13 @@ def test_get_data_with_rowid(self):
         self.assertEqual(len(categorical), 38)
 
     def test_get_data_with_target(self):
-        X, y = self.dataset.get_data(target="class", target_dtype=int)
+        X, y = self.dataset.get_data(target="class")
         self.assertIsInstance(X, np.ndarray)
         self.assertEqual(X.dtype, np.float32)
         self.assertIn(y.dtype, [np.int32, np.int64])
         self.assertEqual(X.shape, (898, 38))
         X, y, attribute_names = self.dataset.get_data(
             target="class",
-            target_dtype=int,
             return_attribute_names=True
         )
         self.assertEqual(len(attribute_names), 38)
@@ -66,7 +65,6 @@ def test_get_data_rowid_and_ignore_and_target(self):
         self.dataset.row_id_attribute = ["hardness"]
         X, y = self.dataset.get_data(
             target="class",
-            target_dtype=int,
             include_row_id=False,
             include_ignore_attributes=False
         )
@@ -75,7 +73,6 @@ def test_get_data_rowid_and_ignore_and_target(self):
         self.assertEqual(X.shape, (898, 36))
         X, y, categorical = self.dataset.get_data(
             target="class",
-            target_dtype=int,
             return_categorical_indicator=True,
         )
         self.assertEqual(len(categorical), 36)
@@ -110,15 +107,14 @@ def setUp(self):
         self.sparse_dataset = openml.datasets.get_dataset(4136)
 
     def test_get_sparse_dataset_with_target(self):
-        X, y = self.sparse_dataset.get_data(target="class", target_dtype=int)
+        X, y = self.sparse_dataset.get_data(target="class")
         self.assertTrue(sparse.issparse(X))
         self.assertEqual(X.dtype, np.float32)
         self.assertIsInstance(y, np.ndarray)
         self.assertIn(y.dtype, [np.int32, np.int64])
         self.assertEqual(X.shape, (600, 20000))
         X, y, attribute_names = self.sparse_dataset.get_data(
             target="class",
-            target_dtype=int,
             return_attribute_names=True,
         )
         self.assertTrue(sparse.issparse(X))
@@ -184,7 +180,6 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
         self.sparse_dataset.row_id_attribute = ["V512"]
         X, y = self.sparse_dataset.get_data(
             target="class",
-            target_dtype=int,
             include_row_id=False,
             include_ignore_attributes=False,
         )
@@ -194,7 +189,6 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
         self.assertEqual(X.shape, (600, 19998))
         X, y, categorical = self.sparse_dataset.get_data(
             target="class",
-            target_dtype=int,
             return_categorical_indicator=True,
         )
         self.assertTrue(sparse.issparse(X))