FIX #197, do not automatically cast target attribute

mfeurer · mfeurer · commit 886a2175138f · 2017-10-12T11:45:10.000+02:00
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -184,7 +184,7 @@ def decode_arff(fh):
             with io.open(filename, encoding='utf8') as fh:
                 return decode_arff(fh)
 
-    def get_data(self, target=None, target_dtype=int, include_row_id=False,
+    def get_data(self, target=None, target_dtype=None, include_row_id=False,
                  include_ignore_attributes=False,
                  return_categorical_indicator=False,
                  return_attribute_names=False):
@@ -242,6 +242,12 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
         else:
             if isinstance(target, six.string_types):
                 target = [target]
+            legal_target_types = (int, float)
+            if target_dtype not in legal_target_types:
+                raise ValueError(
+                    "%s is not a legal target type. Legal target types are %s" %
+                    (target_dtype, legal_target_types)
+                )
             targets = np.array([True if column in target else False
                                 for column in attribute_names])
 
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
@@ -36,16 +36,20 @@ def get_dataset(self):
         return datasets.get_dataset(self.dataset_id)
 
     def get_X_and_y(self):
+        """Get data associated with the current task.
+
+        Returns
+        -------
+        tuple - X and y
+
+        """
         dataset = self.get_dataset()
         # Replace with retrieve from cache
-        if self.task_type_id == 1:
-        # if 'Supervised Classification'.lower() in self.task_type.lower():
+        if self.task_type_id == 1:  # Supervised classification
             target_dtype = int
-        # elif 'Supervised Regression'.lower() in self.task_type.lower():
-        elif self.task_type_id == 2:
+        elif self.task_type_id == 2:  # Supervised regression
             target_dtype = float
-        # elif ''.lower('Learning Curve') in self.task_type.lower():
-        elif self.task_type_id == 3:
+        elif self.task_type_id == 3:  # Learning curves task for classification
             target_dtype = int
         else:
             raise NotImplementedError(self.task_type)
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -47,27 +47,37 @@ def test_get_data_with_rowid(self):
         self.assertEqual(len(categorical), 38)
 
     def test_get_data_with_target(self):
-        X, y = self.dataset.get_data(target="class")
+        X, y = self.dataset.get_data(target="class", target_dtype=int)
         self.assertIsInstance(X, np.ndarray)
         self.assertEqual(X.dtype, np.float32)
         self.assertIn(y.dtype, [np.int32, np.int64])
         self.assertEqual(X.shape, (898, 38))
         X, y, attribute_names = self.dataset.get_data(
-            target="class", return_attribute_names=True)
+            target="class",
+            target_dtype=int,
+            return_attribute_names=True
+        )
         self.assertEqual(len(attribute_names), 38)
         self.assertNotIn("class", attribute_names)
         self.assertEqual(y.shape, (898, ))
 
     def test_get_data_rowid_and_ignore_and_target(self):
         self.dataset.ignore_attributes = ["condition"]
         self.dataset.row_id_attribute = ["hardness"]
-        X, y = self.dataset.get_data(target="class", include_row_id=False,
-                                     include_ignore_attributes=False)
+        X, y = self.dataset.get_data(
+            target="class",
+            target_dtype=int,
+            include_row_id=False,
+            include_ignore_attributes=False
+        )
         self.assertEqual(X.dtype, np.float32)
         self.assertIn(y.dtype, [np.int32, np.int64])
         self.assertEqual(X.shape, (898, 36))
         X, y, categorical = self.dataset.get_data(
-            target="class", return_categorical_indicator=True)
+            target="class",
+            target_dtype=int,
+            return_categorical_indicator=True,
+        )
         self.assertEqual(len(categorical), 36)
         self.assertListEqual(categorical, [True] * 3 + [False] + [True] * 2 + [
             False] + [True] * 23 + [False] * 3 + [True] * 3)
@@ -100,14 +110,17 @@ def setUp(self):
         self.sparse_dataset = openml.datasets.get_dataset(4136)
 
     def test_get_sparse_dataset_with_target(self):
-        X, y = self.sparse_dataset.get_data(target="class")
+        X, y = self.sparse_dataset.get_data(target="class", target_dtype=int)
         self.assertTrue(sparse.issparse(X))
         self.assertEqual(X.dtype, np.float32)
         self.assertIsInstance(y, np.ndarray)
         self.assertIn(y.dtype, [np.int32, np.int64])
         self.assertEqual(X.shape, (600, 20000))
         X, y, attribute_names = self.sparse_dataset.get_data(
-            target="class", return_attribute_names=True)
+            target="class",
+            target_dtype=int,
+            return_attribute_names=True,
+        )
         self.assertTrue(sparse.issparse(X))
         self.assertEqual(len(attribute_names), 20000)
         self.assertNotIn("class", attribute_names)
@@ -170,14 +183,20 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
         self.sparse_dataset.ignore_attributes = ["V256"]
         self.sparse_dataset.row_id_attribute = ["V512"]
         X, y = self.sparse_dataset.get_data(
-            target="class", include_row_id=False,
-            include_ignore_attributes=False)
+            target="class",
+            target_dtype=int,
+            include_row_id=False,
+            include_ignore_attributes=False,
+        )
         self.assertTrue(sparse.issparse(X))
         self.assertEqual(X.dtype, np.float32)
         self.assertIn(y.dtype, [np.int32, np.int64])
         self.assertEqual(X.shape, (600, 19998))
         X, y, categorical = self.sparse_dataset.get_data(
-            target="class", return_categorical_indicator=True)
+            target="class",
+            target_dtype=int,
+            return_categorical_indicator=True,
+        )
         self.assertTrue(sparse.issparse(X))
         self.assertEqual(len(categorical), 19998)
         self.assertListEqual(categorical, [False] * 19998)