Skip to content

Commit 91e88d5

Browse files
committed
Remove pandas as dataset container, use numpy instead
1 parent a98ad8a commit 91e88d5

4 files changed

Lines changed: 95 additions & 9082 deletions

File tree

openml/entities/dataset.py

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
logger = logging.getLogger(__name__)
1717

1818
import numpy as np
19-
import pandas as pd
2019

2120
from ..util import is_string
2221

@@ -54,10 +53,10 @@ def __init__(self, id, name, version, description, format, creator,
5453
self.md5_cheksum = md5_checksum
5554
self.data_file = data_file
5655

57-
self.pandas_file = data_file.replace('.arff', '.pd')
56+
self.data_pickle_file = data_file.replace('.arff', '.pkl')
5857

59-
if os.path.exists(self.pandas_file):
60-
logger.debug("Pandas file already exists.")
58+
if os.path.exists(self.data_pickle_file):
59+
logger.debug("Data pickle file already exists.")
6160
else:
6261
try:
6362
data = self.get_arff()
@@ -69,12 +68,13 @@ def __init__(self, id, name, version, description, format, creator,
6968
categorical = [False if type(type_) != list else True
7069
for name, type_ in data['attributes']]
7170
attribute_names = [name for name, type_ in data['attributes']]
72-
X = pd.DataFrame(data=data['data'], columns=attribute_names)
71+
# X = pd.DataFrame(data=data['data'], columns=attribute_names)
72+
X = np.array(data['data'], dtype=np.float32)
7373

74-
with open(self.pandas_file, "w") as fh:
75-
pickle.dump((X, categorical), fh, -1)
74+
with open(self.data_pickle_file, "w") as fh:
75+
pickle.dump((X, categorical, attribute_names), fh, -1)
7676
logger.debug("Saved dataset %d: %s to file %s" %
77-
(self.id, self.name, self.pandas_file))
77+
(self.id, self.name, self.data_pickle_file))
7878

7979
def __eq__(self, other):
8080
if type(other) != OpenMLDataset:
@@ -113,16 +113,19 @@ def decode_arff(fh):
113113

114114
############################################################################
115115
# pandas related stuff...
116-
def get_pandas(self, target=None, include_row_id=False,
117-
include_ignore_attributes=False):
116+
def get_dataset(self, target=None, include_row_id=False,
117+
include_ignore_attributes=False,
118+
return_categorical_indicator=False,
119+
return_attribute_names=False):
120+
rval = []
118121

119-
path = self.pandas_file
122+
path = self.data_pickle_file
120123
if not os.path.exists(path):
121124
raise ValueError("Cannot find a ndarray file for dataset %s at"
122125
"location %s " % (self.name, path))
123126
else:
124127
with open(path) as fh:
125-
data, categorical = pickle.load(fh)
128+
data, categorical, attribute_names = pickle.load(fh)
126129

127130
to_exclude = []
128131
if include_row_id == False:
@@ -143,33 +146,48 @@ def get_pandas(self, target=None, include_row_id=False,
143146
else:
144147
to_exclude.extend(self.ignore_attributes)
145148

146-
logger.info("Going to remove the following row_id_attributes:"
147-
" %s" % self.row_id_attribute)
148-
keep = [True if column not in to_exclude else False
149-
for column in data.columns]
150-
data = data.loc[:,keep]
151-
categorical = [cat for cat, k in zip(categorical, keep) if k]
149+
if len(to_exclude) > 0:
150+
logger.info("Going to remove the following row_id_attributes:"
151+
" %s" % self.row_id_attribute)
152+
keep = np.array([True if column not in to_exclude else False
153+
for column in attribute_names])
154+
data = data[:,keep]
155+
categorical = [cat for cat, k in zip(categorical, keep) if k]
156+
attribute_names = [att for att, k in
157+
zip(attribute_names, keep) if k]
152158

153159
if target is None:
154-
return data, categorical
160+
rval.append(data)
155161
else:
156162
if is_string(target):
157163
target = [target]
158164
targets = np.array([True if column in target else False
159-
for column in data.columns])
165+
for column in attribute_names])
160166

161167
try:
162-
x = data.loc[:,~targets]
163-
y = data.loc[:,targets]
168+
x = data[:,~targets]
169+
y = data[:,targets].astype(np.int32)
164170

165-
# Convert to series if possible
166171
if len(y.shape) == 2 and y.shape[1] == 1:
167-
y = y.iloc[:,0]
172+
y = y[:,0]
168173

169-
categorical = [cat for cat, t in zip(categorical, targets)
170-
if not t]
174+
categorical = [cat for cat, t in
175+
zip(categorical, targets) if not t]
176+
attribute_names = [att for att, k in
177+
zip(attribute_names, targets) if not k]
171178
except KeyError as e:
172179
import sys
173180
sys.stdout.flush()
174181
raise e
175-
return x, y, categorical
182+
rval.append(x)
183+
rval.append(y)
184+
185+
if return_categorical_indicator:
186+
rval.append(categorical)
187+
if return_attribute_names:
188+
rval.append(attribute_names)
189+
190+
if len(rval) == 1:
191+
return rval[0]
192+
else:
193+
return rval

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
install_requires=["liac-arff>=2.0.2",
1515
"numpy>1.6.2",
1616
"scipy>0.9",
17-
"pandas>0.13.1",
1817
"xmltodict",
1918
"nose"],
2019
test_suite="nose.collector",

tests/entities/test_dataset.py

Lines changed: 50 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77

88
from openml.entities.dataset import OpenMLDataset
9+
from openml.util import is_string
910

1011
class OpenMLDatasetTest(unittest.TestCase):
1112
def setUp(self):
@@ -14,8 +15,8 @@ def setUp(self):
1415
self.directory = os.path.dirname(__file__)
1516
self.arff_filename = os.path.join(self.directory, "..",
1617
"files", "datasets", "2", "dataset.arff")
17-
self.pandas_filename = os.path.join(self.directory, "..",
18-
"files", "datasets", "2", "dataset.pd")
18+
self.pickle_filename = os.path.join(self.directory, "..",
19+
"files", "datasets", "2", "dataset.pkl")
1920
self.dataset = OpenMLDataset(1, "anneal", 1, "Lorem ipsum.",
2021
"arff", None, None, None,
2122
"2014-04-06 23:19:24", None, "Public",
@@ -26,7 +27,7 @@ def setUp(self):
2627
data_file=self.arff_filename)
2728

2829
def tearDown(self):
29-
for file_ in [self.pandas_filename]:
30+
for file_ in [self.pickle_filename]:
3031
os.remove(file_)
3132

3233
############################################################################
@@ -40,80 +41,83 @@ def test_get_arff(self):
4041
self.assertTrue(hasattr(rval[1], '__dict__'))
4142
self.assertEqual(rval[0].shape, (898, ))
4243

43-
def test_get_pandas(self):
44+
def test_get_dataset(self):
4445
# Basic usage
45-
rval, categorical = self.dataset.get_pandas()
46-
self.assertIsInstance(rval, pd.DataFrame)
47-
self.assertEqual(rval.values.dtype, np.float64)
46+
rval = self.dataset.get_dataset()
47+
self.assertIsInstance(rval, np.ndarray)
48+
self.assertEqual(rval.dtype, np.float32)
4849
self.assertEqual((898, 39), rval.shape)
50+
rval, categorical = self.dataset.get_dataset(
51+
return_categorical_indicator=True)
4952
self.assertEqual(len(categorical), 39)
53+
self.assertTrue(all([isinstance(cat, bool) for cat in categorical]))
54+
rval, attribute_names = self.dataset.get_dataset(
55+
return_attribute_names=True)
56+
self.assertEqual(len(attribute_names), 39)
57+
self.assertTrue(all([is_string(att) for att in attribute_names]))
5058

51-
def test_get_pandas_with_target(self):
52-
X, y, categorical = self.dataset.get_pandas(target="class")
53-
self.assertEqual(X.values.dtype, np.float64)
54-
self.assertEqual(y.values.dtype, np.int64)
59+
def test_get_dataset_with_target(self):
60+
X, y = self.dataset.get_dataset(target="class")
61+
self.assertEqual(X.dtype, np.float32)
62+
self.assertEqual(y.dtype, np.int32)
5563
self.assertEqual(X.shape, (898, 38))
56-
self.assertEqual(len(categorical), 38)
57-
self.assertNotIn("class", X)
64+
X, y, attribute_names = self.dataset.get_dataset(
65+
target="class", return_attribute_names=True)
66+
self.assertEqual(len(attribute_names), 38)
67+
self.assertNotIn("class", attribute_names)
5868
self.assertEqual(y.shape, (898, ))
59-
self.assertEqual(y.name, "class")
6069

61-
def test_get_pandas_with_rowid(self):
70+
def test_get_dataset_with_rowid(self):
6271
self.dataset.row_id_attribute = "condition"
63-
rval, categorical = self.dataset.get_pandas(include_row_id=True)
64-
self.assertEqual(rval.values.dtype, np.float64)
72+
rval, categorical = self.dataset.get_dataset(
73+
include_row_id=True, return_categorical_indicator=True)
74+
self.assertEqual(rval.dtype, np.float32)
6575
self.assertEqual(rval.shape, (898, 39))
6676
self.assertEqual(len(categorical), 39)
67-
self.assertIn("condition", rval)
68-
rval, categorical = self.dataset.get_pandas(include_row_id=False)
69-
self.assertEqual(rval.values.dtype, np.float64)
77+
rval, categorical = self.dataset.get_dataset(
78+
include_row_id=False, return_categorical_indicator=True)
79+
self.assertEqual(rval.dtype, np.float32)
7080
self.assertEqual(rval.shape, (898, 38))
7181
self.assertEqual(len(categorical), 38)
72-
self.assertNotIn("condition", rval)
7382

7483
# TODO this is not yet supported!
7584
#rowid = ["condition", "formability"]
7685
#self.dataset.row_id_attribute = rowid
7786
#rval = self.dataset.get_pandas(include_row_id=False)
7887

79-
def test_get_pandas_with_ignore_attributes(self):
88+
def test_get_dataset_with_ignore_attributes(self):
8089
self.dataset.ignore_attributes = "condition"
81-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=True)
82-
self.assertEqual(rval.values.dtype, np.float64)
90+
rval = self.dataset.get_dataset(include_ignore_attributes=True)
91+
self.assertEqual(rval.dtype, np.float32)
8392
self.assertEqual(rval.shape, (898, 39))
93+
rval, categorical = self.dataset.get_dataset(
94+
include_ignore_attributes=True, return_categorical_indicator=True)
8495
self.assertEqual(len(categorical), 39)
85-
self.assertIn("condition", rval)
86-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=False)
87-
self.assertEqual(rval.values.dtype, np.float64)
96+
rval = self.dataset.get_dataset(include_ignore_attributes=False)
97+
self.assertEqual(rval.dtype, np.float32)
8898
self.assertEqual(rval.shape, (898, 38))
99+
rval, categorical = self.dataset.get_dataset(
100+
include_ignore_attributes=False, return_categorical_indicator=True)
89101
self.assertEqual(len(categorical), 38)
90-
self.assertNotIn("condition", rval)
91102
# TODO test multiple ignore attributes!
92103

93-
def test_get_pandas_rowid_and_ignore(self):
104+
def test_get_dataset_rowid_and_ignore(self):
94105
self.dataset.ignore_attributes = "condition"
95106
self.dataset.row_id_attribute = "condition"
96-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=False,
97-
include_row_id=False)
98-
self.assertEqual(rval.values.dtype, np.float64)
99-
self.assertEqual(rval.shape, (898, 38))
100-
self.assertEqual(len(categorical), 38)
101-
self.dataset.ignore_attributes = "hardness"
102-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=False,
103-
include_row_id=False)
104-
self.assertEqual(rval.values.dtype, np.float64)
105-
self.assertEqual(rval.shape, (898, 37))
106-
self.assertEqual(len(categorical), 37)
107+
rval = self.dataset.get_dataset(include_ignore_attributes=False,
108+
include_row_id=False)
109+
self.assertEqual(rval.dtype, np.float32)
107110

108-
def test_get_pandas_rowid_and_ignore_and_target(self):
111+
def test_get_dataset_rowid_and_ignore_and_target(self):
109112
self.dataset.ignore_attributes = "condition"
110113
self.dataset.row_id_attribute = "hardness"
111-
X, y, categorical = self.dataset.get_pandas(target="class",
112-
include_row_id=False,
113-
include_ignore_attributes=False)
114-
self.assertEqual(X.values.dtype, np.float64)
115-
self.assertEqual(y.values.dtype, np.int64)
114+
X, y = self.dataset.get_dataset(target="class", include_row_id=False,
115+
include_ignore_attributes=False)
116+
self.assertEqual(X.dtype, np.float32)
117+
self.assertEqual(y.dtype, np.int32)
116118
self.assertEqual(X.shape, (898, 36))
119+
X, y , categorical = self.dataset.get_dataset(
120+
target="class", return_categorical_indicator=True)
117121
self.assertEqual(len(categorical), 36)
118122
self.assertListEqual(categorical, [True]*3 + [False] + [True]*2 + [
119123
False] + [True]*23 + [False]*3 + [True]*3)

0 commit comments

Comments (0)