FIX allow loading sparse data

mfeurer · mfeurer · commit d905264b5efb · 2016-08-31T18:30:22.000+02:00
Conflicts:
	openml/datasets/dataset.py
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -76,7 +76,7 @@ def __init__(self, id=None, name=None, version=None, description=None,
                 logger.debug("Data pickle file already exists.")
             else:
                 try:
-                    data = self._get_arff()
+                    data = self._get_arff(self.format)
                 except OSError as e:
                     logger.critical("Please check that the data file %s is there "
                                     "and can be read.", self.data_file)
@@ -111,7 +111,7 @@ def __eq__(self, other):
         else:
             return False
 
-    def _get_arff(self):
+    def _get_arff(self, format):
         """Read ARFF file and return decoded arff.
 
         Reads the file referenced in self.data_file.
@@ -135,9 +135,17 @@ def _get_arff(self):
         if bits != 64 and os.path.getsize(filename) > 120000000:
             return NotImplementedError("File too big")
 
+        if format.lower() == 'arff':
+            return_type = arff.DENSE
+        elif format.lower() == 'sparse_arff':
+            return_type = arff.COO
+        else:
+            raise ValueError('Unknown data format %s' % format)
+
         def decode_arff(fh):
             decoder = arff.ArffDecoder()
-            return decoder.decode(fh, encode_nominal=True)
+            return decoder.decode(fh, encode_nominal=True,
+                                  return_type=return_type)
 
         if filename[-3:] == ".gz":
             with gzip.open(filename) as fh:
@@ -246,8 +254,15 @@ def _retrieve_class_labels(self):
         # Should make a method that only reads the attributes
         arffFileName = self.data_file
 
+        if self.format.lower() == 'arff':
+            return_type = arff.DENSE
+        elif self.format.lower() == 'sparse_arff':
+            return_type = arff.COO
+        else:
+            raise ValueError('Unknown data format %s' % self.format)
+
         with io.open(arffFileName, encoding='utf8') as fh:
-            arffData = arff.ArffDecoder().decode(fh)
+            arffData = arff.ArffDecoder().decode(fh, return_type=return_type)
 
         dataAttributes = dict(arffData['attributes'])
         if('class' in dataAttributes):
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
@@ -1,13 +1,14 @@
 import unittest
 import os
-import shutil
 import sys
 
 if sys.version_info[0] >= 3:
     from unittest import mock
 else:
     import mock
 
+import scipy.sparse
+
 import openml
 from openml import OpenMLDataset
 from openml.exceptions import OpenMLCacheException
@@ -141,6 +142,11 @@ def test_get_dataset(self):
         self.assertTrue(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
 
+    def test_get_dataset_sparse(self):
+        dataset = openml.datasets.get_dataset(1571)
+        X = dataset.get_data()
+        self.assertIsInstance(X, scipy.sparse.csr_matrix)
+
     def test_download_rowid(self):
         # Smoke test which checks that the dataset has the row-id set correctly
         did = 164