Removing dependency on scipy.io.arff (#693)

Neeratyoy · mfeurer · commit 4257c4824d2a · 2019-05-28T13:49:37.000+02:00
* Removing dependency on scipy.io.arff

* Cleaning code

* Loading arff as generator object

* Removing redundant decode

* PEP8
diff --git a/openml/tasks/split.py b/openml/tasks/split.py
@@ -3,7 +3,7 @@
 import pickle
 
 import numpy as np
-import scipy.io.arff
+import arff
 
 
 Split = namedtuple("Split", ["train", "test"])
@@ -77,20 +77,22 @@ def _from_arff_file(cls, filename: str) -> 'OpenMLSplit':
                 raise FileNotFoundError(
                     'Split arff %s does not exist!' % filename
                 )
-            splits, meta = scipy.io.arff.loadarff(filename)
-            name = meta.name
+            file_data = arff.load(open(filename), return_type=arff.DENSE_GEN)
+            splits = file_data['data']
+            name = file_data['relation']
+            attrnames = [attr[0] for attr in file_data['attributes']]
 
             repetitions = OrderedDict()
 
-            type_idx = meta._attrnames.index('type')
-            rowid_idx = meta._attrnames.index('rowid')
-            repeat_idx = meta._attrnames.index('repeat')
-            fold_idx = meta._attrnames.index('fold')
+            type_idx = attrnames.index('type')
+            rowid_idx = attrnames.index('rowid')
+            repeat_idx = attrnames.index('repeat')
+            fold_idx = attrnames.index('fold')
             sample_idx = (
-                meta._attrnames.index('sample')
-                if 'sample' in meta._attrnames
+                attrnames.index('sample')
+                if 'sample' in attrnames
                 else None
-            )  # can be None
+            )
 
             for line in splits:
                 # A line looks like type, rowid, repeat, fold
@@ -108,7 +110,7 @@ def _from_arff_file(cls, filename: str) -> 'OpenMLSplit':
                     repetitions[repetition][fold][sample] = ([], [])
                 split = repetitions[repetition][fold][sample]
 
-                type_ = line[type_idx].decode('utf-8')
+                type_ = line[type_idx]
                 if type_ == 'TRAIN':
                     split[0].append(line[rowid_idx])
                 elif type_ == 'TEST':