88
99import numpy as np
1010import scipy .sparse
11+ import xmltodict
1112
1213if sys .version_info [0 ] >= 3 :
1314 import pickle
1718 except :
1819 import pickle
1920
21+
2022from ..util import is_string
2123from .._api_calls import _perform_api_call
2224
@@ -36,7 +38,7 @@ class OpenMLDataset(object):
3638 Description of the dataset
 3739 FIXME : which of these do we actually need?
3840 """
39- def __init__ (self , id = None , name = None , version = None , description = None ,
41+ def __init__ (self , dataset_id = None , name = None , version = None , description = None ,
4042 format = None , creator = None , contributor = None ,
4143 collection_date = None , upload_date = None , language = None ,
4244 licence = None , url = None , default_target_attribute = None ,
@@ -45,7 +47,7 @@ def __init__(self, id=None, name=None, version=None, description=None,
4547 original_data_url = None , paper_url = None , update_comment = None ,
4648 md5_checksum = None , data_file = None ):
4749 # Attributes received by querying the RESTful API
48- self .id = int (id ) if id is not None else None
50+ self .dataset_id = int (dataset_id ) if dataset_id is not None else None
4951 self .name = name
5052 self .version = int (version )
5153 self .description = description
@@ -76,7 +78,7 @@ def __init__(self, id=None, name=None, version=None, description=None,
7678 logger .debug ("Data pickle file already exists." )
7779 else :
7880 try :
79- data = self ._get_arff ()
81+ data = self ._get_arff (self . format )
8082 except OSError as e :
8183 logger .critical ("Please check that the data file %s is there "
8284 "and can be read." , self .data_file )
@@ -100,7 +102,7 @@ def __init__(self, id=None, name=None, version=None, description=None,
100102 with open (self .data_pickle_file , "wb" ) as fh :
101103 pickle .dump ((X , categorical , attribute_names ), fh , - 1 )
102104 logger .debug ("Saved dataset %d: %s to file %s" %
103- (self .id , self .name , self .data_pickle_file ))
105+ (self .dataset_id , self .name , self .data_pickle_file ))
104106
105107 def __eq__ (self , other ):
106108 if type (other ) != OpenMLDataset :
@@ -111,7 +113,7 @@ def __eq__(self, other):
111113 else :
112114 return False
113115
114- def _get_arff (self ):
116+ def _get_arff (self , format ):
115117 """Read ARFF file and return decoded arff.
116118
117119 Reads the file referenced in self.data_file.
@@ -135,9 +137,17 @@ def _get_arff(self):
135137 if bits != 64 and os .path .getsize (filename ) > 120000000 :
136138 return NotImplementedError ("File too big" )
137139
140+ if format .lower () == 'arff' :
141+ return_type = arff .DENSE
142+ elif format .lower () == 'sparse_arff' :
143+ return_type = arff .COO
144+ else :
145+ raise ValueError ('Unknown data format %s' % format )
146+
138147 def decode_arff (fh ):
139148 decoder = arff .ArffDecoder ()
140- return decoder .decode (fh , encode_nominal = True )
149+ return decoder .decode (fh , encode_nominal = True ,
150+ return_type = return_type )
141151
142152 if filename [- 3 :] == ".gz" :
143153 with gzip .open (filename ) as fh :
@@ -190,8 +200,8 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
190200 to_exclude .extend (self .ignore_attributes )
191201
192202 if len (to_exclude ) > 0 :
193- logger .info ("Going to remove the following row_id_attributes :"
194- " %s" % self . row_id_attribute )
203+ logger .info ("Going to remove the following attributes :"
204+ " %s" % to_exclude )
195205 keep = np .array ([True if column not in to_exclude else False
196206 for column in attribute_names ])
197207 data = data [:, keep ]
@@ -239,21 +249,41 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
239249 else :
240250 return rval
241251
242- def _retrieve_class_labels (self ):
243- """Reads the datasets arff to determine the class-labels, and returns those.
244- If the task has no class labels (for example a regression problem) it returns None."""
252+ def retrieve_class_labels (self , target_name = 'class' ):
253+ """Reads the datasets arff to determine the class-labels.
254+
255+ If the task has no class labels (for example a regression problem)
256+ it returns None. Necessary because the data returned by get_data
257+ only contains the indices of the classes, while OpenML needs the real
258+ classname when uploading the results of a run.
259+
260+ Parameters
261+ ----------
262+ target_name : str
263+ Name of the target attribute
264+
265+ Returns
266+ -------
267+ list
268+ """
269+
245270 # TODO improve performance, currently reads the whole file
246271 # Should make a method that only reads the attributes
247272 arffFileName = self .data_file
248273
274+ if self .format .lower () == 'arff' :
275+ return_type = arff .DENSE
276+ elif self .format .lower () == 'sparse_arff' :
277+ return_type = arff .COO
278+ else :
279+ raise ValueError ('Unknown data format %s' % self .format )
280+
249281 with io .open (arffFileName , encoding = 'utf8' ) as fh :
250- arffData = arff .ArffDecoder ().decode (fh )
282+ arffData = arff .ArffDecoder ().decode (fh , return_type = return_type )
251283
252284 dataAttributes = dict (arffData ['attributes' ])
253- if ('class' in dataAttributes ):
254- return dataAttributes ['class' ]
255- elif ('Class' in dataAttributes ):
256- return dataAttributes ['Class' ]
285+ if target_name in dataAttributes :
286+ return dataAttributes [target_name ]
257287 else :
258288 return None
259289
@@ -281,7 +311,8 @@ def publish(self):
281311 "/data/" , file_dictionary = file_dictionary ,
282312 file_elements = file_elements )
283313
284- return return_code , return_value
314+ self .dataset_id = int (xmltodict .parse (return_value )['oml:upload_data_set' ]['oml:id' ])
315+ return self
285316
286317 def _to_xml (self ):
287318 """Serialize object to xml for upload
@@ -292,7 +323,7 @@ def _to_xml(self):
292323 XML description of the data.
293324 """
294325 xml_dataset = ('<oml:data_set_description '
295- 'xmlns:oml="http://openml.org/openml">' )
326+ 'xmlns:oml="http://openml.org/openml">\n ' )
296327 props = ['id' , 'name' , 'version' , 'description' , 'format' , 'creator' ,
297328 'contributor' , 'collection_date' , 'upload_date' , 'language' ,
298329 'licence' , 'url' , 'default_target_attribute' ,
@@ -302,6 +333,6 @@ def _to_xml(self):
302333 for prop in props :
303334 content = getattr (self , prop , None )
304335 if content is not None :
305- xml_dataset += "<oml:{0}>{1}</oml:{0}>" .format (prop , content )
336+ xml_dataset += "<oml:{0}>{1}</oml:{0}>\n " .format (prop , content )
306337 xml_dataset += "</oml:data_set_description>"
307338 return xml_dataset
0 commit comments