@@ -258,7 +258,6 @@ def _get_cached_dataset_description(self, did):
258258 self ._private_directory_datasets ]:
259259 did_cache_dir = os .path .join (dataset_cache_dir , str (did ))
260260 description_file = os .path .join (did_cache_dir , "description.xml" )
261-
262261 try :
263262 with open (description_file ) as fh :
264263 dataset_xml = fh .read ()
@@ -735,7 +734,12 @@ def download_task(self, task_id):
735734 task = self ._create_task_from_xml (task_xml )
736735
737736 self .download_split (task )
738- self .download_dataset (task .dataset_id )
737+ dataset = self .download_dataset (task .dataset_id )
738+
739+ # TODO look into either adding the class labels to task xml, or other
740+ # way of reading it.
741+ class_labels = self .retrieve_class_labels_for_dataset (dataset )
742+ task .class_labels = class_labels
739743 return task
740744
741745 def _create_task_from_xml (self , xml ):
@@ -882,19 +886,17 @@ def _read_url(self, url, data=None, file_dictionary=None):
882886 connection = urlopen (url , data = data )
883887 return_code = connection .getcode ()
884888 content_type = connection .info ()['Content-Type' ]
885- # TODO maybe switch on the unicode flag!
886889 match = re .search (r'text/([\w-]*)(; charset=([\w-]*))?' , content_type )
887890 if match :
888891 if match .groups ()[2 ] is not None :
889892 encoding = match .group (3 )
890893 else :
891- encoding = "ascii "
894+ encoding = "utf8 "
892895 else :
893896 # TODO ask JAN why this happens
894897 logger .warn ("Data from %s has content type %s; going to treat "
895898 "this as ascii." % (url , content_type ))
896- encoding = "ascii"
897-
899+ encoding = "utf8"
898900 tmp = tempfile .NamedTemporaryFile (mode = 'w' , delete = False )
899901 with tmp as fh :
900902 while True :
@@ -928,30 +930,77 @@ def upload_dataset(self, description, file_path=None):
928930 raise e
929931 return return_code , dataset_xml
930932
def upload_flow(self, description, source_file_path=None):
    """Upload a flow (an implementation, e.g. an algorithm or script) to OpenML.

    Parameters
    ----------
    description : str
        XML description of the flow, conforming to the (outdated) XSD schema:
        https://github.com/openml/website/blob/master/openml_OS/views/pages/rest_api/xsd/openml.implementation.upload.xsd
    source_file_path : str, optional
        Absolute path to the file that implements the flow (e.g. a script).
        When None, no source file is attached to the upload.

    Returns
    -------
    return_code : int
        HTTP status code of the API call.
    response_xml : str
        XML response returned by the server.

    Raises
    ------
    URLError
        If the API call fails; the error is printed and re-raised.
    """
    data = {'description': description}
    # Only build a file dictionary when a source file was actually supplied,
    # so the API call does not attempt to upload a non-existent file.
    file_dictionary = None
    if source_file_path is not None:
        file_dictionary = {'source': source_file_path}
    try:
        return_code, response_xml = self._perform_api_call(
            "/flow/", data=data, file_dictionary=file_dictionary)
    except URLError as e:
        # TODO use logger.debug instead of print
        print(e)
        raise
    return return_code, response_xml
941954
def upload_run(self, prediction_file_path, description_path):
    """Upload the result of a run (predictions plus run description) to OpenML.

    Parameters
    ----------
    prediction_file_path : str
        Path to the ARFF file containing the predictions of the run.
    description_path : str
        Path to the XML file describing the run.

    Returns
    -------
    return_code : int
        HTTP status code of the API call.
    response_xml : str
        XML response returned by the server.

    Raises
    ------
    URLError
        If the API call fails; the error is printed and re-raised.
    """
    file_dictionary = {'predictions': prediction_file_path,
                       'description': description_path}
    try:
        return_code, response_xml = self._perform_api_call(
            "/run/", file_dictionary=file_dictionary)
    except URLError as e:
        # TODO use logger.debug instead of print
        print(e)
        raise
    return return_code, response_xml
def check_flow_exists(self, name, version):
    """Retrieve the flow id of the flow uniquely identified by name + version.

    http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version

    Parameters
    ----------
    name : str
        Name of the flow; must be a non-empty string.
    version : str
        Version of the flow; must be a non-empty string.

    Returns
    -------
    return_code : int
        HTTP status code of the API call.
    xml_response : str
        Raw XML response returned by the server.
    flow_id
        The flow id parsed from the response when the call returned 200;
        -2 when the server did not give a well-formed (200) response.
        (The server itself reports -1 when no such flow exists.)

    Raises
    ------
    ValueError
        If name or version is not a non-empty string.
    URLError
        If the API call fails; the error is printed and re-raised.
    """
    # TODO: consider replacing the -1/-2 convention with proper exceptions.
    if not (isinstance(name, str) and len(name) > 0):
        raise ValueError('Parameter \'name\' should be a non-empty string')
    if not (isinstance(version, str) and len(version) > 0):
        raise ValueError('Parameter \'version\' should be a non-empty string')

    flow_id = -2  # sentinel: no well-formed response seen yet
    try:
        return_code, xml_response = self._perform_api_call(
            "/flow/exists/%s/%s" % (name, version))
        if return_code == 200:
            xml_dict = xmltodict.parse(xml_response)
            flow_id = xml_dict['oml:flow_exists']['oml:id']
    except URLError as e:
        # TODO use logger.debug instead of print
        print(e)
        raise
    return return_code, xml_response, flow_id
def retrieve_class_labels_for_dataset(self, dataset):
    """Read the dataset's ARFF file and return its class labels.

    Parameters
    ----------
    dataset
        Dataset object; only its ``data_file`` attribute (path to the
        ARFF file on disk) is used.

    Returns
    -------
    The labels of the attribute named 'class', or None when no such
    attribute exists (for example a regression problem).
    """
    # TODO improve performance: this decodes the whole ARFF file although
    # only the attribute declarations in the header are needed.
    with open(dataset.data_file) as fh:
        arff_data = arff.ArffDecoder().decode(fh)

    # NOTE(review): assumes the target attribute is literally named
    # 'class' -- a dataset with a differently named target returns None.
    attributes = dict(arff_data['attributes'])
    return attributes.get('class')
0 commit comments