Skip to content

Commit 244c585

Browse files
committed
FIX read and write files with utf-8 encoding

# Conflicts:
#   openml/datasets/dataset.py
1 parent a5b6cfb commit 244c585

6 files changed

Lines changed: 28 additions & 21 deletions

File tree

openml/_api_calls.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import requests
34
import arff
@@ -60,7 +61,7 @@ def _read_url_files(url, file_dictionary=None, file_elements=None):
6061
if key is 'dataset':
6162
# check if arff is valid?
6263
decoder = arff.ArffDecoder()
63-
with open(path) as fh:
64+
with io.open(path, encoding='utf8') as fh:
6465
decoder.decode(fh, encode_nominal=True)
6566
except:
6667
raise ValueError("The file you have provided is not a valid arff file")

openml/datasets/dataset.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import gzip
2+
import io
23
import logging
34
import os
45
import sys
@@ -142,7 +143,7 @@ def decode_arff(fh):
142143
with gzip.open(filename) as fh:
143144
return decode_arff(fh)
144145
else:
145-
with open(filename) as fh:
146+
with io.open(filename, encoding='utf8') as fh:
146147
return decode_arff(fh)
147148

148149
def get_data(self, target=None, target_dtype=int, include_row_id=False,
@@ -244,7 +245,8 @@ def _retrieve_class_labels(self):
244245
# TODO improve performance, currently reads the whole file
245246
# Should make a method that only reads the attributes
246247
arffFileName = self.data_file
247-
with open(arffFileName) as fh:
248+
249+
with io.open(arffFileName, encoding='utf8') as fh:
248250
arffData = arff.ArffDecoder().decode(fh)
249251

250252
dataAttributes = dict(arffData['attributes'])

openml/datasets/functions.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import re
34
import shutil
@@ -88,7 +89,7 @@ def _get_cached_dataset_description(did):
8889
did_cache_dir = os.path.join(cache_dir, "datasets", str(did))
8990
description_file = os.path.join(did_cache_dir, "description.xml")
9091
try:
91-
with open(description_file) as fh:
92+
with io.open(description_file, encoding='utf8') as fh:
9293
dataset_xml = fh.read()
9394
except (IOError, OSError):
9495
continue
@@ -106,7 +107,7 @@ def _get_cached_dataset_arff(did):
106107
output_file = os.path.join(did_cache_dir, "dataset.arff")
107108

108109
try:
109-
with open(output_file):
110+
with io.open(output_file, encoding='utf8'):
110111
pass
111112
return output_file
112113
except (OSError, IOError):
@@ -298,13 +299,13 @@ def _get_dataset_description(did_cache_dir, did):
298299
return_code, dataset_xml = _perform_api_call(
299300
"data/%d" % did)
300301

301-
with open(description_file, "w") as fh:
302+
with io.open(description_file, "w", encoding='utf8') as fh:
302303
fh.write(dataset_xml)
303304

304305
description = xmltodict.parse(dataset_xml)[
305306
"oml:data_set_description"]
306307

307-
with open(description_file, "w") as fh:
308+
with io.open(description_file, "w", encoding='utf8') as fh:
308309
fh.write(dataset_xml)
309310

310311
return description
@@ -337,7 +338,7 @@ def _get_dataset_arff(did_cache_dir, description):
337338
# This means the file is still there; whether it is useful is up to
338339
# the user and not checked by the program.
339340
try:
340-
with open(output_file_path):
341+
with io.open(output_file_path, encoding='utf8'):
341342
pass
342343
return output_file_path
343344
except (OSError, IOError):
@@ -346,7 +347,7 @@ def _get_dataset_arff(did_cache_dir, description):
346347
url = description['oml:url']
347348
return_code, arff_string = _read_url(url)
348349

349-
with open(output_file_path, "w") as fh:
350+
with io.open(output_file_path, "w", encoding='utf8') as fh:
350351
fh.write(arff_string)
351352
del arff_string
352353

@@ -376,13 +377,13 @@ def _get_dataset_features(did_cache_dir, did):
376377

377378
# Dataset features aren't subject to change...
378379
try:
379-
with open(features_file) as fh:
380+
with io.open(features_file, encoding='utf8') as fh:
380381
features_xml = fh.read()
381382
except (OSError, IOError):
382383
return_code, features_xml = _perform_api_call(
383384
"data/features/%d" % did)
384385

385-
with open(features_file, "w") as fh:
386+
with io.open(features_file, "w", encoding='utf8') as fh:
386387
fh.write(features_xml)
387388

388389
features = xmltodict.parse(features_xml)["oml:data_features"]
@@ -411,13 +412,13 @@ def _get_dataset_qualities(did_cache_dir, did):
411412
# Dataset qualities are subject to change and must be fetched every time
412413
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
413414
try:
414-
with open(qualities_file) as fh:
415+
with io.open(qualities_file, encoding='utf8') as fh:
415416
qualities_xml = fh.read()
416417
except (OSError, IOError):
417418
return_code, qualities_xml = _perform_api_call(
418419
"data/qualities/%d" % did)
419420

420-
with open(qualities_file, "w") as fh:
421+
with io.open(qualities_file, "w", encoding='utf8') as fh:
421422
fh.write(qualities_xml)
422423

423424
qualities = xmltodict.parse(qualities_xml)['oml:data_qualities']

openml/runs/run.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import time
23
import arff
34
import xmltodict
@@ -302,7 +303,7 @@ def get_run(run_id):
302303
print(e)
303304
raise e
304305

305-
with open(run_file, "w") as fh:
306+
with io.open(run_file, "w", encoding='utf8') as fh:
306307
fh.write(run_xml)
307308

308309
try:
@@ -312,7 +313,7 @@ def get_run(run_id):
312313
print("Run ID", run_id)
313314
raise e
314315

315-
with open(run_file, "w") as fh:
316+
with io.open(run_file, "w", encoding='utf8') as fh:
316317
fh.write(run_xml)
317318

318319
return run
@@ -405,7 +406,7 @@ def _get_cached_run(run_id):
405406
try:
406407
run_file = os.path.join(run_cache_dir,
407408
"run_%d.xml" % int(run_id))
408-
with open(run_file) as fh:
409+
with io.open(run_file, encoding='utf8') as fh:
409410
run = _create_task_from_xml(xml=fh.read())
410411
return run
411412

openml/tasks/functions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import re
34
from collections import OrderedDict
@@ -38,7 +39,7 @@ def _get_cached_task(tid):
3839
task_file = os.path.join(task_cache_dir, str(tid), "task.xml")
3940

4041
try:
41-
with open(task_file) as fh:
42+
with io.open(task_file, encoding='utf8') as fh:
4243
task = _create_task_from_xml(xml=fh.read())
4344
return task
4445
except (OSError, IOError):
@@ -213,7 +214,7 @@ def get_task(task_id):
213214
"task.xml")
214215

215216
try:
216-
with open(xml_file) as fh:
217+
with io.open(xml_file, encoding='utf8') as fh:
217218
task = _create_task_from_xml(fh.read())
218219
except (OSError, IOError):
219220

@@ -224,7 +225,7 @@ def get_task(task_id):
224225
print(e)
225226
raise e
226227

227-
with open(xml_file, "w") as fh:
228+
with io.open(xml_file, "w", encoding='utf8') as fh:
228229
fh.write(task_xml)
229230

230231
task = _create_task_from_xml(task_xml)

openml/tasks/task.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23

34
from .. import config
@@ -65,7 +66,7 @@ def iterate_all_splits(self):
6566

6667
def _download_split(self, cache_file):
6768
try:
68-
with open(cache_file):
69+
with io.open(cache_file, encoding='utf8'):
6970
pass
7071
except (OSError, IOError):
7172
split_url = self.estimation_procedure["data_splits_url"]
@@ -75,7 +76,7 @@ def _download_split(self, cache_file):
7576
print(e, split_url)
7677
raise e
7778

78-
with open(cache_file, "w") as fh:
79+
with io.open(cache_file, "w", encoding='utf8') as fh:
7980
fh.write(split_arff)
8081
del split_arff
8182

0 commit comments

Comments (0)