Skip to content

Commit 244c585

Browse files
committed
FIX read and write files with utf-8 encoding

# Conflicts:
#   openml/datasets/dataset.py
1 parent a5b6cfb commit 244c585

6 files changed

Lines changed: 28 additions & 21 deletions

File tree

openml/_api_calls.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import requests
34
import arff
@@ -60,7 +61,7 @@ def _read_url_files(url, file_dictionary=None, file_elements=None):
6061
if key is 'dataset':
6162
# check if arff is valid?
6263
decoder = arff.ArffDecoder()
63-
with open(path) as fh:
64+
with io.open(path, encoding='utf8') as fh:
6465
decoder.decode(fh, encode_nominal=True)
6566
except:
6667
raise ValueError("The file you have provided is not a valid arff file")

openml/datasets/dataset.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import gzip
2+
import io
23
import logging
34
import os
45
import sys
@@ -142,7 +143,7 @@ def decode_arff(fh):
142143
with gzip.open(filename) as fh:
143144
return decode_arff(fh)
144145
else:
145-
with open(filename) as fh:
146+
with io.open(filename, encoding='utf8') as fh:
146147
return decode_arff(fh)
147148

148149
def get_data(self, target=None, target_dtype=int, include_row_id=False,
@@ -244,7 +245,8 @@ def _retrieve_class_labels(self):
244245
# TODO improve performance, currently reads the whole file
245246
# Should make a method that only reads the attributes
246247
arffFileName = self.data_file
247-
with open(arffFileName) as fh:
248+
249+
with io.open(arffFileName, encoding='utf8') as fh:
248250
arffData = arff.ArffDecoder().decode(fh)
249251

250252
dataAttributes = dict(arffData['attributes'])

openml/datasets/functions.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import re
34
import shutil
@@ -88,7 +89,7 @@ def _get_cached_dataset_description(did):
8889
did_cache_dir = os.path.join(cache_dir, "datasets", str(did))
8990
description_file = os.path.join(did_cache_dir, "description.xml")
9091
try:
91-
with open(description_file) as fh:
92+
with io.open(description_file, encoding='utf8') as fh:
9293
dataset_xml = fh.read()
9394
except (IOError, OSError):
9495
continue
@@ -106,7 +107,7 @@ def _get_cached_dataset_arff(did):
106107
output_file = os.path.join(did_cache_dir, "dataset.arff")
107108

108109
try:
109-
with open(output_file):
110+
with io.open(output_file, encoding='utf8'):
110111
pass
111112
return output_file
112113
except (OSError, IOError):
@@ -298,13 +299,13 @@ def _get_dataset_description(did_cache_dir, did):
298299
return_code, dataset_xml = _perform_api_call(
299300
"data/%d" % did)
300301

301-
with open(description_file, "w") as fh:
302+
with io.open(description_file, "w", encoding='utf8') as fh:
302303
fh.write(dataset_xml)
303304

304305
description = xmltodict.parse(dataset_xml)[
305306
"oml:data_set_description"]
306307

307-
with open(description_file, "w") as fh:
308+
with io.open(description_file, "w", encoding='utf8') as fh:
308309
fh.write(dataset_xml)
309310

310311
return description
@@ -337,7 +338,7 @@ def _get_dataset_arff(did_cache_dir, description):
337338
# This means the file is still there; whether it is useful is up to
338339
# the user and not checked by the program.
339340
try:
340-
with open(output_file_path):
341+
with io.open(output_file_path, encoding='utf8'):
341342
pass
342343
return output_file_path
343344
except (OSError, IOError):
@@ -346,7 +347,7 @@ def _get_dataset_arff(did_cache_dir, description):
346347
url = description['oml:url']
347348
return_code, arff_string = _read_url(url)
348349

349-
with open(output_file_path, "w") as fh:
350+
with io.open(output_file_path, "w", encoding='utf8') as fh:
350351
fh.write(arff_string)
351352
del arff_string
352353

@@ -376,13 +377,13 @@ def _get_dataset_features(did_cache_dir, did):
376377

377378
# Dataset features aren't subject to change...
378379
try:
379-
with open(features_file) as fh:
380+
with io.open(features_file, encoding='utf8') as fh:
380381
features_xml = fh.read()
381382
except (OSError, IOError):
382383
return_code, features_xml = _perform_api_call(
383384
"data/features/%d" % did)
384385

385-
with open(features_file, "w") as fh:
386+
with io.open(features_file, "w", encoding='utf8') as fh:
386387
fh.write(features_xml)
387388

388389
features = xmltodict.parse(features_xml)["oml:data_features"]
@@ -411,13 +412,13 @@ def _get_dataset_qualities(did_cache_dir, did):
411412
# Dataset qualities are subject to change and must be fetched every time
412413
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
413414
try:
414-
with open(qualities_file) as fh:
415+
with io.open(qualities_file, encoding='utf8') as fh:
415416
qualities_xml = fh.read()
416417
except (OSError, IOError):
417418
return_code, qualities_xml = _perform_api_call(
418419
"data/qualities/%d" % did)
419420

420-
with open(qualities_file, "w") as fh:
421+
with io.open(qualities_file, "w", encoding='utf8') as fh:
421422
fh.write(qualities_xml)
422423

423424
qualities = xmltodict.parse(qualities_xml)['oml:data_qualities']

openml/runs/run.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import time
23
import arff
34
import xmltodict
@@ -302,7 +303,7 @@ def get_run(run_id):
302303
print(e)
303304
raise e
304305

305-
with open(run_file, "w") as fh:
306+
with io.open(run_file, "w", encoding='utf8') as fh:
306307
fh.write(run_xml)
307308

308309
try:
@@ -312,7 +313,7 @@ def get_run(run_id):
312313
print("Run ID", run_id)
313314
raise e
314315

315-
with open(run_file, "w") as fh:
316+
with io.open(run_file, "w", encoding='utf8') as fh:
316317
fh.write(run_xml)
317318

318319
return run
@@ -405,7 +406,7 @@ def _get_cached_run(run_id):
405406
try:
406407
run_file = os.path.join(run_cache_dir,
407408
"run_%d.xml" % int(run_id))
408-
with open(run_file) as fh:
409+
with io.open(run_file, encoding='utf8') as fh:
409410
run = _create_task_from_xml(xml=fh.read())
410411
return run
411412

openml/tasks/functions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import re
34
from collections import OrderedDict
@@ -38,7 +39,7 @@ def _get_cached_task(tid):
3839
task_file = os.path.join(task_cache_dir, str(tid), "task.xml")
3940

4041
try:
41-
with open(task_file) as fh:
42+
with io.open(task_file, encoding='utf8') as fh:
4243
task = _create_task_from_xml(xml=fh.read())
4344
return task
4445
except (OSError, IOError):
@@ -213,7 +214,7 @@ def get_task(task_id):
213214
"task.xml")
214215

215216
try:
216-
with open(xml_file) as fh:
217+
with io.open(xml_file, encoding='utf8') as fh:
217218
task = _create_task_from_xml(fh.read())
218219
except (OSError, IOError):
219220

@@ -224,7 +225,7 @@ def get_task(task_id):
224225
print(e)
225226
raise e
226227

227-
with open(xml_file, "w") as fh:
228+
with io.open(xml_file, "w", encoding='utf8') as fh:
228229
fh.write(task_xml)
229230

230231
task = _create_task_from_xml(task_xml)

openml/tasks/task.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23

34
from .. import config
@@ -65,7 +66,7 @@ def iterate_all_splits(self):
6566

6667
def _download_split(self, cache_file):
6768
try:
68-
with open(cache_file):
69+
with io.open(cache_file, encoding='utf8'):
6970
pass
7071
except (OSError, IOError):
7172
split_url = self.estimation_procedure["data_splits_url"]
@@ -75,7 +76,7 @@ def _download_split(self, cache_file):
7576
print(e, split_url)
7677
raise e
7778

78-
with open(cache_file, "w") as fh:
79+
with io.open(cache_file, "w", encoding='utf8') as fh:
7980
fh.write(split_arff)
8081
del split_arff
8182

0 commit comments

Comments (0)