Skip to content

Commit 34d5a96

Browse files
committed
test md5 hash on dataset download
1 parent 7721835 commit 34d5a96

2 files changed

Lines changed: 25 additions & 0 deletions

File tree

openml/datasets/functions.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from collections import OrderedDict
2+
import hashlib
23
import io
34
import os
45
import re
@@ -365,6 +366,8 @@ def _get_dataset_arff(did_cache_dir, description):
365366
Location of arff file.
366367
"""
367368
output_file_path = os.path.join(did_cache_dir, "dataset.arff")
369+
md5_checksum_fixture = description.get("oml:md5_checksum")
370+
did = description.get("oml:id")
368371

369372
# This means the file is still there; whether it is useful is up to
370373
# the user and not checked by the program.
@@ -377,6 +380,14 @@ def _get_dataset_arff(did_cache_dir, description):
377380

378381
url = description['oml:url']
379382
arff_string = _read_url(url)
383+
md5 = hashlib.md5()
384+
md5.update(arff_string.encode('utf8'))
385+
md5_checksum = md5.hexdigest()
386+
if md5_checksum != md5_checksum_fixture:
387+
raise ValueError(
388+
'Checksum %s of downloaded dataset %d is unequal to the checksum '
389+
'%s sent by the server.' % (md5_checksum, did, md5_checksum_fixture)
390+
)
380391

381392
with io.open(output_file_path, "w", encoding='utf8') as fh:
382393
fh.write(arff_string)

tests/test_datasets/test_dataset_functions.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,20 @@ def test__getarff_path_dataset_arff(self):
200200
self.assertIsInstance(arff_path, str)
201201
self.assertTrue(os.path.exists(arff_path))
202202

203+
def test__getarff_md5_issue(self):
204+
description = {
205+
'oml:id': 5,
206+
'oml:md5_checksum': 'abc',
207+
'oml:url': 'https://www.openml.org/data/download/61',
208+
}
209+
self.assertRaisesRegexp(
210+
ValueError,
211+
'Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded dataset 5 '
212+
'is unequal to the checksum abc sent by the server.',
213+
_get_dataset_arff,
214+
self.workdir, description,
215+
)
216+
203217
def test__get_dataset_features(self):
204218
features = _get_dataset_features(self.workdir, 2)
205219
self.assertIsInstance(features, dict)

0 commit comments

Comments
 (0)