Skip to content

Commit ef3e4d1

Browse files
amueller authored and mfeurer committed
add validation for strings in datasets (#822)
* add validation for strings in datasets
* add tests, allow None
* document where I got the regex from
1 parent b96c564 commit ef3e4d1

2 files changed

Lines changed: 27 additions & 2 deletions

File tree

openml/datasets/dataset.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from collections import OrderedDict
2+
import re
23
import gzip
34
import io
45
import logging
@@ -108,7 +109,17 @@ def __init__(self, name, description, format=None,
108109
paper_url=None, update_comment=None,
109110
md5_checksum=None, data_file=None, features=None,
110111
qualities=None, dataset=None):
111-
112+
if description and not re.match("^[\x00-\x7F]*$", description):
113+
# not basiclatin (XSD complains)
114+
raise ValueError("Invalid symbols in description: {}".format(
115+
description))
116+
if citation and not re.match("^[\x00-\x7F]*$", citation):
117+
# not basiclatin (XSD complains)
118+
raise ValueError("Invalid symbols in citation: {}".format(
119+
citation))
120+
if not re.match("^[a-zA-Z0-9_\\-\\.\\(\\),]+$", name):
121+
# regex given by server in error message
122+
raise ValueError("Invalid symbols in name: {}".format(name))
112123
# TODO add function to check if the name is casual_string128
113124
# Attributes received by querying the RESTful API
114125
self.dataset_id = int(dataset_id) if dataset_id is not None else None

tests/test_datasets/test_dataset.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,24 @@ def setUp(self):
3131
def test_repr(self):
3232
# create a bare-bones dataset as would be returned by
3333
# create_dataset
34-
data = openml.datasets.OpenMLDataset(name="some name",
34+
data = openml.datasets.OpenMLDataset(name="somename",
3535
description="a description")
3636
str(data)
3737

38+
def test_init_string_validation(self):
39+
with pytest.raises(ValueError, match="Invalid symbols in name"):
40+
openml.datasets.OpenMLDataset(name="some name",
41+
description="a description")
42+
43+
with pytest.raises(ValueError, match="Invalid symbols in description"):
44+
openml.datasets.OpenMLDataset(name="somename",
45+
description="a descriptïon")
46+
47+
with pytest.raises(ValueError, match="Invalid symbols in citation"):
48+
openml.datasets.OpenMLDataset(name="somename",
49+
description="a description",
50+
citation="Something by Müller")
51+
3852
def test_get_data_array(self):
3953
# Basic usage
4054
rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format='array')

0 commit comments

Comments
 (0)