Skip to content

Commit d0deb6d

Browse files
fix: avoid stripping whitespaces for feature names (#1368)
* fix: minimally invasive change to avoid stripping whitespaces for feature names
  Co-authored-by: amastruserio <amastruserio@users.noreply.github.com>
* fix: roll back change to work with older and newer xmltodict versions
* add: test for whitespaces in features xml
---------
Co-authored-by: amastruserio <amastruserio@users.noreply.github.com>
1 parent aa0aca0 commit d0deb6d

4 files changed

Lines changed: 32 additions & 8 deletions

File tree

openml/datasets/dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1077,7 +1077,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]:
10771077

10781078

10791079
def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]:
1080-
xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
1080+
xml_dict = xmltodict.parse(
1081+
features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False
1082+
)
10811083
features_xml = xml_dict["oml:data_features"]
10821084

10831085
features: dict[int, OpenMLDataFeature] = {}

openml/study/functions.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def get_study(
7878
return study
7979

8080

81-
def _get_study(id_: int | str, entity_type: str) -> BaseStudy: # noqa: C901
81+
def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
8282
xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
8383
force_list_tags = (
8484
"oml:data_id",
@@ -93,12 +93,6 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: # noqa: C901
9393
alias = result_dict.get("oml:alias", None)
9494
main_entity_type = result_dict["oml:main_entity_type"]
9595

96-
# Parses edge cases where the server returns a string with a newline character for empty values.
97-
none_value_indicator = "\n "
98-
for key in result_dict:
99-
if result_dict[key] == none_value_indicator:
100-
result_dict[key] = None
101-
10296
if entity_type != main_entity_type:
10397
raise ValueError(
10498
f"Unexpected entity type '{main_entity_type}' reported by the server"

tests/files/misc/features_with_whitespaces.xml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<oml:data_features xmlns:oml="http://openml.org/openml">
2+
<oml:feature>
3+
<oml:index>0</oml:index>
4+
<oml:name>V1</oml:name>
5+
<oml:data_type>numeric</oml:data_type>
6+
<oml:is_target>false</oml:is_target>
7+
<oml:is_ignore>false</oml:is_ignore>
8+
<oml:is_row_identifier>false</oml:is_row_identifier>
9+
<oml:number_of_missing_values>0</oml:number_of_missing_values>
10+
</oml:feature>
11+
<oml:feature>
12+
<oml:index>1</oml:index>
13+
<oml:name>V42</oml:name>
14+
<oml:data_type>nominal</oml:data_type>
15+
<oml:nominal_value> - 50000.</oml:nominal_value>
16+
<oml:nominal_value> 50000+.</oml:nominal_value>
17+
<oml:is_target>false</oml:is_target>
18+
<oml:is_ignore>false</oml:is_ignore>
19+
<oml:is_row_identifier>false</oml:is_row_identifier>
20+
<oml:number_of_missing_values>0</oml:number_of_missing_values>
21+
</oml:feature>
22+
</oml:data_features>

tests/test_datasets/test_dataset_functions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1954,3 +1954,9 @@ def test_get_dataset_with_invalid_id() -> None:
19541954
with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
19551955
openml.datasets.get_dataset(INVALID_ID)
19561956
assert e.value.code == 111
1957+
1958+
def test_read_features_from_xml_with_whitespace() -> None:
1959+
from openml.datasets.dataset import _read_features
1960+
features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
1961+
dict = _read_features(features_file)
1962+
assert dict[1].nominal_values == [" - 50000.", " 50000+."]

0 commit comments

Comments
 (0)