Skip to content

Commit c617d6d

Browse files
Prtm2110 and PGijsbers authored
[ENH] Adds ontology information to /datasets/feature (#255)
#237 --------- Co-authored-by: Pieter Gijsbers <p.gijsbers@tue.nl>
1 parent f94808c commit c617d6d

4 files changed

Lines changed: 28 additions & 0 deletions

File tree

src/database/datasets.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Translation from https://github.com/openml/OpenML/blob/c19c9b99568c0fabb001e639ff6724b9a754bbc9/openml_OS/models/api/v1/Api_data.php#L707."""
22

33
import datetime
4+
from collections import defaultdict
45

56
from sqlalchemy import text
67
from sqlalchemy.engine import Row
@@ -134,6 +135,26 @@ async def get_features(dataset_id: int, connection: AsyncConnection) -> list[Fea
134135
return [Feature(**row, nominal_values=None) for row in rows]
135136

136137

138+
async def get_feature_ontologies(
    dataset_id: int,
    connection: AsyncConnection,
) -> dict[int, list[str]]:
    """Return ontology values for a dataset's features, keyed by feature index.

    Args:
        dataset_id: Identifier matched against the `did` column.
        connection: Async database connection on which the query is executed.

    Returns:
        A plain dict mapping each feature `index` to the list of `value`
        entries stored with `description_type` 'ontology'. Features without
        any such row are absent from the mapping.
    """
    rows = await connection.execute(
        text(
            """
            SELECT `index`, `value`
            FROM data_feature_description
            WHERE `did` = :dataset_id AND `description_type` = 'ontology'
            """,
        ),
        parameters={"dataset_id": dataset_id},
    )
    grouped: defaultdict[int, list[str]] = defaultdict(list)
    for row in rows.mappings():
        grouped[row["index"]].append(row["value"])
    # Return a plain dict so that subscripting a feature index without
    # ontologies raises KeyError instead of silently inserting an empty list,
    # which is what a leaked defaultdict would do.
    return dict(grouped)
156+
157+
137158
async def get_feature_values(
138159
dataset_id: int,
139160
*,

src/routers/openml/datasets.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,10 @@ async def get_dataset_features(
294294
assert expdb is not None # noqa: S101
295295
await _get_dataset_raise_otherwise(dataset_id, user, expdb)
296296
features = await database.datasets.get_features(dataset_id, expdb)
297+
ontologies = await database.datasets.get_feature_ontologies(dataset_id, expdb)
298+
for feature in features:
299+
feature.ontology = ontologies.get(feature.index)
300+
297301
for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]:
298302
feature.nominal_values = await database.datasets.get_feature_values(
299303
dataset_id,

src/schemas/datasets/openml.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class Feature(BaseModel):
4040
index: int
4141
name: str
4242
data_type: FeatureType
43+
ontology: list[str] | None = None
4344
is_target: bool
4445
is_ignore: bool
4546
is_row_identifier: bool

tests/routers/openml/migration/datasets_migration_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ async def test_datasets_feature_is_identical(
259259
values = feature.pop(key)
260260
# The old API returns a str if there is only a single element
261261
feature["nominal_value"] = values if len(values) > 1 else values[0]
262+
elif key == "ontology":
263+
del feature[key] # Added back in with follow up PR #262
262264
else:
263265
# The old API formats bool as string in lower-case
264266
feature[key] = str(value) if not isinstance(value, bool) else str(value).lower()

0 commit comments

Comments
 (0)