diff --git a/dataframely/columns/_base.py b/dataframely/columns/_base.py index 7b7ce78..abd9221 100644 --- a/dataframely/columns/_base.py +++ b/dataframely/columns/_base.py @@ -49,6 +49,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -79,6 +80,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if nullable and primary_key: raise ValueError("Nullable primary key columns are not supported.") @@ -89,6 +91,7 @@ def __init__( self.check = check self.alias = alias self.metadata = metadata + self.description = description # The name may be overridden by the schema on column access. self._name = "" @@ -277,7 +280,10 @@ def _pydantic_field_kwargs(self) -> dict[str, Any]: Returns: A dictionary of kwargs to pass to pydantic.Field. """ - return {} + kwargs: dict[str, Any] = {} + if self.description is not None: + kwargs["description"] = self.description + return kwargs # ------------------------------------ HELPER ------------------------------------ # @@ -362,6 +368,17 @@ def with_metadata(self, metadata: dict[str, Any]) -> Self: """ return self.with_properties(metadata=metadata) + def with_description(self, description: str) -> Self: + """Return a new column definition with the specified description. + + Args: + description: A human-readable description of the column. + + Returns: + A new column instance with the specified description. + """ + return self.with_properties(description=description) + # ----------------------------------- SAMPLING ----------------------------------- # def sample(self, generator: Generator, n: int = 1) -> pl.Series: @@ -436,7 +453,7 @@ def as_dict(self, expr: pl.Expr) -> dict[str, Any]: else getattr(self, param) ) for param in inspect.signature(self.__class__.__init__).parameters - if param not in ("self", "alias") + if param not in ("self", "alias", "description") }, } @@ -485,8 +502,9 @@ def matches(self, other: Column, expr: pl.Expr) -> bool: for attr in attributes.parameters # NOTE: We do not want to compare the `alias` here as the comparison should # only evaluate the type and its constraints. Names are checked in - # :meth:`Schema.matches`. - if attr not in ("self", "alias") + # :meth:`Schema.matches`. The `description` is also excluded as it is + # human-readable documentation rather than a semantic constraint. + if attr not in ("self", "alias", "description") ) def _attributes_match( @@ -506,7 +524,9 @@ def __repr__(self) -> str: self.__class__.__init__ ).parameters.items() if attribute - not in ["self", "alias"] # alias is always equal to the column name here + # alias is always equal to the column name here; description is + # human-readable documentation rather than a semantic constraint + not in ["self", "alias", "description"] and not ( # Do not include attributes that are set to their default value getattr(self, attribute) == param_details.default diff --git a/dataframely/columns/any.py b/dataframely/columns/any.py index 0974253..740407d 100644 --- a/dataframely/columns/any.py +++ b/dataframely/columns/any.py @@ -30,6 +30,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -53,6 +54,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=True, @@ -60,6 +62,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) @property diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index 48e1412..d00f2a4 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -39,6 +39,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -69,6 +70,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, @@ -77,6 +79,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.inner = inner self.shape = shape if isinstance(shape, tuple) else (shape,) diff --git a/dataframely/columns/categorical.py b/dataframely/columns/categorical.py index 71b7a56..aea7412 100644 --- a/dataframely/columns/categorical.py +++ b/dataframely/columns/categorical.py @@ -27,6 +27,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -59,6 +60,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, @@ -67,6 +69,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) @property diff --git a/dataframely/columns/datetime.py b/dataframely/columns/datetime.py index 5653e37..705a105 100644 --- a/dataframely/columns/datetime.py +++ b/dataframely/columns/datetime.py @@ -46,6 +46,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -88,6 +89,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if resolution is not None: offset_time = pl.Series([EPOCH_DATETIME]).dt.offset_by(resolution).dt.time() @@ -117,6 +119,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.resolution = resolution @@ -188,6 +191,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -230,6 +234,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if resolution is not None: offset_date = pl.Series([EPOCH_DATETIME]).dt.offset_by(resolution).dt.date() @@ -259,6 +264,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.resolution = resolution @@ -338,6 +344,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -384,6 +391,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if resolution is not None and min is not None: if not datetime_matches_resolution(min, resolution): @@ -409,6 +417,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.resolution = resolution self.time_zone = time_zone @@ -509,6 +518,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -552,6 +562,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if resolution is not None and min is not None: if not timedelta_matches_resolution(min, resolution): @@ -577,6 +588,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.resolution = resolution self.time_unit = time_unit diff --git a/dataframely/columns/decimal.py b/dataframely/columns/decimal.py index 6d1ea4c..a160c17 100644 --- a/dataframely/columns/decimal.py +++ b/dataframely/columns/decimal.py @@ -37,6 +37,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -77,6 +78,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if isinstance(min, int): min = decimal.Decimal(min) @@ -107,6 +109,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.precision = precision self.scale = scale diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index 1bc79c6..dea44cf 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -32,6 +32,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -66,6 +67,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, @@ -74,6 +76,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) if isclass(categories) and issubclass(categories, enum.Enum): categories = (item.value for item in categories) diff --git a/dataframely/columns/float.py b/dataframely/columns/float.py index 6dc6a02..77c2853 100644 --- a/dataframely/columns/float.py +++ b/dataframely/columns/float.py @@ -39,6 +39,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -79,6 +80,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if min is not None and min < self.min_value: raise ValueError("Minimum value is too small for the data type.") @@ -99,6 +101,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) @classproperty diff --git a/dataframely/columns/integer.py b/dataframely/columns/integer.py index bfde234..f7ceca7 100644 --- a/dataframely/columns/integer.py +++ b/dataframely/columns/integer.py @@ -35,6 +35,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -75,6 +76,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ if min is not None and min < self.min_value: raise ValueError("`min` is too small for the data type.") @@ -97,6 +99,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) @classproperty diff --git a/dataframely/columns/list.py b/dataframely/columns/list.py index b45581e..8b8f4d8 100644 --- a/dataframely/columns/list.py +++ b/dataframely/columns/list.py @@ -41,6 +41,7 @@ def __init__( min_length: int | None = None, max_length: int | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -77,6 +78,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, @@ -85,6 +87,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.inner = inner self.min_length = min_length diff --git a/dataframely/columns/object.py b/dataframely/columns/object.py index f796e63..e157e95 100644 --- a/dataframely/columns/object.py +++ b/dataframely/columns/object.py @@ -25,6 +25,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -49,12 +50,14 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, check=check, alias=alias, metadata=metadata, + description=description, ) @property diff --git a/dataframely/columns/string.py b/dataframely/columns/string.py index 1e69f72..cce4523 100644 --- a/dataframely/columns/string.py +++ b/dataframely/columns/string.py @@ -33,6 +33,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -69,6 +70,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, @@ -77,6 +79,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.min_length = min_length self.max_length = max_length diff --git a/dataframely/columns/struct.py b/dataframely/columns/struct.py index acd0dc9..dca92b0 100644 --- a/dataframely/columns/struct.py +++ b/dataframely/columns/struct.py @@ -35,6 +35,7 @@ def __init__( check: Check | None = None, alias: str | None = None, metadata: dict[str, Any] | None = None, + description: str | None = None, ): """ Args: @@ -70,6 +71,7 @@ def __init__( this option does _not_ allow to refer to the column with two different names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. + description: A human-readable description of the column. """ super().__init__( nullable=nullable, @@ -78,6 +80,7 @@ def __init__( check=check, alias=alias, metadata=metadata, + description=description, ) self.inner = inner diff --git a/tests/columns/test_description.py b/tests/columns/test_description.py new file mode 100644 index 0000000..2525a92 --- /dev/null +++ b/tests/columns/test_description.py @@ -0,0 +1,99 @@ +# Copyright (c) QuantCo 2024-2026 +# SPDX-License-Identifier: BSD-3-Clause + +from typing import Annotated, get_args, get_origin + +import pytest + +import dataframely as dy +from dataframely._compat import pydantic +from dataframely.columns import Column +from dataframely.testing import ALL_COLUMN_TYPES + +pytestmark = pytest.mark.with_optionals + + +class SchemaWithDescription(dy.Schema): + a = dy.Int64(description="The number of widgets.") + b = dy.String() + + +def test_description_attribute() -> None: + # Act / Assert + assert SchemaWithDescription.a.description == "The number of widgets." + assert SchemaWithDescription.b.description is None + + +def test_with_description() -> None: + # Arrange + col = dy.Int64() + + # Act + updated = col.with_description("hello") + + # Assert + assert col.description is None + assert updated.description == "hello" + + +@pytest.mark.parametrize("column_type", ALL_COLUMN_TYPES) +def test_description_in_pydantic_field(column_type: type[Column]) -> None: + # Arrange + col = column_type(description="my description") + + # Act + field = col.pydantic_field() + + # Assert + assert get_origin(field) is Annotated or hasattr(field, "__metadata__") + metadata = get_args(field)[1:] + field_info = next((m for m in metadata if hasattr(m, "description")), None) + assert field_info is not None + assert field_info.description == "my description" + + +def test_description_propagated_through_model() -> None: + # Arrange + col = dy.Int64(description="The number of widgets.") + model = pydantic.create_model("Test", val=col.pydantic_field()) + + # Act + schema = model.model_json_schema() + + # Assert + assert schema["properties"]["val"]["description"] == "The number of widgets." + + +@pytest.mark.parametrize( + "col", + [ + dy.Object(description="my description"), + dy.List(dy.Int64(), description="my description"), + dy.Array(dy.Int64(), shape=2, description="my description"), + dy.Struct({"x": dy.Int64()}, description="my description"), + dy.Enum(["a", "b"], description="my description"), + ], +) +def test_description_for_compound_columns(col: Column) -> None: + # Act + field = col.pydantic_field() + + # Assert + assert col.description == "my description" + metadata = getattr(field, "__metadata__", ()) + field_info = next((m for m in metadata if hasattr(m, "description")), None) + assert field_info is not None + assert field_info.description == "my description" + + +def test_no_description_no_field_info() -> None: + # Arrange + col = dy.Bool() + + # Act + field = col.pydantic_field() + + # Assert + metadata = getattr(field, "__metadata__", ()) + for m in metadata: + assert getattr(m, "description", None) is None diff --git a/tests/columns/test_matches.py b/tests/columns/test_matches.py index 42c5f4f..c1466e7 100644 --- a/tests/columns/test_matches.py +++ b/tests/columns/test_matches.py @@ -19,6 +19,8 @@ (dy.Int32(), dy.Int32(), True), (dy.Int32(), dy.Int32(alias="foo"), True), (dy.Int32(alias="bar"), dy.Int32(alias="foo"), True), + (dy.Int32(), dy.Int32(description="foo"), True), + (dy.Int32(description="bar"), dy.Int32(description="foo"), True), (dy.String(regex="^a$"), dy.String(regex="^a$"), True), (dy.String(regex="^a$"), dy.String(regex="^b$"), False), (