Skip to content

Feature request - Support for filtering specific schemas for collection filter #239

@ereb2impact

Description

@ereb2impact

Hi,

An example of a collection filter restricted to a specific schema could be:

from decimal import Decimal
import dataframely as dy
import polars as pl
class InvoiceIdSchema(dy.Schema):
    """Base schema declaring the `invoice_id` column that the schemas below inherit."""

    # String column flagged as (part of) the primary key.
    invoice_id = dy.String(primary_key=True)
class InvoiceSchema(InvoiceIdSchema):
    """Invoice schema: `InvoiceIdSchema` plus stay dates, a receipt timestamp and an amount."""

    # All columns below are declared non-nullable.
    admission_date = dy.Date(nullable=False)
    discharge_date = dy.Date(nullable=False)
    received_at = dy.Datetime(nullable=False)
    # Exclusive lower bound of 0, passed as a Decimal.
    amount = dy.Decimal(nullable=False, min_exclusive=Decimal(0))

    @dy.rule()
    def discharge_after_admission(cls) -> pl.Expr:
        """Return an expression that holds where `discharge_date` >= `admission_date`."""
        return pl.col("discharge_date") >= pl.col("admission_date")

    @dy.rule()
    def received_at_after_discharge(cls) -> pl.Expr:
        """Return an expression that holds where the date part of `received_at` >= `discharge_date`."""
        # `received_at` is a datetime, so it is reduced to its date before
        # being compared with the date-typed `discharge_date`.
        return pl.col("received_at").dt.date() >= pl.col("discharge_date")
class DiagnosisSchema(InvoiceIdSchema):
    """Diagnosis schema: `InvoiceIdSchema` plus a diagnosis code, a date and a main-diagnosis flag."""

    # Also flagged as primary key; presumably forms a composite key with the
    # inherited `invoice_id` -- confirm against the dataframely docs.
    # The pattern is one uppercase letter followed by two to four digits.
    diagnosis_code = dy.String(primary_key=True, regex=r"[A-Z][0-9]{2,4}")
    diagnosis_date = dy.Date(nullable=False)
    is_main = dy.Bool(nullable=False)

    @dy.rule(group_by=["invoice_id"])
    def exactly_one_main_diagnosis(cls) -> pl.Expr:
        """Return an expression that holds where `is_main` sums to exactly 1.

        The rule is declared with `group_by=["invoice_id"]`, so the sum is
        presumably taken per invoice -- confirm against the dataframely docs.
        """
        return pl.col("is_main").sum() == 1
class HospitalClaims(dy.Collection):
    """Collection with an `invoices` member and a `diagnoses` member.

    Illustrates the requested API: a collection filter scoped to selected
    members via `members=[...]`, next to a collection-level rule.
    """

    invoices: dy.LazyFrame[InvoiceSchema]
    diagnoses: dy.LazyFrame[DiagnosisSchema]

    @dy.filter(members=["diagnoses"])  # Apply filter only to diagnoses member
    def diagnosis_date_after_admission(self) -> pl.LazyFrame:
        """Keep diagnoses dated on or after the admission date of their invoice.

        The admission date lives on the invoices member, so it is joined onto
        the diagnoses by `invoice_id` before the comparison.

        Returns: LazyFrame holding the diagnoses rows that pass the check
        """
        diagnoses = self.diagnoses
        admission_dates = self.invoices.select(["invoice_id", "admission_date"])
        with_admission = diagnoses.join(
            admission_dates,
            on="invoice_id",
            how="left",
        )
        on_or_after_admission = pl.col("diagnosis_date") >= pl.col("admission_date")
        return with_admission.filter(on_or_after_admission)

    @dy.rule()  # Collection-level validation across members
    def diagnosis_before_discharge(self) -> pl.LazyFrame:
        """Find diagnoses dated after the discharge date of their invoice.

        Returns: LazyFrame holding the violating rows (diagnosis_date > discharge_date)
        """
        diagnoses = self.diagnoses
        discharge_dates = self.invoices.select(["invoice_id", "discharge_date"])
        with_discharge = diagnoses.join(
            discharge_dates,
            on="invoice_id",
            how="inner",
        )
        after_discharge = pl.col("diagnosis_date") > pl.col("discharge_date")
        return with_discharge.filter(after_discharge)

Requested feature: the ability to apply a collection filter to only specific member schemas, rather than to every member of the collection.

Metadata

Metadata

Labels

No labels
No labels

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions