Skip to content

Commit 525a523

Browse files
authored
Fix render_points datashader pipeline: dead code, silent failures, and fragile alignment (#560)
1 parent edca5a5 commit 525a523

3 files changed

Lines changed: 204 additions & 2 deletions

File tree

src/spatialdata_plot/pl/_datashader.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ def _build_datashader_color_key(
6262
"""Build a datashader ``color_key`` dict from a categorical series and its color vector."""
6363
na_hex = _hex_no_alpha(na_color_hex) if na_color_hex.startswith("#") else na_color_hex
6464
colors_arr = np.asarray(color_vector, dtype=object)
65+
if len(colors_arr) != len(cat_series.codes):
66+
logger.warning(
67+
f"color_vector length ({len(colors_arr)}) does not match categorical series length "
68+
f"({len(cat_series.codes)}); some categories may receive the na_color fallback."
69+
)
6570
first_color: dict[str, str] = {}
6671
for code, color in zip(cat_series.codes, colors_arr, strict=False):
6772
if code < 0:
@@ -119,6 +124,11 @@ def _agg_call(element: Any, agg_func: Any) -> Any:
119124

120125
if col_for_color is not None:
121126
if color_by_categorical:
127+
if ds_reduction is not None:
128+
logger.warning(
129+
f'ds_reduction="{ds_reduction}" is ignored for categorical data; '
130+
"categorical aggregation always uses count."
131+
)
122132
transformed_element[col_for_color] = _inject_ds_nan_sentinel(transformed_element[col_for_color])
123133
agg = _agg_call(transformed_element, ds.by(col_for_color, ds.count()))
124134
else:
@@ -127,7 +137,9 @@ def _agg_call(element: Any, agg_func: Any) -> Any:
127137
f'Using the datashader reduction "{reduction_name}". "max" will give an output '
128138
"very close to the matplotlib result."
129139
)
130-
agg = _datashader_aggregate_with_function(ds_reduction, cvs, transformed_element, col_for_color, geom_type)
140+
agg = _datashader_aggregate_with_function(
141+
reduction_name, cvs, transformed_element, col_for_color, geom_type
142+
)
131143
reduction_bounds = (agg.min(), agg.max())
132144

133145
nan_elements = transformed_element[transformed_element[col_for_color].isnull()]
@@ -244,7 +256,7 @@ def _ds_shade_categorical(
244256
) -> Any:
245257
"""Shade a categorical or no-color datashader aggregate."""
246258
ds_cmap = None
247-
if color_vector is not None:
259+
if color_key is None and color_vector is not None:
248260
ds_cmap = color_vector[0]
249261
if isinstance(ds_cmap, str) and ds_cmap[0] == "#":
250262
ds_cmap = _hex_no_alpha(ds_cmap)

src/spatialdata_plot/pl/render.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,20 @@ def _reparse_points(
108108
)
109109

110110

111+
def _warn_groups_ignored_continuous(
112+
groups: str | list[str] | None,
113+
color_source_vector: pd.Categorical | None,
114+
col_for_color: str | None,
115+
) -> None:
116+
"""Warn when ``groups`` is set but coloring is continuous (no categorical source)."""
117+
if groups is not None and color_source_vector is None and col_for_color is not None:
118+
logger.warning(
119+
f"`groups` is ignored when coloring by continuous column '{col_for_color}'. "
120+
"`groups` filters categories of the column specified via `color`; "
121+
"it has no effect on continuous data."
122+
)
123+
124+
111125
def _warn_missing_groups(
112126
groups: str | list[str],
113127
color_source_vector: pd.Categorical,
@@ -329,6 +343,8 @@ def _render_shapes(
329343

330344
values_are_categorical = color_source_vector is not None
331345

346+
_warn_groups_ignored_continuous(groups, color_source_vector, col_for_color)
347+
332348
if groups is not None and color_source_vector is not None:
333349
_warn_missing_groups(groups, color_source_vector, col_for_color)
334350

@@ -784,6 +800,8 @@ def _render_points(
784800
if added_color_from_table and col_for_color is not None:
785801
_reparse_points(sdata_filt, element, points_pd_with_color, transformation_in_cs, coordinate_system)
786802

803+
_warn_groups_ignored_continuous(groups, color_source_vector, col_for_color)
804+
787805
if groups is not None and color_source_vector is not None:
788806
_warn_missing_groups(groups, color_source_vector, col_for_color)
789807

@@ -1335,6 +1353,8 @@ def _render_labels(
13351353
else:
13361354
assert color_source_vector is None
13371355

1356+
_warn_groups_ignored_continuous(groups, color_source_vector, col_for_color)
1357+
13381358
if groups is not None and color_source_vector is not None:
13391359
_warn_missing_groups(groups, color_source_vector, col_for_color)
13401360

tests/pl/test_render_points.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import logging
12
import math
23

34
import dask.dataframe
5+
import datashader as ds
46
import matplotlib
57
import matplotlib.pyplot as plt
68
import numpy as np
@@ -23,6 +25,12 @@
2325

2426
import spatialdata_plot # noqa: F401
2527
from spatialdata_plot._logging import logger, logger_warns
28+
from spatialdata_plot.pl._datashader import (
29+
_build_datashader_color_key,
30+
_ds_aggregate,
31+
_ds_shade_categorical,
32+
)
33+
from spatialdata_plot.pl.render import _warn_groups_ignored_continuous
2634
from tests.conftest import DPI, PlotTester, PlotTesterMeta, _viridis_with_under_over, get_standard_RNG
2735

2836
sc.pl.set_rcParams_defaults()
@@ -741,3 +749,165 @@ def test_datashader_alpha_not_applied_twice(sdata_blobs: SpatialData):
741749
"on top of the alpha already in the RGBA channels — causing double transparency."
742750
)
743751
plt.close(fig)
752+
753+
754+
# ---------------------------------------------------------------------------
755+
# Tests for datashader pipeline fixes (parameter forwarding, warnings)
756+
# ---------------------------------------------------------------------------
757+
758+
759+
def _make_ds_canvas_and_df(n=500, seed=42):
    """Return a small datashader Canvas plus a DataFrame with x, y, cat, val columns."""
    canvas = ds.Canvas(plot_width=50, plot_height=50, x_range=(-10, 10), y_range=(-10, 10))
    gen = np.random.default_rng(seed)
    frame = pd.DataFrame(
        {
            "x": gen.uniform(-10, 10, n),
            "y": gen.uniform(-10, 10, n),
            "cat": pd.Categorical(gen.choice(["A", "B", "C"], n)),
            "val": gen.normal(0, 1, n),
        }
    )
    return canvas, frame
772+
773+
774+
def test_ds_aggregate_default_reduction_is_forwarded():
    """default_reduction must affect the actual aggregation, not just the log message."""
    canvas, frame = _make_ds_canvas_and_df()
    # Two different defaults with ds_reduction=None must yield different grids.
    by_sum, _, _ = _ds_aggregate(canvas, frame.copy(), "val", False, None, "sum", "points")
    by_max, _, _ = _ds_aggregate(canvas, frame.copy(), "val", False, None, "max", "points")
    sum_grid = np.nan_to_num(by_sum.values, nan=0)
    max_grid = np.nan_to_num(by_max.values, nan=0)
    assert not np.allclose(sum_grid, max_grid)
783+
784+
785+
def test_ds_aggregate_default_reduction_equals_explicit():
    """default_reduction='max' with ds_reduction=None must equal explicit ds_reduction='max'."""
    canvas, frame = _make_ds_canvas_and_df()
    implicit, _, _ = _ds_aggregate(canvas, frame.copy(), "val", False, None, "max", "points")
    explicit, _, _ = _ds_aggregate(canvas, frame.copy(), "val", False, "max", "max", "points")
    np.testing.assert_array_equal(
        np.nan_to_num(implicit.values, nan=0),
        np.nan_to_num(explicit.values, nan=0),
    )
794+
795+
796+
def test_ds_aggregate_explicit_overrides_default():
    """Explicit ds_reduction takes precedence over default_reduction."""
    canvas, frame = _make_ds_canvas_and_df()
    # ds_reduction='max' with a conflicting default must behave exactly like 'max'.
    overridden, _, _ = _ds_aggregate(canvas, frame.copy(), "val", False, "max", "sum", "points")
    pure_max, _, _ = _ds_aggregate(canvas, frame.copy(), "val", False, "max", "max", "points")
    np.testing.assert_array_equal(
        np.nan_to_num(overridden.values, nan=0),
        np.nan_to_num(pure_max.values, nan=0),
    )
805+
806+
807+
def test_ds_reduction_ignored_for_categorical(caplog):
    """Categorical aggregation always uses ds.count(); a warning is emitted when ds_reduction is set."""
    canvas, frame = _make_ds_canvas_and_df()
    # Passing any reduction for a categorical column must trigger the "ignored" warning.
    with logger_warns(caplog, logger, match="ignored.*categorical"):
        _ds_aggregate(canvas, frame.copy(), "cat", True, "mean", "mean", "points")
812+
813+
814+
def test_ds_reduction_no_warning_when_none(caplog):
    """No spurious warning when ds_reduction is None (the default)."""
    canvas, frame = _make_ds_canvas_and_df()
    with caplog.at_level(logging.WARNING, logger=logger.name):
        logger.addHandler(caplog.handler)
        try:
            _ds_aggregate(canvas, frame.copy(), "cat", True, None, "sum", "points")
        finally:
            logger.removeHandler(caplog.handler)
    assert all("ignored" not in rec.message.lower() for rec in caplog.records)
824+
825+
826+
@pytest.mark.parametrize("reduction", ["mean", "max", "min", "count", "std", "var"])
def test_ds_reduction_categorical_always_uses_count(reduction):
    """Categorical aggregation always uses ds.count(), regardless of ds_reduction (by design)."""
    canvas, frame = _make_ds_canvas_and_df()
    baseline, _, _ = _ds_aggregate(canvas, frame.copy(), "cat", True, "sum", "sum", "points")
    current, _, _ = _ds_aggregate(canvas, frame.copy(), "cat", True, reduction, reduction, "points")
    np.testing.assert_array_equal(current.values, baseline.values)
833+
834+
835+
def test_groups_warns_when_continuous_points(sdata_blobs: SpatialData, caplog):
    """Using groups with a continuous color column should warn."""
    n_points = len(sdata_blobs["blobs_points"])
    # Attach a float column so the coloring is unambiguously continuous.
    sdata_blobs["blobs_points"]["cont_val"] = pd.Series(list(range(n_points)), dtype=float)
    with logger_warns(caplog, logger, match="groups.*ignored.*continuous"):
        sdata_blobs.pl.render_points("blobs_points", color="cont_val", groups=["nonexistent"]).pl.show()
841+
842+
843+
def test_warn_groups_ignored_continuous_emits(caplog):
    """_warn_groups_ignored_continuous emits when groups is set but data is continuous."""
    # groups set, no categorical source, color column present -> must warn.
    with logger_warns(caplog, logger, match="ignored.*continuous"):
        _warn_groups_ignored_continuous(["A"], None, "my_col")
847+
848+
849+
def test_warn_groups_ignored_continuous_silent_for_categorical(caplog):
    """No warning when color_source_vector is present (categorical)."""
    with caplog.at_level(logging.WARNING, logger=logger.name):
        logger.addHandler(caplog.handler)
        try:
            _warn_groups_ignored_continuous(["A"], pd.Categorical(["A", "B"]), "cat_col")
        finally:
            logger.removeHandler(caplog.handler)
    assert all("ignored" not in rec.message for rec in caplog.records)
858+
859+
860+
def test_color_key_warns_on_short_color_vector(caplog):
    """Warning when color_vector is shorter than categorical series."""
    series = pd.Categorical(["A", "B", "C", "A", "B", "C", "A"])
    palette = ["#ff0000", "#00ff00", "#0000ff", "#ff0000", "#00ff00"]
    with logger_warns(caplog, logger, match="color_vector length"):
        key = _build_datashader_color_key(series, palette, "#cccccc")
    # All categories must still appear in the key despite the length mismatch.
    assert {"A", "B", "C"} <= set(key)
866+
867+
868+
def test_color_key_warns_on_long_color_vector(caplog):
    """Warning when color_vector is longer than categorical series."""
    series = pd.Categorical(["A", "B"])
    palette = ["#ff0000", "#00ff00", "#0000ff", "#ffff00"]
    with logger_warns(caplog, logger, match="color_vector length"):
        _build_datashader_color_key(series, palette, "#cccccc")
873+
874+
875+
def test_color_key_no_warning_when_lengths_match(caplog):
    """No warning when lengths match."""
    series = pd.Categorical(["A", "B", "C"])
    with caplog.at_level(logging.WARNING, logger=logger.name):
        logger.addHandler(caplog.handler)
        try:
            _build_datashader_color_key(series, ["#ff0000", "#00ff00", "#0000ff"], "#cccccc")
        finally:
            logger.removeHandler(caplog.handler)
    assert all("color_vector length" not in rec.message for rec in caplog.records)
885+
886+
887+
def test_color_key_unseen_category_gets_na_color(caplog):
    """Categories only appearing after the truncation point get na_color."""
    series = pd.Categorical(["A", "B", "A", "B", "A", "D"])
    palette = ["#ff0000", "#00ff00", "#ff0000", "#00ff00"]
    with logger_warns(caplog, logger, match="color_vector length"):
        key = _build_datashader_color_key(series, palette, "#cccccc")
    # "D" never pairs with a color before the vector runs out -> na_color fallback.
    assert key["D"] == "#cccccc"
893+
894+
895+
def test_shade_categorical_color_key_overrides_cmap():
    """When color_key is provided, different color_vector[0] values must produce identical output."""
    canvas, frame = _make_ds_canvas_and_df(n=100)
    agg = canvas.points(frame, "x", "y", agg=ds.by("cat", ds.count()))
    key = {"A": "#ff0000", "B": "#00ff00", "C": "#0000ff"}
    # The color_vector should be irrelevant once an explicit color_key exists.
    red_first = _ds_shade_categorical(agg, key, np.array(["#ff0000"] * 100), alpha=1.0)
    blue_first = _ds_shade_categorical(agg, key, np.array(["#0000ff"] * 100), alpha=1.0)
    np.testing.assert_array_equal(np.asarray(red_first), np.asarray(blue_first))
904+
905+
906+
def test_shade_categorical_cmap_used_when_no_color_key():
    """When color_key is None (no color column), cmap from color_vector[0] affects output."""
    canvas, frame = _make_ds_canvas_and_df(n=100)
    agg = canvas.points(frame, "x", "y", agg=ds.count())
    shaded_red = _ds_shade_categorical(agg, None, np.array(["#ff0000"] * 100), alpha=1.0)
    shaded_blue = _ds_shade_categorical(agg, None, np.array(["#0000ff"] * 100), alpha=1.0)
    # With no color_key, color_vector[0] acts as the cmap, so outputs must differ.
    assert not np.array_equal(np.asarray(shaded_red), np.asarray(shaded_blue))

0 commit comments

Comments
 (0)