Speed up datashader rendering of points (#557)

timtreis · web-flow · commit 44b07206a0e8 · 2026-03-27T14:03:00.000+01:00
diff --git a/src/spatialdata_plot/pl/_datashader.py b/src/spatialdata_plot/pl/_datashader.py
@@ -62,19 +62,27 @@ def _build_datashader_color_key(
     """Build a datashader ``color_key`` dict from a categorical series and its color vector."""
     na_hex = _hex_no_alpha(na_color_hex) if na_color_hex.startswith("#") else na_color_hex
     colors_arr = np.asarray(color_vector, dtype=object)
-    if len(colors_arr) != len(cat_series.codes):
+    categories = np.asarray(cat_series.categories, dtype=str)
+    codes = np.asarray(cat_series.codes)
+
+    if len(colors_arr) != len(codes):
         logger.warning(
-            f"color_vector length ({len(colors_arr)}) does not match categorical series length "
-            f"({len(cat_series.codes)}); some categories may receive the na_color fallback."
+            f"color_vector length ({len(color_vector)}) does not match categorical series length "
+            f"({len(codes)}); some categories may receive the na_color fallback."
         )
+
+    # Use np.unique to find the first occurrence of each category in a single vectorised
+    # call, so the Python loop below runs per category rather than per point.  See #379.
+    unique_codes, first_indices = np.unique(codes, return_index=True)
+
     first_color: dict[str, str] = {}
-    for code, color in zip(cat_series.codes, colors_arr, strict=False):
-        if code < 0:
+    for code, idx in zip(unique_codes, first_indices, strict=True):
+        if code < 0 or idx >= len(colors_arr):
             continue
-        cat_name = str(cat_series.categories[code])
-        if cat_name not in first_color:
-            first_color[cat_name] = _hex_no_alpha(color) if isinstance(color, str) and color.startswith("#") else color
-    return {str(c): first_color.get(str(c), na_hex) for c in cat_series.categories}
+        c = colors_arr[idx]
+        first_color[categories[code]] = _hex_no_alpha(c) if isinstance(c, str) and c.startswith("#") else c
+
+    return {cat: first_color.get(cat, na_hex) for cat in categories}
 
 
 def _inject_ds_nan_sentinel(series: pd.Series, sentinel: str = _DS_NAN_CATEGORY) -> pd.Series:
diff --git a/src/spatialdata_plot/pl/render.py b/src/spatialdata_plot/pl/render.py
@@ -51,6 +51,7 @@
 from spatialdata_plot.pl.utils import (
     _ax_show_and_transform,
     _convert_shapes,
+    _datashader_canvas_from_dataframe,
     _decorate_axs,
     _get_collection_shape,
     _get_colors_for_categorical_obs,
@@ -81,14 +82,15 @@ def _want_decorations(color_vector: Any, na_color: Color) -> bool:
     cv = np.asarray(color_vector)
     if cv.size == 0:
         return False
-    unique_vals = set(cv.tolist())
-    if len(unique_vals) != 1:
+    # Fast check: if any value differs from the first, there is variety → show decorations.
+    first = cv.flat[0]
+    if not (cv == first).all():
         return True
-    only_val = next(iter(unique_vals))
+    # All values are the same — suppress decorations when that value is the NA color.
     na_hex = na_color.get_hex()
-    if isinstance(only_val, str) and only_val.startswith("#") and na_hex.startswith("#"):
-        return _hex_no_alpha(only_val) != _hex_no_alpha(na_hex)
-    return bool(only_val != na_hex)
+    if isinstance(first, str) and first.startswith("#") and na_hex.startswith("#"):
+        return _hex_no_alpha(first) != _hex_no_alpha(na_hex)
+    return bool(first != na_hex)
 
 
 def _reparse_points(
@@ -782,6 +784,10 @@ def _render_points(
     # from the registered points (see above) avoids duplicate-origin ambiguities.
     color_table_name = table_name
 
+    # When color was already loaded from a table (``added_color_from_table``), pass it
+    # directly to avoid a redundant get_values() call inside _set_color_source_vec.
+    _preloaded = points_pd_with_color[col_for_color] if added_color_from_table and col_for_color is not None else None
+
     color_source_vector, color_vector, _ = _set_color_source_vec(
         sdata=sdata_filt,
         element=color_element,
@@ -795,6 +801,7 @@ def _render_points(
         table_name=color_table_name,
         render_type="points",
         coordinate_system=coordinate_system,
+        preloaded_color_data=_preloaded,
     )
 
     if added_color_from_table and col_for_color is not None:
@@ -846,15 +853,16 @@ def _render_points(
         # use dpi/100 as a factor for cases where dpi!=100
         px = int(np.round(np.sqrt(render_params.size) * (fig_params.fig.dpi / 100)))
 
-        # apply transformations
+        # Apply transformations and materialize to pandas immediately so
+        # datashader aggregates without dask scheduler overhead.  See #379.
         transformed_element = PointsModel.parse(
             trans.transform(sdata_filt.points[element][["x", "y"]]),
             annotation=sdata_filt.points[element][sdata_filt.points[element].columns.drop(["x", "y"])],
             transformations={coordinate_system: Identity()},
-        )
+        ).compute()
 
-        plot_width, plot_height, x_ext, y_ext, factor = _get_extent_and_range_for_datashader_canvas(
-            transformed_element, coordinate_system, ax, fig_params
+        plot_width, plot_height, x_ext, y_ext, factor = _datashader_canvas_from_dataframe(
+            transformed_element, ax, fig_params
         )
 
         # use datashader for the visualization of points
@@ -871,7 +879,7 @@ def _render_points(
                     if isinstance(color_source_vector, pd.Series)
                     else pd.Series(color_source_vector, index=series_index)
                 )
-                transformed_element = transformed_element.assign(col_for_color=source_series)
+                transformed_element[col_for_color] = source_series
             else:
                 if isinstance(color_vector, dd.Series):
                     color_vector = color_vector.compute()
@@ -880,8 +888,7 @@ def _render_points(
                     if isinstance(color_vector, pd.Series)
                     else pd.Series(color_vector, index=series_index)
                 )
-                transformed_element = transformed_element.assign(col_for_color=color_series)
-            transformed_element = transformed_element.rename(columns={"col_for_color": col_for_color})
+                transformed_element[col_for_color] = color_series
 
         color_dtype = transformed_element[col_for_color].dtype if col_for_color is not None else None
         color_by_categorical = col_for_color is not None and (
@@ -919,7 +926,7 @@ def _render_points(
             and isinstance(color_vector[0], str)
             and color_vector[0].startswith("#")
         ):
-            color_vector = np.asarray([_hex_no_alpha(x) for x in color_vector])
+            color_vector = np.asarray([_hex_no_alpha(c) for c in color_vector])
 
         nan_shaded = None
         if color_by_categorical or col_for_color is None:
diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py
@@ -1019,6 +1019,7 @@ def _set_color_source_vec(
     table_layer: str | None = None,
     render_type: Literal["points", "labels"] | None = None,
     coordinate_system: str | None = None,
+    preloaded_color_data: pd.Series | None = None,
 ) -> tuple[ArrayLike | pd.Series | None, ArrayLike, bool]:
     if value_to_plot is None and element is not None:
         color = np.full(len(element), na_color.get_hex_with_alpha())
@@ -1046,13 +1047,16 @@ def _set_color_source_vec(
                 element_name=element_name,
                 table_name=table_name,
             )
-        color_source_vector = get_values(
-            value_key=value_to_plot,
-            sdata=sdata,
-            element_name=element_name,
-            table_name=table_name,
-            table_layer=table_layer,
-        )[value_to_plot]
+        if preloaded_color_data is not None:
+            color_source_vector = preloaded_color_data
+        else:
+            color_source_vector = get_values(
+                value_key=value_to_plot,
+                sdata=sdata,
+                element_name=element_name,
+                table_name=table_name,
+                table_layer=table_layer,
+            )[value_to_plot]
 
         color_series = (
             color_source_vector if isinstance(color_source_vector, pd.Series) else pd.Series(color_source_vector)
@@ -2973,15 +2977,16 @@ def set_zero_in_cmap_to_transparent(cmap: Colormap | str, steps: int | None = No
     return ListedColormap(colors)
 
 
-def _get_extent_and_range_for_datashader_canvas(
-    spatial_element: SpatialElement,
-    coordinate_system: str,
+def _compute_datashader_canvas_params(
+    x_ext: list[Any],
+    y_ext: list[Any],
     ax: Axes,
     fig_params: FigParams,
 ) -> tuple[Any, Any, list[Any], list[Any], Any]:
-    extent = get_extent(spatial_element, coordinate_system=coordinate_system)
-    x_ext = [min(0, extent["x"][0]), extent["x"][1]]
-    y_ext = [min(0, extent["y"][0]), extent["y"][1]]
+    """Compute datashader canvas dimensions from spatial extents.
+
+    Shared logic used by both the dask-based and pandas-based entry points.
+    """
     previous_xlim = ax.get_xlim()
     previous_ylim = ax.get_ylim()
     # increase range if sth larger was rendered on the axis before
@@ -3015,6 +3020,33 @@ def _get_extent_and_range_for_datashader_canvas(
     return plot_width, plot_height, x_ext, y_ext, factor
 
 
+def _get_extent_and_range_for_datashader_canvas(
+    spatial_element: SpatialElement,
+    coordinate_system: str,
+    ax: Axes,
+    fig_params: FigParams,
+) -> tuple[Any, Any, list[Any], list[Any], Any]:
+    """Compute datashader canvas params from the extent of ``spatial_element`` in ``coordinate_system``."""
+    extent = get_extent(spatial_element, coordinate_system=coordinate_system)
+    x_ext, y_ext = ([min(0, extent[axis][0]), extent[axis][1]] for axis in ("x", "y"))
+    return _compute_datashader_canvas_params(x_ext, y_ext, ax, fig_params)
+
+
+def _datashader_canvas_from_dataframe(
+    df: pd.DataFrame,
+    ax: Axes,
+    fig_params: FigParams,
+) -> tuple[Any, Any, list[Any], list[Any], Any]:
+    """Derive datashader canvas parameters from an in-memory pandas DataFrame.
+
+    Ranges come from the ``x``/``y`` column min/max (lower bound is at most 0), not ``get_extent()``.
+    """
+    x_col, y_col = df["x"], df["y"]
+    x_ext = [min(0, float(x_col.min())), float(x_col.max())]
+    y_ext = [min(0, float(y_col.min())), float(y_col.max())]
+    return _compute_datashader_canvas_params(x_ext, y_ext, ax, fig_params)
+
+
 def _create_image_from_datashader_result(
     ds_result: ds.transfer_functions.Image | np.ndarray[Any, np.dtype[np.uint8]],
     factor: float,
diff --git a/tests/pl/test_render_points.py b/tests/pl/test_render_points.py
@@ -751,11 +751,6 @@ def test_datashader_alpha_not_applied_twice(sdata_blobs: SpatialData):
     plt.close(fig)
 
 
-# ---------------------------------------------------------------------------
-# Tests for datashader pipeline fixes (parameter forwarding, warnings)
-# ---------------------------------------------------------------------------
-
-
 def _make_ds_canvas_and_df(n=500, seed=42):
     """Small datashader Canvas + DataFrame with x, y, cat, val columns."""
     rng = np.random.default_rng(seed)
@@ -771,6 +766,29 @@ def _make_ds_canvas_and_df(n=500, seed=42):
     return cvs, df
 
 
+def test_datashader_points_categorical_with_nan(sdata_blobs: SpatialData):
+    """Datashader must handle categorical coloring with NaN values.
+
+    Regression test for https://github.com/scverse/spatialdata-plot/issues/379.
+    Exercises the optimised aggregation and color-key paths (pandas DataFrame
+    instead of dask, np.unique-based lookup in _build_datashader_color_key).
+    """
+    n = 200
+    rng = get_standard_RNG()
+    cats = pd.Categorical(rng.choice(["A", "B", None], n))
+    points = sdata_blobs["blobs_points"].compute().head(n).copy()
+    points["cat"] = cats.astype("object")  # force object so PointsModel accepts it
+
+    sdata_blobs.points["test_pts"] = PointsModel.parse(points)
+
+    fig, ax = plt.subplots()
+    sdata_blobs.pl.render_points("test_pts", method="datashader", color="cat").pl.show(ax=ax)
+
+    axes_images = [c for c in ax.get_children() if isinstance(c, matplotlib.image.AxesImage)]
+    assert len(axes_images) > 0, "Datashader should produce at least one AxesImage"
+    plt.close(fig)
+
+
 def test_ds_aggregate_default_reduction_is_forwarded():
     """default_reduction must affect the actual aggregation, not just the log message."""
     cvs, df = _make_ds_canvas_and_df()