diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c5cf390..0813f1fe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,6 +27,17 @@ repos: args: [--fix] - id: ruff-format + # Type-check with mypy --strict (config lives in pyproject [tool.mypy]). + # Pinned to the same major as CI's `mypy<2`. httpx/anyio are installed into + # the isolated hook env so their types resolve — without them mypy falls back + # to `Any` and mis-reports (the runtime deps aren't in the hook's venv). + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.20.2 + hooks: + - id: mypy + pass_filenames: false + additional_dependencies: [httpx, anyio] + # Strip cell outputs + execution_count from notebooks on commit so the # diff is the source, not the rendered run. Demos still execute fine # locally; clean commits keep PRs reviewable and avoid quota/timestamp diff --git a/README.md b/README.md index c1e8b6fe..d651be6c 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ Like the original R version [`dataRetrieval`](https://github.com/DOI-USGS/dataRetrieval), it retrieves major U.S. Geological Survey (USGS) hydrology data types available on the Web, as well -as data from the Water Quality Portal (WQP) and Network Linked Data Index -(NLDI). +as data from the Water Quality Portal (WQP), the National Ground-Water +Monitoring Network (NGWMN), and the Network Linked Data Index (NLDI). Check the [NEWS](NEWS.md) for all updates and announcements. 
@@ -85,7 +85,7 @@ stream sites in Maryland: ```python # Get monitoring location information df, metadata = waterdata.get_monitoring_locations( - state_name='Maryland', + state='Maryland', # full name, postal code ('MD'), or FIPS ('24') site_type_code='ST' # Stream sites ) diff --git a/dataretrieval/__init__.py b/dataretrieval/__init__.py index 9dfb1991..45bae98d 100644 --- a/dataretrieval/__init__.py +++ b/dataretrieval/__init__.py @@ -19,7 +19,9 @@ A failed request raises a subclass of :class:`dataretrieval.DataRetrievalError` (the taxonomy lives in ``dataretrieval.exceptions``); connection-level failures -(timeouts, DNS) are wrapped as :class:`dataretrieval.NetworkError`. +(timeouts, DNS) are wrapped as :class:`dataretrieval.NetworkError`. A large +request interrupted mid-stream raises :class:`dataretrieval.ChunkInterrupted`, +whose ``.call.resume()`` continues from the work already completed. """ from importlib.metadata import PackageNotFoundError, version @@ -42,9 +44,22 @@ URLTooLong, ) +# Resumable chunk-interruption exceptions. They are defined in +# ``dataretrieval.ogc.chunking`` rather than ``dataretrieval.exceptions`` +# because they carry pandas/httpx state and a resumable ``ChunkedCall`` handle, +# which would pull heavy dependencies into the lightweight exceptions module. +# Surfaced here so callers get a stable public path: +# ``from dataretrieval import ChunkInterrupted``. +from dataretrieval.ogc.chunking import ( + ChunkInterrupted, + QuotaExhausted, + ServiceInterrupted, +) + from . 
import ( exceptions, nadp, + ngwmn, nwis, samples, streamstats, @@ -56,6 +71,7 @@ __all__ = [ # service modules "nadp", + "ngwmn", "nwis", "samples", "streamstats", @@ -75,5 +91,9 @@ "TransientError", "URLTooLong", "Unchunkable", + # resumable chunk-interruption exceptions (defined in ogc.chunking) + "ChunkInterrupted", + "QuotaExhausted", + "ServiceInterrupted", "__version__", ] diff --git a/dataretrieval/codes/states.py b/dataretrieval/codes/states.py index 5d761736..8bb587ad 100644 --- a/dataretrieval/codes/states.py +++ b/dataretrieval/codes/states.py @@ -1,10 +1,18 @@ -"""State code lookups keyed by full state name. +"""State code lookups and normalization, keyed by full state name. ``state_codes`` maps each state name to its two-letter postal abbreviation (e.g. ``"Alabama": "al"``); ``fips_codes`` maps it to its two-digit FIPS -code (e.g. ``"Alabama": "01"``). +code (e.g. ``"Alabama": "01"``). :func:`to_state` normalizes a state +identifier -- a full name, postal code, or two-digit / ``US:``-prefixed FIPS +code (or an iterable of them) -- to a chosen representation, raising +``ValueError`` on an unrecognized value. Coverage is the 50 states plus the +District of Columbia. """ +from __future__ import annotations + +from collections.abc import Iterable + state_codes = { "Alabama": "al", "Alaska": "ak", @@ -112,3 +120,66 @@ "Wisconsin": "55", "Wyoming": "56", } + +# Reverse lookups (built once): postal code -> name, FIPS code -> name, and a +# case-insensitive full-name index. ``state_codes`` and ``fips_codes`` share the +# same keys, so any name resolved here is valid in both. +_name_by_postal = {code: name for name, code in state_codes.items()} +_name_by_fips = {fips: name for name, fips in fips_codes.items()} +_name_by_lower = {name.lower(): name for name in state_codes} + + +def to_state(value: str | Iterable[str], to: str = "name") -> str | list[str]: + """Normalize a US state/territory identifier to a chosen representation. 
+ + ``value`` may be given as a full name (``"Wisconsin"``), a two-letter + postal code (``"WI"``), a two-digit ANSI/FIPS code (``"55"``), or a + prefixed FIPS code (``"US:55"``). The encodings are unambiguous: a value + prefixed ``US:`` or all-digits is a FIPS code, exactly two letters is a + postal code, anything else is matched (case-insensitively) as a full name. + An iterable of identifiers is resolved element-wise to a list. + + ``to`` selects the output representation: + + * ``"name"`` -> full name, e.g. ``"Wisconsin"`` + * ``"postal"`` -> uppercase two-letter code, e.g. ``"WI"`` + * ``"fips"`` -> two-digit ANSI/FIPS code, e.g. ``"55"`` + * ``"fips_us"`` -> ``"US:"`` + FIPS code, e.g. ``"US:55"`` + + Coverage is the 50 states plus the District of Columbia. A ``value`` that + isn't a recognized state in one of those encodings raises ``ValueError`` + (so a typo fails fast rather than silently matching nothing). + """ + if isinstance(value, str): + return _to_state_one(value, to) + return [_to_state_one(v, to) for v in value] + + +def _to_state_one(value: str, to: str) -> str: + """Resolve a single state identifier; see :func:`to_state`.""" + s = value.strip() + if s[:3].upper() == "US:": # prefixed FIPS, e.g. "US:55" + name = _name_by_fips.get(s[3:].strip().zfill(2)) + elif s.isdigit(): # bare FIPS, e.g. "55" + name = _name_by_fips.get(s.zfill(2)) + elif len(s) == 2 and s.isalpha(): # postal, e.g. "WI" + name = _name_by_postal.get(s.lower()) + else: # full name (case-insensitive) + name = _name_by_lower.get(s.lower()) + + if name is None: + raise ValueError( + f"{value!r} is not a recognized US state or the District of " + f'Columbia. Provide a full name ("Wisconsin"), a two-letter postal ' + f'code ("WI"), or a two-digit ANSI/FIPS code ("55").' 
+ ) + + if to == "name": + return name + if to == "postal": + return state_codes[name].upper() + if to == "fips": + return fips_codes[name] + if to == "fips_us": + return f"US:{fips_codes[name]}" + raise ValueError(f"to must be 'name', 'postal', 'fips', or 'fips_us'; got {to!r}") diff --git a/dataretrieval/ngwmn.py b/dataretrieval/ngwmn.py new file mode 100644 index 00000000..879962c8 --- /dev/null +++ b/dataretrieval/ngwmn.py @@ -0,0 +1,422 @@ +"""National Ground-Water Monitoring Network (NGWMN) getters. + +The NGWMN exposes its data through a dedicated OGC API +(``https://api.waterdata.usgs.gov/ngwmn/ogcapi``) with five collections: +``sites``, ``waterLevelObs``, ``lithologyObs``, ``constructionObs``, and +``providers``. Each getter below delegates to the shared OGC engine +(:func:`~dataretrieval.ogc.engine.get_ogc_data`) with +``base_url=NGWMN_OGC_API_URL``, so multi-value chunking, pagination, +retry/resume, and result shaping all behave exactly as they do for the main +Water Data getters. + +Unlike the main Water Data collections, NGWMN aggregates monitoring locations +from many agencies, so ``monitoring_location_id`` values use other agency +prefixes besides ``USGS-`` (e.g. ``MBMG-702934``, ``AKDNR-535134236016630``). + +See https://api.waterdata.usgs.gov/ngwmn/ogcapi for the API reference. +""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +import pandas as pd + +from dataretrieval.codes.states import to_state +from dataretrieval.ogc.engine import BASE_URL, OgcDialect, _get_args, get_ogc_data +from dataretrieval.utils import BaseMetadata + +# The National Ground-Water Monitoring Network exposes its own OGC API at a +# separate, unversioned base. +NGWMN_OGC_API_URL = f"{BASE_URL}/ngwmn/ogcapi" + +# --- state-filter shim ------------------------------------------------------- +# NGWMN's collections expose DIFFERENT state queryables: ``sites`` filters on +# the full ``state_name`` (e.g. 
"Wisconsin"), while ``providers`` filters on the +# two-letter postal ``state`` (uppercase, e.g. "WI"). The state-aware getters +# take a single ``state`` parameter accepting any US-state encoding (full name, +# postal code, or FIPS code); ``_resolve_state`` normalizes it (via +# ``codes.states``) into the one queryable each collection wants. +# +# This shim exists only to smooth over that upstream asymmetry. +# ``tests/ngwmn_test.py::test_state_queryables_still_diverge_upstream`` fails — +# the signal to remove it — if the API ever unifies the two queryables. +_STATE_QUERYABLE = { + # service -> (upstream queryable name, to_state output format it expects) + "sites": ("state_name", "name"), + "providers": ("state", "postal"), +} + + +def _resolve_state(local_vars: dict[str, Any], service: str) -> None: + """Translate the user-facing ``state`` parameter into the single state + queryable the NGWMN ``service`` collection accepts, normalizing whichever + US-state encoding (name, postal, or FIPS) the caller used. + + Mutates ``local_vars`` in place; a no-op for getters that take no ``state``. + """ + given = local_vars.pop("state", None) + if given is None: + return + queryable, fmt = _STATE_QUERYABLE[service] + local_vars[queryable] = to_state(given, fmt) + + +# The NGWMN OGC API exposes the feature id under the generic ``id`` column +# (there is no service-specific id name as there is for the main collections). +_NGWMN_OUTPUT_ID = "id" + +# NGWMN's request shape matches the generic OGC default (no CQL2-only or +# date-only collections), but its result columns need their own coercion and +# sort vocabulary: water-level observations are timestamped by ``sample_time`` +# (not the Water Data ``time``) and report depths/levels in feet. 
+NGWMN_DIALECT = OgcDialect( + time_cols=frozenset({"sample_time"}), + numerical_cols=frozenset( + { + "water_depth_below_land_surface_ft", + "water_level_above_site_datum_ft", + "water_level_above_navd88_ft", + } + ), + sort_cols=("sample_time", "monitoring_location_id"), +) + + +def _get(service: str, local_vars: dict[str, Any]) -> tuple[pd.DataFrame, BaseMetadata]: + """Marshal a getter's arguments and dispatch to the shared OGC engine. + + Every NGWMN getter ends with this same call; centralizing it keeps the + NGWMN base URL, output id, and dialect wired up in exactly one place. + """ + _resolve_state(local_vars, service) + args = _get_args(local_vars) + return get_ogc_data( + args, + service, + output_id=_NGWMN_OUTPUT_ID, + base_url=NGWMN_OGC_API_URL, + dialect=NGWMN_DIALECT, + ) + + +def get_sites( + monitoring_location_id: str | Iterable[str] | None = None, + agency_code: str | Iterable[str] | None = None, + monitoring_location_number: str | Iterable[str] | None = None, + altitude: str | Iterable[str] | None = None, + national_aquifer_code: str | Iterable[str] | None = None, + national_aquifer_description: str | Iterable[str] | None = None, + country_code: str | Iterable[str] | None = None, + country_name: str | Iterable[str] | None = None, + state: str | Iterable[str] | None = None, + county_name: str | Iterable[str] | None = None, + aquifer_name: str | Iterable[str] | None = None, + site_type: str | Iterable[str] | None = None, + aquifer_type_code: str | Iterable[str] | None = None, + qw_sys_name: str | Iterable[str] | None = None, + qw_sn_flag: str | Iterable[str] | None = None, + qw_baseline_flag: str | Iterable[str] | None = None, + qw_well_chars: str | Iterable[str] | None = None, + qw_well_type: str | Iterable[str] | None = None, + qw_well_purpose: str | Iterable[str] | None = None, + wl_sys_name: str | Iterable[str] | None = None, + wl_sn_flag: str | Iterable[str] | None = None, + wl_baseline_flag: str | Iterable[str] | None = None, + wl_well_chars: 
str | Iterable[str] | None = None, + wl_well_type: str | Iterable[str] | None = None, + wl_well_purpose: str | Iterable[str] | None = None, + properties: str | Iterable[str] | None = None, + skip_geometry: bool | None = None, + bbox: list[float] | None = None, + limit: int | None = None, + convert_type: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get NGWMN monitoring-location (site) metadata. + + Site records describe each NGWMN monitoring location — its identifier, + responsible agency, location, aquifer, and whether it participates in the + network's water-quality (``qw_*``) and water-level (``wl_*``) sub-networks. + + Parameters + ---------- + monitoring_location_id : str or iterable of str, optional + One or more agency-qualified site identifiers in ``AGENCY-ID`` form + (e.g. ``"USGS-423114090161101"``, ``"MBMG-702934"``). + agency_code : str or iterable of str, optional + Code of the agency that manages the site. + monitoring_location_number : str or iterable of str, optional + Agency-assigned site number. + altitude : str or iterable of str, optional + Land-surface altitude at the site. + national_aquifer_code, national_aquifer_description : str or iterable, optional + National aquifer code / description. + country_code, country_name : str or iterable, optional + Country filters. + state : str or iterable of str, optional + State/territory filter. Accepts a full name (``"Wisconsin"``), a + two-letter postal code (``"WI"``), or a two-digit ANSI/FIPS code + (``"55"``). + county_name : str or iterable of str, optional + County name filter. + aquifer_name, site_type, aquifer_type_code : str or iterable, optional + Aquifer name, site type, and aquifer-type code. + qw_sys_name, qw_sn_flag, qw_baseline_flag : str or iterable, optional + Water-quality sub-network membership flags. + qw_well_chars, qw_well_type, qw_well_purpose : str or iterable, optional + Water-quality well characteristics, type, and purpose. 
+ wl_sys_name, wl_sn_flag, wl_baseline_flag : str or iterable, optional + Water-level sub-network membership flags. + wl_well_chars, wl_well_type, wl_well_purpose : str or iterable, optional + Water-level well characteristics, type, and purpose. + properties : str or iterable of str, optional + Subset of columns to return. ``None`` (default) returns all columns. + skip_geometry : bool, optional + When ``True``, omit the geometry column. ``None`` (default) leaves the + server default (geometry included). + bbox : list of float, optional + Bounding box ``[minx, miny, maxx, maxy]`` (CRS 4326) to spatially + filter sites. + limit : int, optional + Per-page size; pagination still follows ``next`` links to completion. + convert_type : bool, optional + Whether to coerce column dtypes (default ``True``). + + Returns + ------- + pandas.DataFrame or geopandas.GeoDataFrame + Site metadata, one row per monitoring location. + BaseMetadata + Metadata object with the request URL and query time. + + Examples + -------- + .. code:: + + >>> # All NGWMN sites in Wisconsin + >>> # state accepts a full name, postal code ("WI"), or FIPS ("55") + >>> df, md = dataretrieval.ngwmn.get_sites(state="Wisconsin") + + >>> # Specific sites, geometry omitted + >>> df, md = dataretrieval.ngwmn.get_sites( + ... monitoring_location_id=["USGS-423114090161101", "MBMG-702934"], + ... skip_geometry=True, + ... 
) + """ + return _get("sites", locals()) + + +def get_water_level( + monitoring_location_id: str | Iterable[str] | None = None, + monitoring_location_obs_number: str | Iterable[str] | None = None, + sample_time: str | Iterable[str] | None = None, + data_provided_by: str | Iterable[str] | None = None, + water_depth_below_land_surface_ft: str | Iterable[str] | None = None, + water_level_above_site_datum_ft: str | Iterable[str] | None = None, + monitoring_location_vertical_datum: str | Iterable[str] | None = None, + water_level_above_navd88_ft: str | Iterable[str] | None = None, + datetime: str | Iterable[str] | None = None, + properties: str | Iterable[str] | None = None, + limit: int | None = None, + convert_type: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get NGWMN water-level observations. + + Parameters + ---------- + monitoring_location_id : str or iterable of str, optional + One or more agency-qualified site identifiers (``AGENCY-ID`` form). + monitoring_location_obs_number : str or iterable of str, optional + Per-site observation number; use to subset a site's observations. + sample_time : str or iterable of str, optional + Exact sample-time value(s) to match. For a time *range*, use + ``datetime`` instead. + data_provided_by : str or iterable of str, optional + Source organization for the observation. + water_depth_below_land_surface_ft : str or iterable, optional + Depth-to-water value filter (feet below land surface). + water_level_above_site_datum_ft : str or iterable, optional + Water-level value filter (feet above the site datum). + water_level_above_navd88_ft : str or iterable, optional + Water-level value filter (feet above NAVD 88). + monitoring_location_vertical_datum : str or iterable of str, optional + Vertical datum of the reported water level. + datetime : str or iterable of str, optional + Temporal filter — a single instant or a two-element ``[start, end]`` + range (ISO-8601 dates/datetimes); ``".."`` denotes an open end. 
+ properties : str or iterable of str, optional + Subset of columns to return. ``None`` (default) returns all columns. + limit : int, optional + Per-page size; pagination still follows ``next`` links to completion. + convert_type : bool, optional + Whether to coerce column dtypes (default ``True``). + + Returns + ------- + pandas.DataFrame + Water-level observations, one row per measurement. + BaseMetadata + Metadata object with the request URL and query time. + + Examples + -------- + .. code:: + + >>> site = "USGS-272838082142201" + >>> df, md = dataretrieval.ngwmn.get_water_level( + ... monitoring_location_id=site + ... ) + + >>> # Restrict to a date range + >>> df, md = dataretrieval.ngwmn.get_water_level( + ... monitoring_location_id=site, datetime=["2022-01-01", "2024-01-01"] + ... ) + + >>> # Multiple sites across agencies + >>> df, md = dataretrieval.ngwmn.get_water_level( + ... monitoring_location_id=["USGS-272838082142201", "MBMG-702934"] + ... ) + """ + return _get("waterLevelObs", locals()) + + +def get_lithology( + monitoring_location_id: str | Iterable[str] | None = None, + monitoring_location_obs_number: str | Iterable[str] | None = None, + properties: str | Iterable[str] | None = None, + limit: int | None = None, + convert_type: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get NGWMN lithology observations. + + Lithology records describe the geologic materials logged at a monitoring + location, with depth intervals and controlled lithology concepts. + + Parameters + ---------- + monitoring_location_id : str or iterable of str, optional + One or more agency-qualified site identifiers (``AGENCY-ID`` form). + monitoring_location_obs_number : str or iterable of str, optional + Per-site observation number; use to subset a site's records. + properties : str or iterable of str, optional + Subset of columns to return. ``None`` (default) returns all columns. 
+ limit : int, optional + Per-page size; pagination still follows ``next`` links to completion. + convert_type : bool, optional + Whether to coerce column dtypes (default ``True``). + + Returns + ------- + pandas.DataFrame + Lithology observations, one row per logged interval. + BaseMetadata + Metadata object with the request URL and query time. + + Examples + -------- + .. code:: + + >>> df, md = dataretrieval.ngwmn.get_lithology( + ... monitoring_location_id="AKDNR-535134236016630" + ... ) + """ + return _get("lithologyObs", locals()) + + +def get_well_construction( + monitoring_location_id: str | Iterable[str] | None = None, + monitoring_location_obs_number: str | Iterable[str] | None = None, + material: str | Iterable[str] | None = None, + properties: str | Iterable[str] | None = None, + limit: int | None = None, + convert_type: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get NGWMN well-construction observations. + + Construction records describe a well's physical build-out — casing, + screens, and similar elements — with depth intervals, materials, and + diameters. + + Parameters + ---------- + monitoring_location_id : str or iterable of str, optional + One or more agency-qualified site identifiers (``AGENCY-ID`` form). + monitoring_location_obs_number : str or iterable of str, optional + Per-site observation number; use to subset a site's records. + material : str or iterable of str, optional + Construction-material filter. + properties : str or iterable of str, optional + Subset of columns to return. ``None`` (default) returns all columns. + limit : int, optional + Per-page size; pagination still follows ``next`` links to completion. + convert_type : bool, optional + Whether to coerce column dtypes (default ``True``). + + Returns + ------- + pandas.DataFrame + Well-construction observations, one row per construction element. + BaseMetadata + Metadata object with the request URL and query time. + + Examples + -------- + .. 
code:: + + >>> df, md = dataretrieval.ngwmn.get_well_construction( + ... monitoring_location_id="USGS-272838082142201" + ... ) + """ + return _get("constructionObs", locals()) + + +def get_providers( + state: str | Iterable[str] | None = None, + agency_code: str | Iterable[str] | None = None, + organization_type: str | Iterable[str] | None = None, + properties: str | Iterable[str] | None = None, + limit: int | None = None, + convert_type: bool = True, +) -> tuple[pd.DataFrame, BaseMetadata]: + """Get NGWMN data-provider records. + + Providers are the organizations that contribute data to the network. + + Parameters + ---------- + state : str or iterable of str, optional + State/territory filter. Accepts a full name (``"Wisconsin"``), a + two-letter postal code (``"WI"``), or a two-digit ANSI/FIPS code + (``"55"``). Only one state at a time — a multi-value state filter + returns no records for this collection. + agency_code : str or iterable of str, optional + Provider agency code. + organization_type : str or iterable of str, optional + Provider organization type, e.g. ``"NWIS"``. + properties : str or iterable of str, optional + Subset of columns to return. ``None`` (default) returns all columns. + limit : int, optional + Per-page size; pagination still follows ``next`` links to completion. + convert_type : bool, optional + Whether to coerce column dtypes (default ``True``). + + Returns + ------- + pandas.DataFrame + Provider records, one row per provider. + BaseMetadata + Metadata object with the request URL and query time. + + Examples + -------- + .. code:: + + >>> df, md = dataretrieval.ngwmn.get_providers(state="WI") + + >>> # a full name (or FIPS code) works too + >>> df, md = dataretrieval.ngwmn.get_providers( + ... organization_type="NWIS", state="Wisconsin" + ... 
) + """ + return _get("providers", locals()) diff --git a/dataretrieval/ogc/__init__.py b/dataretrieval/ogc/__init__.py new file mode 100644 index 00000000..6e259bb5 --- /dev/null +++ b/dataretrieval/ogc/__init__.py @@ -0,0 +1 @@ +"""Generic OGC API engine shared by the Water Data and NGWMN getters.""" diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/ogc/chunking.py similarity index 97% rename from dataretrieval/waterdata/chunking.py rename to dataretrieval/ogc/chunking.py index c0cb1cb3..5f41e8e2 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/ogc/chunking.py @@ -1,6 +1,6 @@ -"""Joint URL-byte chunking for the Water Data OGC getters. +"""Joint URL-byte chunking for the OGC getters. -A Water Data query has several chunkable axes: every multi-value list +An OGC query has several chunkable axes: every multi-value list parameter (sites, parameter codes, …) plus the cql-text ``filter``, which splits along its top-level OR clauses. Any of them can fan the URL past the server's ~8 KB byte limit. ``ChunkPlan`` picks a fan-out @@ -62,7 +62,7 @@ import random from collections.abc import Awaitable, Callable, Iterator from contextlib import contextmanager, suppress -from contextvars import ContextVar +from contextvars import ContextVar, copy_context from dataclasses import dataclass from datetime import timedelta from typing import Any, ClassVar, cast @@ -81,7 +81,7 @@ ) from dataretrieval.utils import HTTPX_DEFAULTS -from . import _progress +from . import progress as _progress from .filters import ( _check_numeric_filter_pitfall, _is_chunkable, @@ -91,7 +91,7 @@ # Empirically the API replies HTTP 414 above ~8200 bytes of full URL — # matches nginx's default ``large_client_header_buffers`` of 8 KB. 8000 # leaves ~200 bytes for request-line framing and proxy variance. 
-_WATERDATA_URL_BYTE_LIMIT = 8000 +_OGC_URL_BYTE_LIMIT = 8000 # Any list-shaped kwarg with >1 element is chunked (comma-joined per # sub-list in the URL); ~90 OGC params qualify, so we denylist the few @@ -358,7 +358,7 @@ def get_active_client() -> httpx.AsyncClient | None: Return the chunker's currently-published client, or ``None``. Used by the paginated-loop helpers (e.g. - :func:`dataretrieval.waterdata.utils._client_for`) to reuse the + :func:`dataretrieval.ogc.engine._client_for`) to reuse the per-call connection pool. Returns @@ -449,11 +449,12 @@ class ChunkInterrupted(DataRetrievalError): .. code-block:: python import time - from dataretrieval.waterdata import get_daily - from dataretrieval.waterdata.chunking import ChunkInterrupted + from dataretrieval import ChunkInterrupted + # ``getter`` is any chunked OGC getter — e.g. + # ``waterdata.get_daily`` or ``ngwmn.get_water_level``. try: - df, md = get_daily(monitoring_location_id=long_list_of_sites) + df, md = getter(monitoring_location_id=long_list_of_sites) except ChunkInterrupted as exc: while True: time.sleep(exc.retry_after or 5 * 60) @@ -1367,6 +1368,15 @@ def __init__( self.fetch = fetch self.retry_policy = retry_policy self.finalize = finalize + # Snapshot the ambient context at construction time — i.e. inside the + # caller's ``with`` blocks (base URL, dialect, row cap, progress + # reporter). :meth:`resume` runs every drive inside this snapshot, so + # a *later* ``exc.call.resume()`` — which fires after those ``with`` + # blocks have exited and reset their ContextVars — still rebuilds + # sub-requests against the original API's base URL/dialect rather than + # the process defaults. ``build_request`` reads those ContextVars when + # it reconstructs each sub-request, so the snapshot must outlive them. + self._ctx = copy_context() # Completed (frame, response) pairs keyed by sub-args index; sparse # (gathered sub-requests complete out of order — see class docstring). 
# ``_run``'s ``track`` closure is the only writer, so ``dict`` insertion @@ -1534,6 +1544,17 @@ def resume(self) -> tuple[pd.DataFrame, Any]: handle is on ``exc.call`` — wait for the underlying condition to clear and call ``exc.call.resume()`` again. """ + # Drive inside the snapshot taken at construction (see ``__init__``). + # ``start_blocking_portal`` copies the *calling* context into its + # worker thread, and running here means that calling context is the + # snapshot — so the base URL / dialect / row cap / progress reporter + # active when the call was created reach the rebuilt sub-requests, + # even when this is a resume fired long after the original ``with`` + # blocks exited. + return self._ctx.run(self._resume_in_context) + + def _resume_in_context(self) -> tuple[pd.DataFrame, Any]: + """Body of :meth:`resume`, run inside the captured context.""" concurrency = _read_concurrency_env() with start_blocking_portal() as portal: # ``portal.call`` returns ``Any`` because ``functools.partial`` @@ -1709,7 +1730,7 @@ def multi_value_chunked( measure each candidate plan. url_limit : int, optional Byte budget for the request (URL + body). When ``None`` - (default), the module-level ``_WATERDATA_URL_BYTE_LIMIT`` is + (default), the module-level ``_OGC_URL_BYTE_LIMIT`` is resolved at call time so test patches via ``monkeypatch.setattr`` take effect. 
@@ -1742,7 +1763,7 @@ def wrapper( *, finalize: _Finalize = _passthrough_result, ) -> tuple[pd.DataFrame, Any]: - limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit + limit = _OGC_URL_BYTE_LIMIT if url_limit is None else url_limit plan = ChunkPlan(args, build_request, limit) retry_policy = RetryPolicy.from_env() # The concurrency cap is resolved inside ``resume()`` from diff --git a/dataretrieval/ogc/engine.py b/dataretrieval/ogc/engine.py new file mode 100644 index 00000000..5be6ed38 --- /dev/null +++ b/dataretrieval/ogc/engine.py @@ -0,0 +1,1937 @@ +"""Generic OGC API engine shared by the Water Data and NGWMN getters. + +This module holds the API-agnostic machinery for talking to an OGC API +Features service: request construction (GET comma-joined or POST/CQL2), +async pagination, response shaping, and the chunked fetch entry point +:func:`get_ogc_data`. It is deliberately free of any Water-Data-specific +constants so a sibling package (e.g. NGWMN) can drive it without importing +``dataretrieval.waterdata``. + +API-specific behavior is supplied by the caller: + +* ``output_id`` — the user-facing column the wire ``id`` is renamed to, + passed explicitly (no service map lives here). +* ``base_url`` — the OGC API base to target. +* ``extra_id_cols`` — synthetic id columns to push to the end of a result. +* ``dialect`` — an :class:`OgcDialect` describing which services need + POST/CQL2 and which use date-only (vs. full datetime) time arguments. 
+""" + +from __future__ import annotations + +import copy +import functools +import json +import logging +import numbers +import os +import re +from collections.abc import ( + AsyncIterator, + Awaitable, + Callable, + Iterable, + Iterator, + Mapping, + Sequence, +) +from contextlib import asynccontextmanager, contextmanager +from contextvars import ContextVar +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Any, TypeVar, cast +from zoneinfo import ZoneInfo + +import httpx +import pandas as pd +from anyio.from_thread import start_blocking_portal + +from dataretrieval import __version__ +from dataretrieval.exceptions import DataRetrievalError, RateLimited, error_for_status +from dataretrieval.ogc import chunking +from dataretrieval.ogc import progress as _progress +from dataretrieval.ogc.chunking import ( + _QUOTA_HEADER, + _safe_elapsed, + get_active_client, +) +from dataretrieval.utils import HTTPX_DEFAULTS, BaseMetadata, _get, _network_error + +try: + import geopandas as gpd + + GEOPANDAS = True +except ImportError: + GEOPANDAS = False + +# Set up logger for this module +logger = logging.getLogger(__name__) + +# Whether geopandas is present is a static, environment-level fact, so warn once +# here at import time rather than per query/chunk. That avoids the warning +# repeating on every call and avoids it interleaving with the progress line's +# carriage-return rewrites. +if not GEOPANDAS: + logger.warning( + "Geopandas not installed. Geometries will be flattened into pandas DataFrames." + ) + +BASE_URL = "https://api.waterdata.usgs.gov" +OGC_API_VERSION = "v0" +OGC_API_URL = f"{BASE_URL}/ogcapi/{OGC_API_VERSION}" + + +@dataclass(frozen=True) +class OgcDialect: + """Per-API quirks the generic request builder needs to know about. 
+ + Attributes + ---------- + cql2_services : frozenset[str] + Collections that don't accept comma-separated multi-value GET + parameters and so must be queried via POST with a CQL2 JSON body. + date_only_services : frozenset[str] + Collections whose time arguments are rendered date-only + (``YYYY-MM-DD``) rather than as a full UTC datetime. The + ``last_modified`` parameter is always rendered as a full datetime + regardless of this set. + time_cols : frozenset[str] + Result columns to coerce to datetime when ``convert_type`` is set. + Empty by default, so the generic engine carries no API-specific + column knowledge; each API supplies its own. + numerical_cols : frozenset[str] + Result columns to coerce to numeric when ``convert_type`` is set. + sort_cols : tuple[str, ...] + Columns to sort the combined result by, in priority order. Sorting + is applied only when the first (primary) column is present; any + later columns also present are added as secondary keys. + """ + + cql2_services: frozenset[str] = field(default_factory=frozenset) + date_only_services: frozenset[str] = field(default_factory=frozenset) + time_cols: frozenset[str] = field(default_factory=frozenset) + numerical_cols: frozenset[str] = field(default_factory=frozenset) + sort_cols: tuple[str, ...] = field(default_factory=tuple) + + +# Default dialect: a plain OGC API with no CQL2-only collections and no +# date-only collections (every time argument rendered as a full UTC datetime). +_DEFAULT_DIALECT = OgcDialect() + + +def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str) -> dict[str, Any]: + """ + Switch argument id from its package-specific identifier to the standardized "id" key + that the API recognizes. + + If `ls` does not already have an "id" key, sets it from either the + service-derived id key or the expected id column name. If neither key + exists, "id" is left unset. The original service-specific id keys are + removed regardless. 
+ + Parameters + ---------- + ls : Dict[str, Any] + The dictionary containing identifier keys to be standardized. + id_name : str + The name of the specific identifier key to look for. + service : str + The service name. + + Returns + ------- + Dict[str, Any] + The modified dictionary with the "id" key set appropriately. + + Examples + -------- + For service "time-series-metadata", the function will look for either + "time_series_metadata_id" or "time_series_id" and change the key to simply + "id". + """ + + service_id = service.replace("-", "_") + "_id" + + if "id" not in ls: + if service_id in ls: + ls["id"] = ls[service_id] + elif id_name in ls: + ls["id"] = ls[id_name] + + # Remove the original keys regardless of whether they were used + ls.pop(service_id, None) + ls.pop(id_name, None) + + return ls + + +def _switch_properties_id( + properties: list[str] | None, id_name: str, service: str +) -> list[str]: + """ + Build the wire ``properties`` list, dropping every id alias and + ``geometry``. + + The feature ``id`` is always returned and is renamed to the + service-specific id column (e.g. ``daily_id``) in post-processing, so + it must not be requested as a property: several collections (e.g. + ``daily``, ``continuous``) reject ``id`` in ``properties`` with an + HTTP 400. ``geometry`` is likewise excluded because it is controlled + by ``skip_geometry``. Any service-specific id name (``daily_id``, + ``monitoring_location_id``, …) and the bare ``id`` are dropped, and + remaining hyphens are normalized to underscores. Returns an empty + list when `properties` is empty or None — the URL then omits the + ``properties`` filter and the result is shaped by :func:`_arrange_cols`. + + Parameters + ---------- + properties : Optional[List[str]] + A list containing the properties or column names to be pulled from the + service, or None. + id_name : str + The service-specific id column name to drop (e.g. ``daily_id``). + service : str + The service name. 
+ + Returns + ------- + List[str] + The wire ``properties`` with id aliases and ``geometry`` removed + and hyphens normalized. + + Examples + -------- + For service "daily" with ``properties=["daily_id", "value", "geometry"]``, + returns ``["value"]`` — ``daily_id`` and ``geometry`` are dropped, while + the ``daily_id`` column still appears in the result, renamed from the + always-returned feature ``id``. + """ + if not properties: + return [] + service_id = service.replace("-", "_") + "_id" + # The feature ``id`` always comes back (renamed to the service id + # downstream) and several collections reject it as a selectable + # property; ``geometry`` is controlled by ``skip_geometry``. Drop both, + # plus the service-specific id column (``id_name``) and the name derived + # straight from the service (``service_id``). + drop = {"id", "geometry", id_name, service_id} + normalized = (p.replace("-", "_") for p in properties) + return [p for p in normalized if p not in drop] + + +_DATETIME_FORMATS = ( + "%Y-%m-%dT%H:%M:%S.%f%z", + "%Y-%m-%dT%H:%M:%S%z", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%d %H:%M:%S.%f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d", +) + +# Anchored to ``[Pp]\d`` so a normal word containing ``p`` (e.g. ``"Apr"``) +# doesn't get mis-classified as an ISO 8601 duration; the optional ``T`` +# admits time-only forms like ``PT36H``. +_DURATION_RE = re.compile(r"^[Pp]T?\d") + +# OGC API parameters that carry a date/datetime value (single string, +# two-element range, or interval/duration string) rather than a multi-value +# string list. Used by ``_construct_api_requests`` to keep them out of the +# POST/CQL2 multi-value path and to route them through ``_format_api_dates``, +# and by the default ``_get_args`` no-normalize set to bypass string-iterable +# normalization. 
+_DATE_RANGE_PARAMS = frozenset( + {"datetime", "last_modified", "begin", "begin_utc", "end", "end_utc", "time"} +) + + +def _parse_datetime(value: str) -> datetime | None: + """Parse a single datetime string against the supported formats. + + Returns a ``datetime`` (tz-aware iff the input carried a UTC offset), + or ``None`` if no format matched. + """ + # ``datetime.strptime`` accepts a numeric offset like ``+00:00`` but not + # the ``Z`` shorthand, so normalize trailing ``Z`` first. + candidate = value[:-1] + "+00:00" if value.endswith("Z") else value + for fmt in _DATETIME_FORMATS: + try: + return datetime.strptime(candidate, fmt) + except ValueError: + continue + return None + + +def _format_one(dt: str | None, *, date: bool) -> str | None: + """Format a single datetime element for inclusion in the API time arg.""" + if pd.isna(dt) or dt == "" or dt is None: + return ".." + parsed = _parse_datetime(dt) + if parsed is None: + return None + if date: + return parsed.strftime("%Y-%m-%d") + # Naive inputs are interpreted in the system local zone (for backwards + # compatibility). Use ``.astimezone()`` rather than a fixed offset so each + # value is resolved against the DST rules for ITS OWN date — a frozen + # ``datetime.now()`` offset shifted off-season inputs by an hour. + aware = parsed if parsed.tzinfo is not None else parsed.astimezone() + return aware.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _format_api_dates( + datetime_input: str | Sequence[str | None] | None, date: bool = False +) -> str | None: + """ + Formats date or datetime input(s) for use with an API. + + Handles single values or ranges, and converting to ISO 8601 or date-only + formats as needed. + + Parameters + ---------- + datetime_input : Union[str, List[Optional[str]], None] + A single date/datetime string or a list of one or two date/datetime + strings. 
Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601 (with or + without ``Z``/numeric offset), or relative periods (e.g., "P7D" / + "PT36H"). Range endpoints may be ``None``/``NaN``/empty to denote a + half-bounded range. + date : bool, optional + If True, uses only the date portion ("YYYY-MM-DD"). If False (default), + returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ"). + + Returns + ------- + Union[str, None] + - If input is a single value, returns the formatted date/datetime string + or None if parsing fails. + - If input is a list of two values, returns a date/datetime range string + separated by "/" (e.g., "YYYY-MM-DD/YYYY-MM-DD" or + "YYYY-MM-DDTHH:MM:SSZ/YYYY-MM-DDTHH:MM:SSZ"). + - Returns None if input is empty, all NA, or cannot be parsed. + + Raises + ------ + ValueError + If `datetime_input` contains more than two values. + + Notes + ----- + - A single blank/NA value returns None. In a two-value range, a blank/NA + endpoint is rendered as ``".."`` to denote an open bound (e.g. + ``"2024-01-01/.."``); the range is only None when *every* element is + blank/NA or any non-NA element fails to parse. + - Supports ISO 8601 durations such as "P7D" and "PT36H" and pre-formatted + intervals containing ``"/"``; both are passed through unchanged. + - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when + `date` is False. Inputs with an explicit offset (``Z`` or ``+HH:MM``) are + converted from that offset to UTC; naive inputs are interpreted in the + local time zone for backwards compatibility. + """ + if datetime_input is None: + return None + + # Convert single string to list for uniform processing + if isinstance(datetime_input, str): + datetime_input = [datetime_input] + elif isinstance(datetime_input, Mapping): + # `list(mapping)` returns keys, which silently accepts the wrong shape. + raise TypeError( + f"date input must be a string or sequence of strings, " + f"not {type(datetime_input).__name__}." 
+ ) + elif not isinstance(datetime_input, (list, tuple)): + # Materialize any other iterable (pandas.Series, numpy.ndarray, + # generator, ...) so the len()/subscript operations below work. + datetime_input = list(datetime_input) + + # Check for null or all NA and return None + if all(pd.isna(dt) or dt == "" or dt is None for dt in datetime_input): + return None + + if len(datetime_input) > 2: + raise ValueError("datetime_input should only include 1-2 values") + + # Pass through duration ("P7D", "PT36H") and pre-formatted interval ("a/b") + # strings untouched. + if len(datetime_input) == 1 and isinstance(datetime_input[0], str): + single = datetime_input[0] + if _DURATION_RE.match(single) or "/" in single: + return single + + # Format each element in turn; any element that ``_format_one`` cannot + # parse (it returns ``None``) invalidates the whole range. + formatted: list[str] = [] + for dt in datetime_input: + one = _format_one(dt, date=date) + if one is None: + return None + formatted.append(one) + return "/".join(formatted) + + +def _cql2_param(args: dict[str, Any]) -> str: + """ + Convert query parameters to CQL2 JSON format for POST requests. + + Parameters + ---------- + args : Dict[str, Any] + Dictionary of query parameters to convert to CQL2 format. + + Returns + ------- + str + Compact JSON string representation of the CQL2 query. + + Notes + ----- + Serialized with the tightest separators (no indentation or + whitespace). The body counts against the server's ~8 KB request-size + limit and against :func:`chunking._request_bytes` when planning + chunks, so every saved byte fits more values per POST: compact + encoding roughly halves the per-value cost versus pretty-printing, + which roughly doubles how many monitoring-location ids fit in one + sub-request and so halves the chunk count for large id lists. 
+ """ + filters = [] + for key, values in args.items(): + filters.append({"op": "in", "args": [{"property": key}, values]}) + + query = {"op": "and", "args": filters} + + return json.dumps(query, separators=(",", ":")) + + +def _default_headers() -> dict[str, str]: + """ + Generate default HTTP headers for API requests. + + Returns + ------- + dict + A dictionary containing default headers including 'Accept-Encoding', + 'Accept', 'User-Agent', and 'lang'. If the environment variable + 'API_USGS_PAT' is set, its value is included as the 'X-Api-Key' header. + """ + headers = { + "Accept-Encoding": "compress, gzip", + "Accept": "application/json", + "User-Agent": f"python-dataretrieval/{__version__}", + "lang": "en-US", + } + token = os.getenv("API_USGS_PAT") + if token: + headers["X-Api-Key"] = token + return headers + + +def _check_ogc_requests(endpoint: str, req_type: str = "queryables") -> dict[str, Any]: + """ + Sends an HTTP GET request to the specified OGC endpoint and request type, + returning the JSON response. + + Parameters + ---------- + endpoint : str + The OGC collection endpoint to query (e.g. the service/collection id). + req_type : str, optional + The type of request to make. Must be either "queryables" or "schema" + (default is "queryables"). + + Returns + ------- + dict + The JSON response from the OGC endpoint. + + Raises + ------ + ValueError + If req_type is not "queryables" or "schema". + DataRetrievalError + From :func:`_raise_for_non_200` on any non-200 (the typed subclass for + the status) — same typed contract as the main data path so callers can + use one ``except`` clause everywhere. 
+ """ + if req_type not in ("queryables", "schema"): + raise ValueError(f"req_type must be 'queryables' or 'schema', got {req_type!r}") + url = f"{_ogc_base_url_var.get()}/collections/{endpoint}/{req_type}" + resp = _get(url, headers=_default_headers(), **HTTPX_DEFAULTS) + _raise_for_non_200(resp) + # ``Response.json`` is typed ``Any``; the OGC queryables/schema endpoints + # return a JSON object, and callers index it as a dict. + return cast("dict[str, Any]", resp.json()) + + +def _error_body(resp: httpx.Response) -> str: + """ + Build an informative error message from an HTTP response. + + Parameters + ---------- + resp : httpx.Response + The HTTP response object to extract the error message from. + + Returns + ------- + str + An error message string assembled per status code: + + * **429** — predefined message describing the rate-limit and pointing + at the API-token path; the response body is not consulted. + * **403** — predefined message describing the most common cause + (query exceeding server limits); the response body is not + consulted. + * **other statuses** — attempts ``resp.json()`` and renders + ``"{status}: {code}. {description}."`` from the JSON error + envelope. If the body is not JSON (e.g. an HTML 502 from a + gateway), falls back to ``"{status}: {reason}. {snippet}"`` with + the first 200 characters of the stripped ``resp.text``; an empty + body degrades to ``"{status}: {reason}."``. + """ + status = resp.status_code + if status == 429: + return ( + "429: Too many requests made. Please obtain an API token " + "or try again later." + ) + elif status == 403: + return ( + "403: Query request denied. Possible reasons include " + "query exceeding server limits." + ) + try: + j_txt = resp.json() + except ValueError: + snippet = (resp.text or "").strip()[:200] + reason = resp.reason_phrase or "Error" + if snippet: + return f"{status}: {reason}. {snippet}" + return f"{status}: {reason}." + return ( + f"{status}: {j_txt.get('code', 'Unknown type')}. " + f"{j_txt.get('description', 'No description provided')}." 
+ ) + + +def _parse_retry_after(value: str | None) -> float | None: + """ + Parse a USGS ``Retry-After`` header into seconds. + + Parameters + ---------- + value : str or None + The raw header value, or ``None`` if absent. + + Returns + ------- + float or None + Non-negative delta-seconds, clamped at zero. ``None`` when the + header is absent or unparseable; ``ChunkedCall`` treats + ``None`` as "fall back to my own retry policy". + + Notes + ----- + USGS sends ``Retry-After`` as integer delta-seconds (empirically + verified — e.g. ``Retry-After: 2619``). The HTTP spec also allows + HTTP-date form, but USGS doesn't use it, so this function doesn't + bother parsing it. + """ + if not value: + return None + try: + return max(0.0, float(value.strip())) + except ValueError: + return None + + +def _raise_for_non_200(resp: httpx.Response) -> None: + """ + Raise a typed exception for any non-200 response. + + Routes through :func:`_error_body` (USGS-API-aware: handles + 429/403 specially, extracts ``code``/``description`` from JSON + error bodies) rather than ``Response.raise_for_status``, which + raises ``HTTPStatusError`` with a generic message. + + Parameters + ---------- + resp : httpx.Response + The HTTP response to inspect. + + Raises + ------ + DataRetrievalError + The typed subclass for the status (see + :func:`dataretrieval.exceptions.error_for_status` for the mapping). The + transient types (:class:`~dataretrieval.exceptions.TransientError`) are + distinguished so ``ChunkedCall`` can wrap them as a resumable + :class:`~dataretrieval.ogc.chunking.QuotaExhausted` / + :class:`~dataretrieval.ogc.chunking.ServiceInterrupted`; a fatal + :class:`~dataretrieval.exceptions.HTTPError` (not a ``TransientError``) + the chunker won't resume. 
+ """ + status = resp.status_code + if status < 400: + return + raise error_for_status( + status, + _error_body(resp), + retry_after=_parse_retry_after(resp.headers.get("Retry-After")), + ) + + +def _paginated_failure_message(pages_collected: int, cause: BaseException) -> str: + """ + Build a user-facing message for a mid-pagination failure. + + The API exposes no resume cursor, so the caller's only recovery is + to retry the whole call — the message lists the practical knobs, + tailored to whether the failure was rate-limit (429) or something + else. + + Parameters + ---------- + pages_collected : int + Number of pages successfully fetched before the failure. + cause : BaseException + The underlying exception that interrupted pagination. + + Returns + ------- + str + A message suitable for the ``DataRetrievalError`` that the + paginated fetch paths raise from the original exception. + """ + cause_str = str(cause).removesuffix(".") + # Some ``httpx`` exceptions (e.g. ``TimeoutException()`` with no args) + # stringify to empty; fall back to the class name so the + # returned message is always informative. + if not cause_str.strip(): + cause_str = type(cause).__name__ + if isinstance(cause, RateLimited): + action = "wait for the rate-limit window to reset and retry" + else: + action = "retry the request (possibly after a short backoff)" + return ( + f"Paginated request failed after collecting {pages_collected} " + f"page(s): {cause_str}. To recover: {action}, reduce the " + f"request size (e.g. fewer locations, a shorter time range, or " + f"a smaller ``limit``), or obtain an API token." + ) + + +def _ogc_query_params( + params: dict[str, Any], + *, + properties: list[str] | None, + bbox: list[float] | None, + limit: int | None, + skip_geometry: bool | None, +) -> dict[str, Any]: + """Add the shared OGC query knobs to ``params`` (mutated in place). 
+ + Factors out the ``skipGeometry``/``limit``/``bbox``/``properties`` block + common to every OGC request so the typed getters + (:func:`_construct_api_requests`) and the generalized CQL2 path + (:func:`_construct_cql_request`) build identical URL parameters. + + ``skip_geometry=None`` leaves ``skipGeometry`` unset (the server defaults to + including geometry); the typed getters always pass a bool, so their behavior + is unchanged. + """ + if skip_geometry is not None: + params["skipGeometry"] = skip_geometry + params["limit"] = 50000 if limit is None or limit > 50000 else limit + # `len()` instead of truthiness: a numpy ndarray would raise on `if bbox:`. + if bbox is not None and len(bbox) > 0: + params["bbox"] = ",".join(map(str, bbox)) + if properties: + params["properties"] = ",".join(properties) + return params + + +def _construct_api_requests( + service: str, + properties: list[str] | None = None, + bbox: list[float] | None = None, + limit: int | None = None, + skip_geometry: bool = False, + **kwargs: Any, +) -> httpx.Request: + """ + Constructs an HTTP request object for the specified water data API service. + + For most services, list parameters are comma-joined and sent as a single + GET request (e.g. ``parameter_code=["00060","00010"]`` becomes + ``parameter_code=00060,00010`` in the URL). For services the active dialect + flags as CQL2-only (``dialect.cql2_services``, e.g. the Water Data API's + ``monitoring-locations``), a POST request with CQL2 JSON is used instead. + + Parameters + ---------- + service : str + The name of the API service to query (e.g., "daily"). + properties : Optional[List[str]], optional + List of property names to include in the request. + bbox : Optional[List[float]], optional + Bounding box coordinates as a list of floats. + limit : Optional[int], optional + Maximum number of results to return per request. + skip_geometry : bool, optional + Whether to exclude geometry from the response (default is False). 
+ **kwargs + Additional query parameters, including date/time filters and other + API-specific options. + + Returns + ------- + httpx.Request + The constructed HTTP request object ready to be sent. + + Notes + ----- + - Date/time parameters are automatically formatted to ISO8601. + """ + service_url = f"{_ogc_base_url_var.get()}/collections/{service}/items" + dialect = _dialect_var.get() + + # Format date/time parameters to ISO8601 first — both routing paths need it. + for key in _DATE_RANGE_PARAMS: + if key in kwargs: + kwargs[key] = _format_api_dates( + kwargs[key], + date=(service in dialect.date_only_services and key != "last_modified"), + ) + + if service in dialect.cql2_services: + # POST with CQL2 JSON: multi-value params go in the request body. + # The date-range loop above has already collapsed any _DATE_RANGE_PARAMS + # value to a string, so the list/tuple check below cannot match them. + post_params = { + k: v + for k, v in kwargs.items() + if isinstance(v, (list, tuple)) and len(v) > 1 + } + params = {k: v for k, v in kwargs.items() if k not in post_params} + else: + # GET with comma-separated values: join list/tuple values into one string. + # Skip empty lists/tuples so they're omitted rather than emitted as a + # filterless, empty-valued ``param=`` (which the server reads as + # "match empty"). + post_params = {} + params = { + k: ",".join(str(x) for x in v) if isinstance(v, (list, tuple)) else v + for k, v in kwargs.items() + if not (isinstance(v, (list, tuple)) and len(v) == 0) + } + + _ogc_query_params( + params, + properties=properties, + bbox=bbox, + limit=limit, + skip_geometry=skip_geometry, + ) + + # Translate CQL filter Python names to the hyphenated URL parameter that + # the OGC API expects. The Python kwarg is `filter_lang` because hyphens + # aren't valid in Python identifiers. 
+ if "filter_lang" in params: + params["filter-lang"] = params.pop("filter_lang") + + headers = _default_headers() + + if post_params: + headers["Content-Type"] = "application/query-cql-json" + return httpx.Request( + method="POST", + url=service_url, + headers=headers, + content=_cql2_param(post_params), + params=params, + ) + return httpx.Request( + method="GET", + url=service_url, + headers=headers, + params=params, + ) + + +def _construct_cql_request( + service: str, + cql_body: str, + *, + properties: list[str] | None = None, + bbox: list[float] | None = None, + limit: int | None = None, + skip_geometry: bool | None = None, +) -> httpx.Request: + """Build a POST/CQL2 request from a verbatim CQL2 body. + + The OGC-API counterpart to :func:`_construct_api_requests` for the + generalized :func:`~dataretrieval.waterdata.api.get_cql` path: the + caller supplies an already-serialized CQL2 JSON document (any predicate the + grammar allows), sent unchanged as the request body, while + ``properties``/``bbox``/``limit``/``skip_geometry`` go on the URL via the + shared :func:`_ogc_query_params` — so a generalized query and an equivalent + typed getter produce the same URL parameters. + + Parameters + ---------- + service : str + OGC collection name (e.g. ``"daily"``). + cql_body : str + Serialized CQL2 JSON document, sent as the POST body verbatim. + properties, bbox, limit, skip_geometry + See :func:`_ogc_query_params`. ``properties`` are wire-format + (``id``-translated) names. + + Returns + ------- + httpx.Request + A POST request with ``Content-Type: application/query-cql-json``. 
+ """ + service_url = f"{_ogc_base_url_var.get()}/collections/{service}/items" + params = _ogc_query_params( + {}, + properties=properties, + bbox=bbox, + limit=limit, + skip_geometry=skip_geometry, + ) + headers = _default_headers() + headers["Content-Type"] = "application/query-cql-json" + return httpx.Request( + method="POST", + url=service_url, + headers=headers, + content=cql_body, + params=params, + ) + + +def _next_req_url( + resp: httpx.Response, *, body: dict[str, Any] | None = None +) -> str | None: + """ + Extracts the URL for the next page of results from an HTTP response from a + water data endpoint. + + Parameters + ---------- + resp : httpx.Response + The HTTP response object containing JSON data and headers. + body : dict, optional + Pre-parsed JSON body for ``resp``. When provided, skips the + ``resp.json()`` call — useful when the caller has already + decoded the body for its own use (avoids a second parse pass). + + Returns + ------- + Optional[str] + The URL for the next page of results if available, otherwise None. + + Notes + ----- + - Returns None when the response carries no features. + - Expects the response JSON to contain a "links" list with objects having + "rel" and "href" keys. + - Checks for the "next" relation in the "links" to determine the next URL. + """ + if body is None: + body = resp.json() + # Stop paging when the response carries no features. Key off ``features`` + # rather than ``numberReturned``: the main Water Data API reports + # ``numberReturned`` but the NGWMN OGC API omits it, so trusting it would + # refuse to follow a ``next`` link on a page that actually carries + # features (mirrors the same guard in :func:`_get_resp_data`). 
+ if not (body.get("features") or []): + return None + for link in body.get("links", []): + if link.get("rel") != "next": + continue + href = link.get("href") + if not href: + return None + # Refuse to follow a next-page link to a different host — + # the request's headers/auth were minted for the original + # host and shouldn't leak to whatever a poisoned response + # body might supply. Guarded against mock-shaped ``resp.url`` + # attributes (tests sometimes set strings or ``MagicMock``) + # by falling open when host extraction isn't reliable. + next_host: str | None + cur_host: str | None + try: + next_host = httpx.URL(href).host + resp_url = ( + resp.url + if isinstance(resp.url, httpx.URL) + else httpx.URL(str(resp.url)) + ) + cur_host = resp_url.host + except (httpx.InvalidURL, TypeError): + next_host = cur_host = None + if next_host and cur_host and next_host != cur_host: + raise RuntimeError( + f"Refusing to follow cross-host next-page URL: " + f"{next_host} != {cur_host}" + ) + # ``href`` comes from the JSON ``links`` array (typed ``Any``); the + # ``not href`` guard above already excluded empty/None, and it is a + # URL string (passed to ``httpx.URL`` above). + return cast("str", href) + return None + + +def _empty_feature_frame(geopd: bool) -> pd.DataFrame: + """Empty result frame for a page that carries no features. + + Returns a ``GeoDataFrame`` when geopandas is available so a downstream + ``pd.concat([empty_page, geo_page])`` doesn't downgrade a geopandas + user's result to a plain ``DataFrame`` (stripping geometry/CRS). The + single home for this empty-page contract, shared by the feature-frame + builders that flatten GeoJSON pages. + """ + return gpd.GeoDataFrame() if geopd else pd.DataFrame() + + +def _attach_coordinates(df: pd.DataFrame, features: list[dict[str, Any]]) -> None: + """Attach a ``geometry`` column of raw coordinate lists (in place) when + any feature carries geometry. Shared by the non-geopandas GeoJSON + feature-frame builders. 
+ """ + geoms = [(f.get("geometry") or {}).get("coordinates") for f in features] + if any(g is not None for g in geoms): + df["geometry"] = geoms + + +def _get_resp_data( + resp: httpx.Response, + geopd: bool, + *, + body: dict[str, Any] | None = None, +) -> pd.DataFrame: + """ + Extracts and normalizes data from an HTTP response containing GeoJSON features. + + Parameters + ---------- + resp : httpx.Response + The HTTP response object expected to contain a JSON body + with a "features" key. + geopd : bool + Indicates whether geopandas is installed and should be used to + handle geometries. + body : dict, optional + Pre-parsed JSON body for ``resp``. When provided, skips the + ``resp.json()`` call — useful when the caller has already + decoded the body for its own use (avoids a second parse pass). + + Returns + ------- + gpd.GeoDataFrame or pd.DataFrame + A ``GeoDataFrame`` when ``geopd`` is True; otherwise a plain + ``DataFrame`` carrying the feature properties plus an ``id`` + column (always present, possibly all-None) and a ``geometry`` + column (coordinates list) when at least one feature includes + geometry. Returns an empty ``DataFrame`` when no features are + returned. + + Notes + ----- + The non-geopandas branch builds the frame directly from each + feature's ``properties`` dict, plus the top-level ``id`` and + ``geometry.coordinates`` columns — the ``id`` column is always + added (so the downstream rename to the service-specific output id + works even on an all-None id), while the ``geometry`` column is + added only when at least one feature carries geometry. This skips + the GeoJSON envelope entirely, so + newly-added Feature-level fields (e.g. ``geometry.type`` after + USGS migrated to full GeoJSON geometry objects) can't leak into + the result frame; no reactive drop-list needs maintenance every + time the upstream schema grows. 
+ """ + if body is None: + body = resp.json() + # Key the empty-result short-circuit off ``features`` rather than + # ``numberReturned``: the main Water Data API reports ``numberReturned``, + # but the NGWMN OGC API omits it, so trusting it would discard pages that + # actually carry features. An absent/empty ``features`` is also the real + # schema-drift shape (a 200 with no features) — treat it as empty rather + # than crash with a ``KeyError`` downstream, which ``_paginate`` would + # mistake for a transient transport error. ``_empty_feature_frame`` + # preserves the GeoDataFrame type on the short-circuit (see its docstring). + features = body.get("features") or [] + if not features: + return _empty_feature_frame(geopd) + + if not geopd: + df = pd.json_normalize([f.get("properties") or {} for f in features], sep="_") + # Always materialize the ``id`` column (may be all-None) so + # ``_arrange_cols``'s ``df.rename(columns={"id": output_id})`` + # produces the documented service-specific output_id column + # (daily_id, channel_measurements_id, …) even if the upstream + # response carried no feature-level id. + df["id"] = [f.get("id") for f in features] + _attach_coordinates(df, features) + return df + + # Organize json into geodataframe and make sure id column comes along. + # NGWMN observation collections (water levels, lithology, …) return + # features with no ``geometry`` key at all, which + # ``GeoDataFrame.from_features`` can't handle (it indexes + # ``feature["geometry"]`` directly). Default the key to ``None`` for only + # those features so the call is safe; the all-null check below then yields + # a plain DataFrame. Features that already carry geometry (the common + # sites case) are passed through without a per-feature dict copy. 
@asynccontextmanager
async def _client_for(
    client: httpx.AsyncClient | None,
) -> AsyncIterator[httpx.AsyncClient]:
    """
    Yield the async client a request should use.

    The client is chosen in this order:

    1. ``client``, when the caller passed one. It is only borrowed and is
       never closed here.
    2. Whatever :func:`get_active_client` returns, when that is not
       ``None`` (the chunker's shared client during a chunked run). Also
       only borrowed.
    3. A new ``httpx.AsyncClient`` built from ``HTTPX_DEFAULTS``, which is
       closed when the context exits.

    Parameters
    ----------
    client : httpx.AsyncClient or None
        Caller-owned client to borrow, or ``None`` to fall back to the
        shared client or a temporary one.

    Yields
    ------
    httpx.AsyncClient
        The selected client.
    """
    # ``get_active_client`` is consulted only when no client was supplied.
    borrowed = client if client is not None else get_active_client()
    if borrowed is None:
        async with httpx.AsyncClient(**HTTPX_DEFAULTS) as owned:
            yield owned
    else:
        yield borrowed
@contextmanager
def _row_cap(max_rows: int | None) -> Iterator[None]:
    """Bound the rows accumulated by any :func:`_paginate` run in this block.

    Stores ``max_rows`` in ``_row_cap_var`` for the duration of the ``with``
    body (``None`` = uncapped) and restores the previous value on exit, even
    when the body raises. Used by :func:`get_reference_table` to preview
    large tables without downloading every page.
    """
    previous = _row_cap_var.set(max_rows)
    try:
        yield
    finally:
        _row_cap_var.reset(previous)
@contextmanager
def _ogc_base_url(base_url: str) -> Iterator[None]:
    """Make ``base_url`` the active OGC base URL inside the ``with`` block.

    Stores the value in ``_ogc_base_url_var`` — read by
    :func:`_construct_api_requests` and the chunk planner that calls it —
    and restores the previous value on exit, even when the body raises.
    Used by :func:`get_ogc_data` to serve NGWMN collections from their own
    OGC base.
    """
    previous = _ogc_base_url_var.set(base_url)
    try:
        yield
    finally:
        _ogc_base_url_var.reset(previous)
@contextmanager
def _dialect(dialect: OgcDialect) -> Iterator[None]:
    """Install ``dialect`` as the active :class:`OgcDialect` for the block.

    The value is stored in ``_dialect_var``, which
    :func:`_construct_api_requests` reads for CQL2-vs-GET routing and
    date-only formatting. The previous value is restored on exit, even when
    the body raises.
    """
    previous = _dialect_var.set(dialect)
    try:
        yield
    finally:
        _dialect_var.reset(previous)
``None`` (default) means use the + chunker's shared client (if inside a chunked call) or open + a temporary one. + + Returns + ------- + df : pandas.DataFrame + Concatenation of every page's parsed frame. + response : httpx.Response + A shallow copy of the first-page response, with ``.headers`` + rebuilt as a fresh ``httpx.Headers`` reflecting the last page and + ``.elapsed`` set to cumulative wall-clock. The canonical URL is + preserved from the first page. The original first-page response + is not mutated. + + Raises + ------ + DataRetrievalError + On a non-200 initial response, the typed subclass for the status from + :func:`_raise_for_non_200` (a + :class:`~dataretrieval.exceptions.TransientError` for a retryable + 429 / 5xx, otherwise a fatal :class:`~dataretrieval.exceptions.HTTPError`); + or, on an initial-page parse failure or any subsequent-page failure, a + base ``DataRetrievalError`` wrapping the cause (built by + :func:`_paginated_failure_message`, original exception on ``__cause__``). + httpx.HTTPError + Network-level failures on the *initial* request (e.g. + ``ConnectError``, ``TimeoutException``) propagate unmodified + so callers can branch on the specific type; equivalent + failures on subsequent pages are wrapped per above. + """ + logger.debug("Requesting: %s", initial_req.url) + reporter = _progress.current() + async with _client_for(client) as sess: + resp = await sess.send(initial_req) + _raise_for_non_200(resp) + initial_response = resp + total_elapsed = _safe_elapsed(resp) + + try: + df, cursor = parse_response(resp) + except Exception as e: # noqa: BLE001 + # Initial-page parse failures (malformed JSON, missing + # ``features``, schema drift) get the same wrapped-message + # treatment as follow-up failures so callers see a consistent + # diagnostic regardless of which page broke. 
def _ogc_parse_response(
    resp: httpx.Response, *, geopd: bool
) -> tuple[pd.DataFrame, str | None]:
    """Turn one OGC API page into ``(frame, next_page_url)``.

    This is the parse strategy :func:`_walk_pages` passes to
    :func:`_paginate`. The JSON body is decoded once and shared by both
    helpers. Any falsy cursor (for example an empty href) is collapsed to
    ``None`` so the pagination loop's ``while cursor is not None`` ends
    rather than spinning on a meaningless value.
    """
    payload = resp.json()
    frame = _get_resp_data(resp, geopd=geopd, body=payload)
    next_url = _next_req_url(resp, body=payload)
    return frame, (next_url or None)
+ headers = req.headers + content = req.content if method == "POST" else None + + async def follow_up(cursor: str, sess: httpx.AsyncClient) -> httpx.Response: + return await sess.request(method, cursor, headers=headers, content=content) + + return await _paginate( + req, + parse_response=functools.partial(_ogc_parse_response, geopd=geopd), + follow_up=follow_up, + client=client, + ) + + +def _deal_with_empty( + return_list: pd.DataFrame, properties: list[str] | None, service: str +) -> pd.DataFrame: + """ + Handles empty DataFrame results by returning a DataFrame with appropriate columns. + + If `return_list` is empty, determines the column names to use: + - If `properties` is not provided or contains only NaN values, + retrieves schema properties from the specified service. + - Otherwise, uses the provided `properties` list as column names. + + Parameters + ---------- + return_list : pd.DataFrame + The DataFrame to check for emptiness. + properties : Optional[List[str]] + List of property names to use as columns, or None. + service : str + The service endpoint to query for schema properties if needed. + + Returns + ------- + pd.DataFrame + The original DataFrame if not empty, otherwise an empty + DataFrame with the appropriate columns. + """ + if return_list.empty: + if not properties or all(pd.isna(properties)): + schema = _check_ogc_requests(endpoint=service, req_type="schema") + properties = list(schema.get("properties", {}).keys()) + return pd.DataFrame(columns=properties) + return return_list + + +def _arrange_cols( + df: pd.DataFrame, + properties: list[str] | None, + output_id: str, + extra_id_cols: frozenset[str] | set[str] = frozenset(), +) -> pd.DataFrame: + """ + Rearranges and renames columns in a DataFrame based on provided + properties and the service output id. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame whose columns are to be rearranged or renamed. 
+ properties : Optional[List[str]] + A list of column names to possibly rename. If None or contains + only NaN, the function renames 'id' to output_id. + output_id : str + The name to which the 'id' column should be renamed if applicable. + extra_id_cols : set or frozenset, optional + Synthetic, meaningless-to-user id columns to move to the end of the + result frame when the wire ``id`` is returned (i.e. ``properties`` was + not specified). Defaults to an empty set (no reordering). + + Returns + ------- + pd.DataFrame or gpd.GeoDataFrame + The DataFrame with columns rearranged and/or renamed according + to the specified properties and output_id. + """ + + # Rename id column to output_id + df = df.rename(columns={"id": output_id}) + + if properties and not all(pd.isna(properties)): + # Don't alias the caller's list — we mutate below. + local_properties = list(properties) + if "geometry" in df.columns and "geometry" not in local_properties: + local_properties.append("geometry") + # 'id' is a valid service column, but expose it under the + # service-specific output_id name instead. + if "id" in local_properties: + local_properties[local_properties.index("id")] = output_id + df = df.loc[:, [col for col in local_properties if col in df.columns]] + + # Move meaningless-to-user, extra id columns to the end + # of the dataframe, if they exist + extra_id_col = set(df.columns).intersection(extra_id_cols) + + # If the arbitrary id column is returned (either due to properties + # being none or NaN), then move it to the end of the dataframe, but + # if part of properties, keep in requested order + if extra_id_col and (properties is None or all(pd.isna(properties))): + id_col_order = [col for col in df.columns if col not in extra_id_col] + list( + extra_id_col + ) + df = df.loc[:, id_col_order] + + return df + + +def _type_cols(df: pd.DataFrame, dialect: OgcDialect) -> pd.DataFrame: + """ + Casts columns into appropriate types per the API ``dialect``. 
+ + Parameters + ---------- + df : pd.DataFrame + The input DataFrame. + dialect : OgcDialect + Supplies ``time_cols`` / ``numerical_cols`` — which columns to + coerce to datetime/numeric. The engine itself holds no + API-specific column knowledge. + + Returns + ------- + pd.DataFrame + The DataFrame with columns cast to appropriate types. + + """ + cols = set(df.columns) + for col in cols.intersection(dialect.time_cols): + df[col] = pd.to_datetime(df[col], errors="coerce") + + for col in cols.intersection(dialect.numerical_cols): + df[col] = pd.to_numeric(df[col], errors="coerce") + + return df + + +def _sort_rows(df: pd.DataFrame, dialect: OgcDialect) -> pd.DataFrame: + """ + Sorts rows by the API ``dialect``'s ``sort_cols`` (in priority order). + + Sorting is applied only when the primary (first) sort column is + present; any later sort columns also present become secondary keys. + This mirrors the historical Water Data behavior (sort by ``time``, + then ``monitoring_location_id``) while letting other APIs key off + their own columns (e.g. NGWMN's ``sample_time``). + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame. + dialect : OgcDialect + Supplies ``sort_cols``. + + Returns + ------- + pd.DataFrame + The DataFrame with rows ordered per the dialect. + + """ + if not dialect.sort_cols or dialect.sort_cols[0] not in df.columns: + return df + present = [c for c in dialect.sort_cols if c in df.columns] + return df.sort_values(by=present, ignore_index=True) + + +# Matches a lowercase letter or digit immediately followed by an uppercase +# letter — the camelCase/PascalCase word boundary where a ``_`` is inserted. +# A letter/digit boundary is intentionally NOT split (so ``navd88`` stays put). +_CAMEL_BOUNDARY_RE = re.compile(r"([a-z0-9])([A-Z])") + + +def _to_snake_case(name: str) -> str: + """Convert a camelCase/PascalCase column name to snake_case. 
def _finalize_ogc(
    frame: pd.DataFrame,
    response: httpx.Response,
    *,
    properties: list[str] | None,
    output_id: str,
    convert_type: bool,
    service: str,
    max_rows: int | None = None,
    extra_id_cols: frozenset[str] | set[str] = frozenset(),
    dialect: OgcDialect = _DEFAULT_DIALECT,
) -> tuple[pd.DataFrame, BaseMetadata]:
    """Shape a combined OGC result into the user-facing ``(df, md)``.

    Steps, in order:

    1. :func:`_deal_with_empty` normalizes an empty frame.
    2. String column names are converted to snake_case with
       :func:`_to_snake_case`.
    3. When ``convert_type`` is true, :func:`_type_cols` coerces types.
    4. :func:`_arrange_cols` renames the wire ``id`` and orders columns.
    5. :func:`_sort_rows` orders rows.
    6. When ``max_rows`` is not ``None`` the frame is cut to its first
       ``max_rows`` rows.
    7. ``response`` is wrapped as
       :class:`~dataretrieval.utils.BaseMetadata`.

    The function is injected into the chunker as its ``finalize`` hook (see
    :data:`~dataretrieval.ogc.chunking._Finalize`) so that the normal return
    and a resumed ``ChunkInterrupted.call.resume()`` both produce this
    post-processed shape rather than the chunker's raw frame and bare
    ``httpx.Response``.

    ``max_rows`` is applied here, after sorting and on the *combined* frame,
    so that a chunked or resumed call is bounded to exactly ``max_rows``;
    the per-``_paginate`` ``_row_cap`` is only an early-stop download bound.
    """
    shaped = _deal_with_empty(frame, properties, service)

    # Snake-case the names before anything else: the dialect's
    # ``time_cols``/``numerical_cols``/``sort_cols`` are all snake_case, so
    # this is what lets type coercion and sorting match a camelCase API.
    # Non-string labels are left alone.
    snake_names = {}
    for col in shaped.columns:
        if not isinstance(col, str):
            continue
        snake = _to_snake_case(col)
        if snake != col:
            snake_names[col] = snake
    if snake_names:
        shaped = shaped.rename(columns=snake_names)

    if convert_type:
        shaped = _type_cols(shaped, dialect)
    shaped = _arrange_cols(shaped, properties, output_id, extra_id_cols)
    shaped = _sort_rows(shaped, dialect)
    if max_rows is not None:
        shaped = shaped.head(max_rows)
    return shaped, BaseMetadata(response)
``None`` (default) + fetches the full result. Intended for cheap previews of large, + un-chunked tables (e.g. :func:`get_reference_table`). + base_url : str, optional + OGC API base URL to target. Defaults to the main Water Data API. + extra_id_cols : set or frozenset, optional + Synthetic id columns to push to the end of a result frame (see + :func:`_arrange_cols`). Defaults to an empty set. + dialect : OgcDialect, optional + Per-API request quirks (CQL2-only services, date-only services). + Defaults to a plain OGC API with neither. + + Returns + ------- + pd.DataFrame or gpd.GeoDataFrame + A DataFrame containing the retrieved and processed OGC data. + BaseMetadata + A metadata object containing request information including URL and query time. + + Notes + ----- + - The function does not mutate the input `args` dictionary. + - Handles optional arguments such as `convert_type`. + - Applies column cleanup and reordering based on service and properties. + """ + # Enforce a genuine positive integer: a float (even ``10.0``) or ``bool`` + # would pass a bare ``< 1`` check and then crash deep in + # ``pd.DataFrame.head`` with an opaque ``TypeError`` after HTTP I/O has + # already fired. ``numbers.Integral`` (not ``int``) so numpy integers — + # e.g. ``max_rows`` derived from a numpy/pandas computation — are accepted; + # ``bool`` is an ``Integral`` subtype, so exclude it explicitly. + if max_rows is not None and ( + not isinstance(max_rows, numbers.Integral) + or isinstance(max_rows, bool) + or max_rows < 1 + ): + raise ValueError(f"max_rows must be a positive integer (got {max_rows!r}).") + + if dialect is None: + dialect = _DEFAULT_DIALECT + + args = args.copy() + args["service"] = service + args = _switch_arg_id(args, id_name=output_id, service=service) + # Capture `properties` before the id-switch so post-processing sees + # the user-facing names, not the wire-format ones. 
+ properties = args.get("properties") + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) + convert_type = args.pop("convert_type", False) + args = {k: v for k, v in args.items() if v is not None} + + # Post-processing is injected into the chunker rather than applied here, + # so it runs on *every* exit: the normal return AND a later + # ``exc.call.resume()`` after a ChunkInterrupted (which never re-enters + # this function). ``_finalize_ogc`` is the single source of result shape; + # it also applies ``max_rows`` to the *combined* frame so the cap is the + # exact total even when the plan chunks or the call is resumed, while + # ``_row_cap`` below only early-stops each sub-request's pagination. + finalize = functools.partial( + _finalize_ogc, + properties=properties, + output_id=output_id, + convert_type=convert_type, + service=service, + max_rows=max_rows, + extra_id_cols=extra_id_cols, + dialect=dialect, + ) + with _progress.progress_context(service=service), _row_cap(max_rows): + with _ogc_base_url(base_url), _dialect(dialect): + return _fetch_once(args, finalize=finalize) + + +@chunking.multi_value_chunked(build_request=_construct_api_requests) +async def _fetch_once( + args: dict[str, Any], +) -> tuple[pd.DataFrame, httpx.Response]: + """Send one prepared-args OGC request asynchronously; return the + frame + response. + + ``@chunking.multi_value_chunked`` models every multi-value list + parameter and the cql-text filter as a chunkable axis, greedy-halves + the biggest chunk across all axes until each sub-request URL fits, + and iterates the cartesian product. With no chunkable inputs the + decorator passes args through unchanged. The decorator gathers every + sub-request over one shared :class:`httpx.AsyncClient` (concurrency + bounded by a semaphore, sized from ``API_USGS_CONCURRENT``) + and returns a *synchronous* wrapper, so ``get_ogc_data`` keeps calling + ``_fetch_once(args, finalize=...)`` synchronously. 
def _run_sync(
    make_coro: Callable[[], Awaitable[tuple[pd.DataFrame, httpx.Response]]],
    *,
    service: str,
) -> tuple[pd.DataFrame, httpx.Response]:
    """Run an async OGC fetch to completion from synchronous code.

    Enters the progress context for ``service`` and then a short-lived
    ``anyio`` blocking portal (a worker thread), and calls ``make_coro``
    through that portal. This lets the non-chunked getters work whether or
    not the caller is already inside an event loop (Jupyter/async apps).
    The portal copies the calling context, so the active progress reporter
    still reaches the sub-requests.

    An ``httpx.TransportError`` escaping the call is re-raised as the error
    built by :func:`_network_error` for the currently active OGC base URL,
    with the original exception chained as the cause.

    Shared by the non-chunked fetch paths; the chunked OGC getters drive
    their own portal inside :meth:`chunking.ChunkedCall.resume`.
    """
    progress = _progress.progress_context(service=service)
    with progress, start_blocking_portal() as portal:
        try:
            outcome = portal.call(make_coro)
        except httpx.TransportError as exc:
            # This is the initial-request connection failure that
            # ``_paginate`` lets through raw; mid-pagination failures are
            # already typed. Report the base URL actually targeted
            # (NGWMN/sibling APIs set their own via ``_ogc_base_url``),
            # not a hardcoded host.
            raise _network_error(_ogc_base_url_var.get(), exc) from exc
        return outcome
the Water Data API's ``water_year``, +# ``thresholds``) pass their own superset. +_NO_NORMALIZE_PARAMS = _DATE_RANGE_PARAMS | {"bbox"} + + +def _normalize_str_iterable( + value: str | Iterable[str] | None, + param_name: str = "value", +) -> str | list[str] | None: + """Validate that ``value`` is None, a string, or an iterable of strings. + + Non-string iterables (``list``, ``tuple``, ``pandas.Series``, + ``pandas.Index``, ``numpy.ndarray``, generators) are materialized to a + ``list`` so downstream code that branches on ``isinstance(v, (list, + tuple))`` keeps working. ``Mapping`` types are rejected because + iterating a mapping yields keys, not values. + + Parameters + ---------- + value : None, str, or iterable of str + param_name : str, optional + Used in error messages. Defaults to ``"value"``. + + Returns + ------- + None, str, or list of str + + Raises + ------ + TypeError + If the input isn't ``None``, ``str``, or a non-``Mapping`` + iterable; or if any iterable element isn't a string. + """ + if value is None: + return None + if isinstance(value, str): + return value + if isinstance(value, Mapping) or not isinstance(value, Iterable): + raise TypeError( + f"{param_name} must be a string or iterable of strings, " + f"not {type(value).__name__} (got {value!r})." + ) + values: list[str] = [] + for v in value: + if not isinstance(v, str): + raise TypeError( + f"{param_name} elements must be strings, " + f"not {type(v).__name__} (got {v!r})." + ) + values.append(v) + return values + + +def _as_str_list( + value: str | Iterable[str] | None, + param_name: str = "value", +) -> list[str] | None: + """Normalize ``value`` to ``list[str]`` (``None`` passes through). + + Wraps a bare ``str`` in a single-element list — so a later + ``",".join(...)`` doesn't iterate it character-by-character — and + materializes any other iterable via :func:`_normalize_str_iterable`. 
def _check_monitoring_location_id(
    monitoring_location_id: str | Iterable[str] | None,
) -> str | list[str] | None:
    """Validate and normalize a ``monitoring_location_id`` value.

    The value is first normalized by :func:`_normalize_str_iterable`; each
    resulting string is then passed to :func:`_check_id_format`, which
    enforces the AGENCY-ID format (e.g. ``USGS-01646500``).

    Parameters
    ----------
    monitoring_location_id : None, str, or iterable of str
        See :func:`_normalize_str_iterable`.

    Returns
    -------
    None, str, or list of str
        ``None`` and a single string are returned as given; any other
        iterable comes back as a list.

    Raises
    ------
    TypeError
        If the input isn't ``None``, ``str``, or a non-``Mapping``
        iterable; or if any iterable element isn't a string. The message
        carries an added AGENCY-ID hint.
    ValueError
        If any identifier is rejected by :func:`_check_id_format`.
    """
    try:
        normalized = _normalize_str_iterable(
            monitoring_location_id, "monitoring_location_id"
        )
    except TypeError as exc:
        # The generic helper's message lacks the AGENCY-ID hint; add it and
        # drop the chained context.
        hinted = f"{exc} Expected 'AGENCY-ID' format, e.g., 'USGS-01646500'."
        raise TypeError(hinted) from None
    if normalized is None:
        return None
    # A bare string is a single identifier, not a sequence of characters.
    candidates = [normalized] if isinstance(normalized, str) else normalized
    for candidate in candidates:
        _check_id_format(candidate)
    return normalized
+ ) + + +def _get_args( + local_vars: dict[str, Any], + exclude: set[str] | None = None, + *, + no_normalize: frozenset[str] | set[str] = _NO_NORMALIZE_PARAMS, +) -> dict[str, Any]: + """ + Build the API-request kwargs dict from a getter's ``locals()``. + + Drops bookkeeping keys (``service``, ``output_id``, anything in + ``exclude``) and ``None``-valued kwargs, then normalizes the + remaining values: + + - ``monitoring_location_id`` is validated against the AGENCY-ID + format (per :func:`_check_monitoring_location_id`). + - ``properties`` is materialized to ``list[str]`` (a bare string + gets wrapped in a single-element list so downstream + ``",".join(properties)`` doesn't iterate per character). + - A non-string iterable in ``no_normalize`` (numeric params + such as ``water_year``, ``bbox``, ``thresholds``) is materialized + to a ``list`` with its element types preserved (no string + normalization), so the GET comma-join and the chunker — which test + ``list``/``tuple`` — handle it instead of ``str()``-ing the whole + array. + - Any other ``Iterable[str]`` (i.e. not in ``no_normalize``) + is materialized to ``list[str]`` via + :func:`_normalize_str_iterable` so downstream code that branches + on ``isinstance(v, (list, tuple))`` works for ``pandas.Series``, + ``numpy.ndarray``, generators, etc. + - Scalars and strings pass through unchanged. + + Parameters + ---------- + local_vars : dict[str, Any] + Dictionary of local variables, typically from ``locals()``. + exclude : set[str], optional + Additional keys to exclude from the resulting dictionary. + no_normalize : set[str], optional + Iterable-shaped params whose element types must be preserved + (no string normalization). Defaults to the generic date-range + + ``bbox`` set; callers with extra numeric params pass a superset. + + Returns + ------- + dict[str, Any] + Filtered and normalized arguments for API requests. 
+ """ + to_exclude = {"service", "output_id"} + if exclude: + to_exclude.update(exclude) + + args: dict[str, Any] = {} + for k, v in local_vars.items(): + if k in to_exclude or v is None: + continue + if k == "monitoring_location_id": + args[k] = _check_monitoring_location_id(v) + elif k == "properties": + args[k] = _as_str_list(v, k) + elif k in no_normalize and isinstance(v, Iterable) and not isinstance(v, str): + # Numeric params (water_year, bbox, thresholds, …) keep their + # element types — no string-normalization — but a non-string + # iterable (numpy array, pandas Series, generator) is materialized + # to a list so the GET comma-join and the chunker, which test + # ``list``/``tuple``, handle it instead of str()-ing the whole + # array. ``.tolist()`` yields native int/float; ``list()`` covers + # generators and other iterables. Scalars/strings fall through. + args[k] = v.tolist() if hasattr(v, "tolist") else list(v) + elif isinstance(v, str) or not isinstance(v, Iterable): + args[k] = v + else: + args[k] = _normalize_str_iterable(v, k) + return args diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/ogc/filters.py similarity index 88% rename from dataretrieval/waterdata/filters.py rename to dataretrieval/ogc/filters.py index 5e1c0a67..8ec55bf0 100644 --- a/dataretrieval/waterdata/filters.py +++ b/dataretrieval/ogc/filters.py @@ -1,4 +1,4 @@ -"""CQL ``filter`` support for the Water Data OGC getters. +"""CQL ``filter`` support for the OGC getters. Public: @@ -8,6 +8,8 @@ planner: ``_split_top_level_or`` (clause partitioning), ``_is_chunkable`` (filter-language gate), and ``_check_numeric_filter_pitfall`` (the lexicographic-comparison guard). +``_quote_cql_str`` escapes a single CQL-text string literal, shared by any +getter that *builds* a CQL filter (e.g. ``waterdata.ratings``). 
Other CQL shapes (``AND``, ``NOT``, ``LIKE``, spatial/temporal predicates, function calls) are forwarded verbatim — only top-level @@ -49,6 +51,18 @@ _QUOTED_STR_RE = re.compile(r"'[^']*'") +def _quote_cql_str(value: str) -> str: + """Escape a single-quoted CQL2-text literal by doubling embedded quotes. + + CQL2 text escapes a ``'`` inside a string literal by doubling it, so + ``O'Brien`` becomes ``O''Brien`` (wrap the result in ``'…'`` at the call + site). Defends against malformed filters / injection on arbitrary user + input. Shared by every getter that builds a CQL-text literal (e.g. the + STAC ``/search`` filter in ``waterdata.ratings``). + """ + return value.replace("'", "''") + + def _split_top_level_or(expr: str) -> list[str]: """Split ``expr`` at each top-level ``OR``, respecting quotes and parens. diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/ogc/progress.py similarity index 96% rename from dataretrieval/waterdata/_progress.py rename to dataretrieval/ogc/progress.py index 0e4963cd..6177c30f 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/ogc/progress.py @@ -1,16 +1,13 @@ -"""A single self-updating status line for paginated / chunked Water Data queries. +"""A single self-updating status line for paginated / chunked OGC queries. -Water Data getters fan out two ways the caller can't see: large multi-value +OGC getters fan out two ways the caller can't see: large multi-value requests are split into URL-length-safe *chunks* (``chunking`` module), and each request follows ``next`` links across an unknown number of *pages* -(``utils._paginate``). This module surfaces that work as one line on stderr, +(``engine._paginate``). This module surfaces that work as one line on stderr, rewritten in place as data arrives:: Retrieving: daily · 6 pages · 2,881 rows · 995/1,000 requests remaining -It replaces the per-page ``logger.info`` calls that previously narrated the same -events one line at a time. 
- The active reporter lives in a :class:`~contextvars.ContextVar` rather than being threaded through every signature: progress is a cross-cutting concern that the chunk orchestrator (outer, chunk counts) and the page-walking loop (inner, @@ -47,7 +44,7 @@ def _group_int(value: str) -> str: # state. (It does not give concurrent queries sharing one stderr separate # lines — they would still interleave.) _active: contextvars.ContextVar[ProgressReporter | None] = contextvars.ContextVar( - "waterdata_progress", default=None + "ogc_progress", default=None ) # Where to register for an API key. Surfaced once when a query runs without an diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 9b5ca610..7d3fce45 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -9,6 +9,8 @@ from __future__ import annotations +from dataretrieval.ogc.filters import FILTER_LANG + # Public API exports from .api import ( get_channel, @@ -30,7 +32,6 @@ get_stats_por, get_time_series_metadata, ) -from .filters import FILTER_LANG from .nearest import get_nearest_continuous from .ratings import get_ratings from .types import ( diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 336aa09b..b47ea5d7 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -16,6 +16,7 @@ import httpx import pandas as pd +from dataretrieval.ogc.filters import FILTER_LANG from dataretrieval.utils import ( HTTPX_DEFAULTS, BaseMetadata, @@ -23,7 +24,7 @@ _get, to_str, ) -from dataretrieval.waterdata.filters import FILTER_LANG +from dataretrieval.waterdata import stats from dataretrieval.waterdata.types import ( CODE_SERVICES, METADATA_COLLECTIONS, @@ -45,8 +46,8 @@ _run_sync, _switch_properties_id, _walk_pages, + _with_state, get_ogc_data, - get_stats_data, ) # Set up logger for this module @@ -200,7 +201,7 @@ def get_daily( filter, filter_lang : optional Server-side CQL filter passed 
through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -212,6 +213,13 @@ def get_daily( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -252,7 +260,7 @@ def get_daily( >>> # multiple sub-requests so the URL stays under the server's byte >>> # limit. Combined output looks like a single query. >>> sites_df, _ = dataretrieval.waterdata.get_monitoring_locations( - ... state_name="Ohio", + ... state="Ohio", ... site_type="Stream", ... ) >>> df, md = dataretrieval.waterdata.get_daily( @@ -408,7 +416,7 @@ def get_continuous( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -420,6 +428,13 @@ def get_continuous( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -433,7 +448,7 @@ def get_continuous( ... ) >>> # Pull several disjoint time windows in one call via a CQL - >>> # ``filter``. 
See ``dataretrieval.waterdata.filters`` for the + >>> # ``filter``. See ``dataretrieval.ogc.filters`` for the >>> # full grammar, auto-chunking, and pitfalls. >>> df, md = dataretrieval.waterdata.get_continuous( ... monitoring_location_id="USGS-02238500", @@ -464,6 +479,7 @@ def get_monitoring_locations( district_code: str | Iterable[str] | None = None, country_code: str | Iterable[str] | None = None, country_name: str | Iterable[str] | None = None, + state: str | Iterable[str] | None = None, state_code: str | Iterable[str] | None = None, state_name: str | Iterable[str] | None = None, county_code: str | Iterable[str] | None = None, @@ -545,6 +561,10 @@ def get_monitoring_locations( The code for the country in which the monitoring location is located. country_name : string or iterable of strings, optional The name of the country in which the monitoring location is located. + state : string or iterable of strings, optional + State/territory filter (the recommended parameter). Accepts a full name + (``"Wisconsin"``), a two-letter postal code (``"WI"``), or a two-digit + ANSI/FIPS code (``"55"``). state_code : string or iterable of strings, optional State code. A two-digit ANSI code (formerly FIPS code) as defined by the American National Standards Institute, to define States and @@ -713,7 +733,7 @@ def get_monitoring_locations( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -725,6 +745,13 @@ def get_monitoring_locations( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. 
Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -744,8 +771,9 @@ def get_monitoring_locations( """ service = "monitoring-locations" - # Build argument dictionary, omitting None values - args = _get_args(locals()) + # Build argument dictionary, omitting None values (resolving the unified + # `state` argument into the OGC `state_name` queryable). + args = _get_args(_with_state(locals(), to="name", into="state_name")) return get_ogc_data(args, service) @@ -757,6 +785,7 @@ def get_time_series_metadata( properties: str | Iterable[str] | None = None, statistic_id: str | Iterable[str] | None = None, hydrologic_unit_code: str | Iterable[str] | None = None, + state: str | Iterable[str] | None = None, state_name: str | Iterable[str] | None = None, last_modified: str | Iterable[str] | None = None, begin: str | Iterable[str] | None = None, @@ -823,6 +852,10 @@ def get_time_series_metadata( to the largest (regions). Each hydrologic unit is identified by a unique hydrologic unit code (HUC) consisting of two to eight digits based on the four levels of classification in the hydrologic unit system. + state : string or iterable of strings, optional + State/territory filter (the recommended parameter). Accepts a full name + (``"Wisconsin"``), a two-letter postal code (``"WI"``), or a two-digit + ANSI/FIPS code (``"55"``). state_name : string or iterable of strings, optional The name of the state or state equivalent in which the monitoring location is located. @@ -937,7 +970,7 @@ def get_time_series_metadata( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
@@ -949,6 +982,13 @@ def get_time_series_metadata( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -968,8 +1008,9 @@ def get_time_series_metadata( """ service = "time-series-metadata" - # Build argument dictionary, omitting None values - args = _get_args(locals()) + # Build argument dictionary, omitting None values (resolving the unified + # `state` argument into the OGC `state_name` queryable). + args = _get_args(_with_state(locals(), to="name", into="state_name")) return get_ogc_data(args, service) @@ -998,6 +1039,7 @@ def get_combined_metadata( district_code: str | Iterable[str] | None = None, country_code: str | Iterable[str] | None = None, country_name: str | Iterable[str] | None = None, + state: str | Iterable[str] | None = None, state_code: str | Iterable[str] | None = None, state_name: str | Iterable[str] | None = None, county_code: str | Iterable[str] | None = None, @@ -1106,6 +1148,10 @@ def get_combined_metadata( interval (``"start/end"``, optionally half-bounded with ``..``), or an ISO 8601 duration (e.g. ``"P1M"``, ``"PT36H"``). See :func:`get_time_series_metadata` for the full grammar. + state : string or iterable of strings, optional + State/territory filter (the recommended parameter). Accepts a full + name (``"Wisconsin"``), a two-letter postal code (``"WI"``), or a + two-digit ANSI/FIPS code (``"55"``). state_name, county_name, hydrologic_unit_code, site_type, \ site_type_code : string or iterable of strings, optional Common location-catalog filters carried over from the @@ -1131,7 +1177,7 @@ def get_combined_metadata( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. 
See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -1143,6 +1189,13 @@ def get_combined_metadata( md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object pertaining to the query. + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -1160,7 +1213,7 @@ def get_combined_metadata( >>> # Every series in a single county, useful for area-of-interest workflows >>> df, md = dataretrieval.waterdata.get_combined_metadata( - ... state_name="Wisconsin", county_name="Dane County" + ... state="Wisconsin", county_name="Dane County" ... ) >>> # Inventory across multiple HUCs, restricted to streams and springs @@ -1198,7 +1251,8 @@ def get_combined_metadata( """ service = "combined-metadata" - args = _get_args(locals()) + # Resolve the unified `state` argument into the OGC `state_name` queryable. + args = _get_args(_with_state(locals(), to="name", into="state_name")) return get_ogc_data(args, service) @@ -1347,7 +1401,7 @@ def get_latest_continuous( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
@@ -1359,6 +1413,13 @@ def get_latest_continuous( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -1543,7 +1604,7 @@ def get_latest_daily( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -1555,6 +1616,13 @@ def get_latest_daily( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -1731,7 +1799,7 @@ def get_field_measurements( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -1743,6 +1811,13 @@ def get_field_measurements( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. 
code:: @@ -1846,7 +1921,7 @@ def get_field_measurements_metadata( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -1858,6 +1933,13 @@ def get_field_measurements_metadata( md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object pertaining to the query. + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -1969,7 +2051,7 @@ def get_peaks( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. @@ -1981,6 +2063,13 @@ def get_peaks( md : :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object pertaining to the query. + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -2063,6 +2152,13 @@ def get_reference_table( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object including the URL request and query time. + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. 
Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: @@ -2443,6 +2539,7 @@ def get_stats_por( approval_status: str | None = None, computation_type: str | Iterable[str] | None = None, country_code: str | Iterable[str] | None = None, + state: str | Iterable[str] | None = None, state_code: str | Iterable[str] | None = None, county_code: str | Iterable[str] | None = None, start_date: str | None = None, @@ -2453,6 +2550,7 @@ def get_stats_por( site_type_code: str | Iterable[str] | None = None, site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, + normal_type: str | None = None, expand_percentiles: bool = True, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get day-of-year and month-of-year water data statistics from the @@ -2478,6 +2576,10 @@ def get_stats_por( arithmetic_mean, maximum, median, minimum, percentile. country_code: string, optional Country query parameter. API defaults to "US". + state: string or iterable of strings, optional + State/territory filter (the recommended parameter). Accepts a full name + ("Wisconsin"), a two-letter postal code ("WI"), or a two-digit + ANSI/FIPS code ("55"). state_code: string, optional State query parameter. Takes the format "US:XX", where XX is the two-digit state code. API defaults to "US:42" (Pennsylvania). @@ -2514,6 +2616,10 @@ def get_stats_por( measured and the units of measure. A complete list of parameter codes and associated groupings can be found at https://help.waterdata.usgs.gov/codes-and-parameters/parameters. + normal_type : string, optional + Filter the returned normals to a single period. If unspecified + (default), all matching data are returned. Available values: + "DOY" (day-of-year) and "MOY" (month-of-year). 
expand_percentiles : boolean Percentile data for a given day of year or month of year by default are returned from the service as lists of string values and percentile @@ -2563,9 +2669,12 @@ def get_stats_por( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args( + _with_state(locals(), to="fips_us", into="state_code"), + exclude={"expand_percentiles"}, + ) - return get_stats_data( + return stats.get_data( args=params, service="observationNormals", expand_percentiles=expand_percentiles ) @@ -2574,6 +2683,7 @@ def get_stats_date_range( approval_status: str | None = None, computation_type: str | Iterable[str] | None = None, country_code: str | Iterable[str] | None = None, + state: str | Iterable[str] | None = None, state_code: str | Iterable[str] | None = None, county_code: str | Iterable[str] | None = None, start_date: str | None = None, @@ -2584,6 +2694,7 @@ def get_stats_date_range( site_type_code: str | Iterable[str] | None = None, site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, + interval_type: str | Iterable[str] | None = None, expand_percentiles: bool = True, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get monthly and annual water data statistics from the USGS Water Data API. @@ -2608,6 +2719,10 @@ def get_stats_date_range( arithmetic_mean, maximum, median, minimum, percentile. country_code: string, optional Country query parameter. API defaults to "US". + state: string or iterable of strings, optional + State/territory filter (the recommended parameter). Accepts a full name + ("Wisconsin"), a two-letter postal code ("WI"), or a two-digit + ANSI/FIPS code ("55"). state_code: string, optional State query parameter. Takes the format "US:XX", where XX is the two-digit state code. API defaults to "US:42" (Pennsylvania). @@ -2649,6 +2764,10 @@ def get_stats_date_range( measured and the units of measure. 
A complete list of parameter codes and associated groupings can be found at https://help.waterdata.usgs.gov/codes-and-parameters/parameters. + interval_type : string or iterable of strings, optional + Filter the returned intervals to one or more periods. If unspecified + (default), all matching data are returned. Available values: + "M" (month), "CY" (calendar year), and "WY" (water year). expand_percentiles : boolean Percentile data for a given day of year or month of year by default are returned from the service as lists of string values and percentile @@ -2682,7 +2801,7 @@ def get_stats_date_range( >>> # Get monthly and yearly medians for streamflow at streams in Rhode Island >>> # from calendar year 2024. >>> df, md = dataretrieval.waterdata.get_stats_date_range( - ... state_code="US:44", # State code for Rhode Island + ... state="RI", # Rhode Island (postal code, name, or FIPS all work) ... parameter_code="00060", ... site_type_code="ST", ... start_date="2024-01-01", @@ -2699,9 +2818,12 @@ def get_stats_date_range( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args( + _with_state(locals(), to="fips_us", into="state_code"), + exclude={"expand_percentiles"}, + ) - return get_stats_data( + return stats.get_data( args=params, service="observationIntervals", expand_percentiles=expand_percentiles, @@ -2865,7 +2987,7 @@ def get_channel( filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See - :mod:`dataretrieval.waterdata.filters` for syntax, auto-chunking, + :mod:`dataretrieval.ogc.filters` for syntax, auto-chunking, and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
@@ -2877,6 +2999,13 @@ def get_channel( md: :obj:`dataretrieval.utils.BaseMetadata` A custom metadata object + Raises + ------ + ChunkInterrupted + A transient failure (429 / 5xx / timeout) interrupted the request + after the built-in retries. Completed work is preserved; resume + with ``exc.call.resume()`` (see :doc:`/userguide/errors`). + Examples -------- .. code:: diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index 2ffe5089..de4ba0d4 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -18,6 +18,7 @@ import pandas as pd from dataretrieval.exceptions import DataRetrievalError +from dataretrieval.ogc.filters import _quote_cql_str from dataretrieval.rdb import extract_rdb_comment, read_rdb from dataretrieval.utils import HTTPX_DEFAULTS, _get @@ -206,15 +207,6 @@ def _as_list(x: str | Iterable[str]) -> list[str]: return [x] if isinstance(x, str) else list(x) -def _quote_cql_str(value: str) -> str: - """Escape a single-quoted CQL literal by doubling embedded quotes. - - Defends against malformed filters / injection on arbitrary user input, - even though valid USGS monitoring-location IDs cannot contain a quote. - """ - return value.replace("'", "''") - - def _build_filter( monitoring_location_id: str | list[str] | None, file_type: str | None, diff --git a/dataretrieval/waterdata/stats.py b/dataretrieval/waterdata/stats.py new file mode 100644 index 00000000..608f73ee --- /dev/null +++ b/dataretrieval/waterdata/stats.py @@ -0,0 +1,293 @@ +"""USGS Water Data Statistics API client. + +Wraps ``https://api.waterdata.usgs.gov/statistics/v0`` — the daily-statistics +service (period-of-record and date-range normals/intervals). This is a +*separate*, non-OGC API: it has no chunkable multi-value axes, so it drives +:func:`engine._paginate` directly through a blocking portal rather than going +through ``multi_value_chunked``. 
The typed getters ``get_stats_por`` and +``get_stats_date_range`` in :mod:`dataretrieval.waterdata.api` call +:func:`get_data` here. +""" + +from __future__ import annotations + +from typing import Any + +import httpx +import pandas as pd + +from dataretrieval.ogc.engine import ( + BASE_URL, + GEOPANDAS, + _attach_coordinates, + _default_headers, + _empty_feature_frame, + _paginate, + _run_sync, +) +from dataretrieval.utils import BaseMetadata + +# ``_handle_nesting``'s geopandas branch calls ``gpd.GeoDataFrame.from_features`` +# directly, so this module needs its own bound ``gpd`` name. Import it under the +# same guard the engine uses; when geopandas is absent ``gpd`` is left unbound +# (``GEOPANDAS`` is ``False``, so the stats path never touches it). The +# empty-page short-circuit instead delegates to ``engine._empty_feature_frame``, +# which resolves the engine's ``gpd`` — so an empty-page test patches +# ``engine.gpd`` while the populated geopandas branch uses ``stats.gpd``. +try: + import geopandas as gpd +except ImportError: # pragma: no cover - exercised only without geopandas + pass + +STATISTICS_API_VERSION = "v0" +STATISTICS_API_URL = f"{BASE_URL}/statistics/{STATISTICS_API_VERSION}" + + +def _handle_nesting( + body: dict[str, Any], + geopd: bool = False, +) -> pd.DataFrame: + """ + Takes nested json from stats service and flattens into a dataframe with + one row per monitoring location, parameter, and statistic. + + Parameters + ---------- + body : Dict[str, Any] + The JSON response body from the statistics service containing nested data. + geopd : bool, optional + Whether ``geopandas`` is available — when ``True`` the returned + frame is a ``GeoDataFrame``; when ``False`` (default) a plain + ``pd.DataFrame`` is returned with geometry flattened. + + Returns + ------- + pd.DataFrame + A DataFrame containing the flattened statistical data. 
+ + Notes + ----- + The non-geopandas branch uses the same schema-aware extraction as + :func:`engine._get_resp_data`: it builds the per-feature outer frame + directly from each feature's ``properties`` (minus the nested + ``data`` field, which is unrolled separately below via the + ``record_path`` json_normalize), then adds ``geometry`` only when + present. Unlike :func:`engine._get_resp_data`, no top-level ``id`` + column is added — stats features don't carry one, so this matches the + geopandas branch. Skipping the GeoJSON envelope keeps newly-added + fields like ``geometry.type`` from leaking into the result. + """ + if body is None: + return _empty_feature_frame(geopd) + + # An empty (or missing) features list — a real mid-pagination + # shape — would otherwise crash the downstream merge with + # ``KeyError: 'monitoring_location_id'`` because neither df nor + # dat would carry the merge key. ``_empty_feature_frame`` bails out + # with a geo-typed empty frame so a later ``pd.concat`` with non-empty + # geo pages doesn't downgrade to a plain DataFrame and strip geometry/CRS. + features = body.get("features") or [] + if not features: + return _empty_feature_frame(geopd) + + # The geopd-missing warning is emitted once at import (see engine module); + # doing it here would log per page. + if not geopd: + outer_props = [ + {k: v for k, v in (f.get("properties") or {}).items() if k != "data"} + for f in features + ] + df = pd.json_normalize(outer_props, sep=".") + df.columns = df.columns.str.split(".").str[-1] + # Stats features don't carry a top-level ``id`` field — the + # geopandas branch (``GeoDataFrame.from_features``) doesn't + # surface one either, so the non-geopd branch stays + # consistent by NOT adding an id column. 
+ _attach_coordinates(df, features) + else: + # Default a missing ``geometry`` key to ``None`` per feature so + # ``from_features`` (which indexes ``feature["geometry"]`` directly) + # can't ``KeyError`` on a stats feature that omits geometry — mirrors + # the guard in :func:`engine._get_resp_data`. + df = gpd.GeoDataFrame.from_features( + [f if "geometry" in f else {**f, "geometry": None} for f in features] + ).drop(columns=["data"], errors="ignore") + + # Unnest json features, properties, data, and values while retaining necessary + # metadata to merge with main dataframe. + dat = pd.json_normalize( + body, + record_path=["features", "properties", "data", "values"], + meta=[ + ["features", "properties", "monitoring_location_id"], + ["features", "properties", "data", "parameter_code"], + ["features", "properties", "data", "unit_of_measure"], + ["features", "properties", "data", "parent_time_series_id"], + ], + meta_prefix="", + errors="ignore", + ) + dat.columns = dat.columns.str.split(".").str[-1] + + return df.merge(dat, on="monitoring_location_id", how="left") + + +def _expand_percentiles(df: pd.DataFrame) -> pd.DataFrame: + """ + Takes percentile value and thresholds columns containing lists + of values and turns each list element into its own row in the + original dataframe. Exploded ``'nan'`` values are dropped. If + no percentile data exist, it adds a percentile column and + populates it with the percentile assigned to min, max, and + median. + + Parameters + ---------- + df : pd.DataFrame + The dataframe returned from using one of the statistics services. + + Returns + ------- + pd.DataFrame + A DataFrame containing the flattened percentile data. 
+ """ + if len(df) > 0: + if "percentile" in df["computation"].unique(): + # Explode percentile lists into rows called "value" and "percentile" + percentiles = df.loc[df["computation"] == "percentile"] + percentiles_explode = percentiles[ + ["computation_id", "values", "percentiles"] + ].explode(["values", "percentiles"], ignore_index=True) + percentiles_explode = percentiles_explode.loc[ + percentiles_explode["values"] != "nan" + ] + percentiles_explode["value"] = pd.to_numeric(percentiles_explode["values"]) + percentiles_explode["percentile"] = pd.to_numeric( + percentiles_explode["percentiles"] + ) + percentiles_explode = percentiles_explode.drop( + columns=["values", "percentiles"] + ) + + # Merge exploded values back to other metadata/geometry + percentiles = percentiles.drop( + columns=["values", "percentiles", "value"], errors="ignore" + ).merge(percentiles_explode, on="computation_id", how="left") + + # Concatenate back to original + dfs = pd.concat( + [df.loc[df["computation"] != "percentile"], percentiles] + ).drop(columns=["values", "percentiles"]) + else: + dfs = df + dfs["percentile"] = pd.NA + + # Give min, max, median a percentile value + dfs.loc[dfs["computation"] == "maximum", "percentile"] = 100 + dfs.loc[dfs["computation"] == "minimum", "percentile"] = 0 + dfs.loc[dfs["computation"] == "median", "percentile"] = 50 + + # Make sure numeric + dfs["percentile"] = pd.to_numeric(dfs["percentile"]) + + # Move percentile column + cols = dfs.columns.tolist() + cols.remove("percentile") + col_index = cols.index("value") + 1 + cols.insert(col_index, "percentile") + + return dfs[cols] + + else: + return df + + +def get_data( + args: dict[str, Any], + service: str, + expand_percentiles: bool, + client: httpx.AsyncClient | None = None, +) -> tuple[pd.DataFrame, BaseMetadata]: + """ + Retrieves statistical data from a specified endpoint and returns it + as a pandas DataFrame with metadata. 
+ + This function prepares request arguments, constructs API requests, + handles pagination, processes results, and formats output according + to the specified parameters. + + The stats path doesn't go through ``multi_value_chunked`` (its query + shape has no chunkable list axes), so it drives :func:`engine._paginate` + directly through an ``anyio`` blocking portal. The portal runs the + pagination loop in a short-lived worker thread, so this works whether + or not the caller is already inside an event loop. + + Parameters + ---------- + args : Dict[str, Any] + Dictionary of request arguments for the statistics service. + service : str + The statistics service type (for example, + "observationNormals" or "observationIntervals"). + expand_percentiles : bool + Determines whether the percentiles column is expanded so that + each percentile gets its own row in the returned dataframe. If + True and the user requests a computation_type other than + percentiles, a percentile column is still returned. + client : httpx.AsyncClient, optional + Caller-borrowed async client. ``None`` (default) opens a + temporary one inside the portal. Primarily a test seam. + + Returns + ------- + pd.DataFrame + A DataFrame containing the retrieved and processed statistical data. + BaseMetadata + A metadata object containing request information including URL and query time. + + Raises + ------ + DataRetrievalError + The typed subclass for an HTTP error response (see :func:`engine._paginate`); + or :class:`~dataretrieval.exceptions.NetworkError` if the initial request + can't reach the service (timeout / DNS), the ``httpx`` exception chained + on ``__cause__``. 
+ """ + + url = f"{STATISTICS_API_URL}/{service}" + req = httpx.Request( + method="GET", + url=url, + headers=_default_headers(), + params=args, + ) + method = req.method + headers = req.headers + + def parse_response(resp: httpx.Response) -> tuple[pd.DataFrame, str | None]: + body = resp.json() + # Coerce falsy cursors ("", 0) to None so _paginate terminates. + # USGS uses "next": null at end-of-stream, but defensive coerce + # protects against any "" sentinel a future schema might use. + return _handle_nesting(body, geopd=GEOPANDAS), body.get("next") or None + + async def follow_up(cursor: str, sess: httpx.AsyncClient) -> httpx.Response: + # Build a fresh params dict per page so the caller's ``args`` + # is never mutated. + return await sess.request( + method, url=url, params={**args, "next_token": cursor}, headers=headers + ) + + async def _run() -> tuple[pd.DataFrame, httpx.Response]: + return await _paginate( + req, + parse_response=parse_response, + follow_up=follow_up, + client=client, + ) + + df, response = _run_sync(_run, service=service) + + if expand_percentiles: + df = _expand_percentiles(df) + return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index e8c18094..4dc7afd3 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -1,1452 +1,180 @@ -from __future__ import annotations - -import copy -import functools -import json -import logging -import numbers -import os -import re -from collections.abc import ( - AsyncIterator, - Awaitable, - Callable, - Iterable, - Iterator, - Mapping, - Sequence, -) -from contextlib import asynccontextmanager, contextmanager -from contextvars import ContextVar -from datetime import datetime, timedelta -from typing import Any, TypeVar, cast, get_args -from zoneinfo import ZoneInfo - -import httpx -import pandas as pd -from anyio.from_thread import start_blocking_portal - -from dataretrieval import __version__ -from 
dataretrieval.exceptions import DataRetrievalError, RateLimited, error_for_status -from dataretrieval.utils import HTTPX_DEFAULTS, BaseMetadata, _get, _network_error -from dataretrieval.waterdata import _progress, chunking -from dataretrieval.waterdata.chunking import ( - _QUOTA_HEADER, - _safe_elapsed, - get_active_client, -) -from dataretrieval.waterdata.types import ( - PROFILE_LOOKUP, - PROFILES, - SERVICES, -) - -try: - import geopandas as gpd - - GEOPANDAS = True -except ImportError: - GEOPANDAS = False - -# Set up logger for this module -logger = logging.getLogger(__name__) - -# Whether geopandas is present is a static, environment-level fact, so warn once -# here at import time rather than per query/chunk. That avoids the warning -# repeating on every call and avoids it interleaving with the progress line's -# carriage-return rewrites. -if not GEOPANDAS: - logger.warning( - "Geopandas not installed. Geometries will be flattened into pandas DataFrames." - ) - -BASE_URL = "https://api.waterdata.usgs.gov" -OGC_API_VERSION = "v0" -OGC_API_URL = f"{BASE_URL}/ogcapi/{OGC_API_VERSION}" -SAMPLES_URL = f"{BASE_URL}/samples-data" -STATISTICS_API_VERSION = "v0" -STATISTICS_API_URL = f"{BASE_URL}/statistics/{STATISTICS_API_VERSION}" - -# Maps each OGC waterdata service to its user-facing ``id`` column (the name the -# typed getters rename the wire ``id`` to, e.g. ``daily`` -> ``daily_id``). -# ``get_cql`` validates its ``service`` argument against these keys and -# uses the value as the ``output_id`` for result shaping. Keep in sync with the -# ``types.WATERDATA_SERVICES`` Literal (same keys). 
-_OUTPUT_ID_BY_SERVICE: dict[str, str] = { - "channel-measurements": "channel_measurements_id", - "combined-metadata": "combined_meta_id", - "continuous": "continuous_id", - "daily": "daily_id", - "field-measurements": "field_measurement_id", - "field-measurements-metadata": "field_series_id", - "latest-continuous": "latest_continuous_id", - "latest-daily": "latest_daily_id", - "monitoring-locations": "monitoring_location_id", - "peaks": "peak_id", - "time-series-metadata": "time_series_id", -} - -# Every service's output id EXCEPT the two that are genuinely user-facing -# (``monitoring_location_id`` and ``time_series_id``). The rest are synthetic -# per-record ids that ``_arrange_cols`` moves to the end of a result frame. -# Derived from ``_OUTPUT_ID_BY_SERVICE`` so adding a service can't silently -# leave a stray id column at the front again. -_EXTRA_ID_COLS = set(_OUTPUT_ID_BY_SERVICE.values()) - { - "monitoring_location_id", - "time_series_id", -} - - -def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str) -> dict[str, Any]: - """ - Switch argument id from its package-specific identifier to the standardized "id" key - that the API recognizes. - - If `ls` does not already have an "id" key, sets it from either the - service-derived id key or the expected id column name. If neither key - exists, "id" is left unset. The original service-specific id keys are - removed regardless. - - Parameters - ---------- - ls : Dict[str, Any] - The dictionary containing identifier keys to be standardized. - id_name : str - The name of the specific identifier key to look for. - service : str - The service name. - - Returns - ------- - Dict[str, Any] - The modified dictionary with the "id" key set appropriately. - - Examples - -------- - For service "time-series-metadata", the function will look for either - "time_series_metadata_id" or "time_series_id" and change the key to simply - "id". 
- """ - - service_id = service.replace("-", "_") + "_id" - - if "id" not in ls: - if service_id in ls: - ls["id"] = ls[service_id] - elif id_name in ls: - ls["id"] = ls[id_name] - - # Remove the original keys regardless of whether they were used - ls.pop(service_id, None) - ls.pop(id_name, None) - - return ls - - -def _switch_properties_id( - properties: list[str] | None, id_name: str, service: str -) -> list[str]: - """ - Build the wire ``properties`` list, dropping every id alias and - ``geometry``. - - The feature ``id`` is always returned and is renamed to the - service-specific id column (e.g. ``daily_id``) in post-processing, so - it must not be requested as a property: several collections (e.g. - ``daily``, ``continuous``) reject ``id`` in ``properties`` with an - HTTP 400. ``geometry`` is likewise excluded because it is controlled - by ``skip_geometry``. Any service-specific id name (``daily_id``, - ``monitoring_location_id``, …) and the bare ``id`` are dropped, and - remaining hyphens are normalized to underscores. Returns an empty - list when `properties` is empty or None — the URL then omits the - ``properties`` filter and the result is shaped by :func:`_arrange_cols`. - - Parameters - ---------- - properties : Optional[List[str]] - A list containing the properties or column names to be pulled from the - service, or None. - id_name : str - The service-specific id column name to drop (e.g. ``daily_id``). - service : str - The service name. - - Returns - ------- - List[str] - The wire ``properties`` with id aliases and ``geometry`` removed - and hyphens normalized. - - Examples - -------- - For service "daily" with ``properties=["daily_id", "value", "geometry"]``, - returns ``["value"]`` — ``daily_id`` and ``geometry`` are dropped, while - the ``daily_id`` column still appears in the result, renamed from the - always-returned feature ``id``. 
- """ - if not properties: - return [] - service_id = service.replace("-", "_") + "_id" - # The feature ``id`` always comes back (renamed to the service id - # downstream) and several collections reject it as a selectable - # property; ``geometry`` is controlled by ``skip_geometry``. Drop both, - # plus the service-specific id column (``id_name``) and the name derived - # straight from the service (``service_id``). - drop = {"id", "geometry", id_name, service_id} - normalized = (p.replace("-", "_") for p in properties) - return [p for p in normalized if p not in drop] - - -_DATETIME_FORMATS = ( - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%dT%H:%M:%S.%f", - "%Y-%m-%dT%H:%M:%S", - "%Y-%m-%d %H:%M:%S.%f", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d", -) - -# Anchored to ``[Pp]\d`` so a normal word containing ``p`` (e.g. ``"Apr"``) -# doesn't get mis-classified as an ISO 8601 duration; the optional ``T`` -# admits time-only forms like ``PT36H``. -_DURATION_RE = re.compile(r"^[Pp]T?\d") - -# OGC API parameters that carry a date/datetime value (single string, -# two-element range, or interval/duration string) rather than a multi-value -# string list. Used by ``_construct_api_requests`` to keep them out of the -# POST/CQL2 multi-value path and to route them through ``_format_api_dates``, -# and by ``_NO_NORMALIZE_PARAMS`` to bypass string-iterable normalization. -_DATE_RANGE_PARAMS = frozenset( - {"datetime", "last_modified", "begin", "begin_utc", "end", "end_utc", "time"} -) - -# Services that don't support comma-separated values for multi-value GET -# parameters and require POST with CQL2 JSON instead. -_CQL2_REQUIRED_SERVICES = frozenset({"monitoring-locations"}) - - -def _parse_datetime(value: str) -> datetime | None: - """Parse a single datetime string against the supported formats. - - Returns a ``datetime`` (tz-aware iff the input carried a UTC offset), - or ``None`` if no format matched. 
- """ - # ``datetime.strptime`` accepts a numeric offset like ``+00:00`` but not - # the ``Z`` shorthand, so normalize trailing ``Z`` first. - candidate = value[:-1] + "+00:00" if value.endswith("Z") else value - for fmt in _DATETIME_FORMATS: - try: - return datetime.strptime(candidate, fmt) - except ValueError: - continue - return None - - -def _format_one(dt: str | None, *, date: bool) -> str | None: - """Format a single datetime element for inclusion in the API time arg.""" - if pd.isna(dt) or dt == "" or dt is None: - return ".." - parsed = _parse_datetime(dt) - if parsed is None: - return None - if date: - return parsed.strftime("%Y-%m-%d") - # Naive inputs are interpreted in the system local zone (for backwards - # compatibility). Use ``.astimezone()`` rather than a fixed offset so each - # value is resolved against the DST rules for ITS OWN date — a frozen - # ``datetime.now()`` offset shifted off-season inputs by an hour. - aware = parsed if parsed.tzinfo is not None else parsed.astimezone() - return aware.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") - - -def _format_api_dates( - datetime_input: str | Sequence[str | None] | None, date: bool = False -) -> str | None: - """ - Formats date or datetime input(s) for use with an API. - - Handles single values or ranges, and converting to ISO 8601 or date-only - formats as needed. - - Parameters - ---------- - datetime_input : Union[str, List[Optional[str]], None] - A single date/datetime string or a list of one or two date/datetime - strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601 (with or - without ``Z``/numeric offset), or relative periods (e.g., "P7D" / - "PT36H"). Range endpoints may be ``None``/``NaN``/empty to denote a - half-bounded range. - date : bool, optional - If True, uses only the date portion ("YYYY-MM-DD"). If False (default), - returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ"). 
- - Returns - ------- - Union[str, None] - - If input is a single value, returns the formatted date/datetime string - or None if parsing fails. - - If input is a list of two values, returns a date/datetime range string - separated by "/" (e.g., "YYYY-MM-DD/YYYY-MM-DD" or - "YYYY-MM-DDTHH:MM:SSZ/YYYY-MM-DDTHH:MM:SSZ"). - - Returns None if input is empty, all NA, or cannot be parsed. - - Raises - ------ - ValueError - If `datetime_input` contains more than two values. - - Notes - ----- - - A single blank/NA value returns None. In a two-value range, a blank/NA - endpoint is rendered as ``".."`` to denote an open bound (e.g. - ``"2024-01-01/.."``); the range is only None when *every* element is - blank/NA or any non-NA element fails to parse. - - Supports ISO 8601 durations such as "P7D" and "PT36H" and pre-formatted - intervals containing ``"/"``; both are passed through unchanged. - - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when - `date` is False. Inputs with an explicit offset (``Z`` or ``+HH:MM``) are - converted from that offset to UTC; naive inputs are interpreted in the - local time zone for backwards compatibility. - """ - if datetime_input is None: - return None - - # Convert single string to list for uniform processing - if isinstance(datetime_input, str): - datetime_input = [datetime_input] - elif isinstance(datetime_input, Mapping): - # `list(mapping)` returns keys, which silently accepts the wrong shape. - raise TypeError( - f"date input must be a string or sequence of strings, " - f"not {type(datetime_input).__name__}." - ) - elif not isinstance(datetime_input, (list, tuple)): - # Materialize any other iterable (pandas.Series, numpy.ndarray, - # generator, ...) so the len()/subscript operations below work. 
- datetime_input = list(datetime_input) - - # Check for null or all NA and return None - if all(pd.isna(dt) or dt == "" or dt is None for dt in datetime_input): - return None - - if len(datetime_input) > 2: - raise ValueError("datetime_input should only include 1-2 values") - - # Pass through duration ("P7D", "PT36H") and pre-formatted interval ("a/b") - # strings untouched. - if len(datetime_input) == 1 and isinstance(datetime_input[0], str): - single = datetime_input[0] - if _DURATION_RE.match(single) or "/" in single: - return single - - # element invalidates the range. - formatted: list[str] = [] - for dt in datetime_input: - one = _format_one(dt, date=date) - if one is None: - return None - formatted.append(one) - return "/".join(formatted) - - -def _cql2_param(args: dict[str, Any]) -> str: - """ - Convert query parameters to CQL2 JSON format for POST requests. - - Parameters - ---------- - args : Dict[str, Any] - Dictionary of query parameters to convert to CQL2 format. - - Returns - ------- - str - Compact JSON string representation of the CQL2 query. - - Notes - ----- - Serialized with the tightest separators (no indentation or - whitespace). The body counts against the server's ~8 KB request-size - limit and against :func:`chunking._request_bytes` when planning - chunks, so every saved byte fits more values per POST: compact - encoding roughly halves the per-value cost versus pretty-printing, - which roughly doubles how many monitoring-location ids fit in one - sub-request and so halves the chunk count for large id lists. - """ - filters = [] - for key, values in args.items(): - filters.append({"op": "in", "args": [{"property": key}, values]}) - - query = {"op": "and", "args": filters} - - return json.dumps(query, separators=(",", ":")) - - -def _default_headers() -> dict[str, str]: - """ - Generate default HTTP headers for API requests. 
- - Returns - ------- - dict - A dictionary containing default headers including 'Accept-Encoding', - 'Accept', 'User-Agent', and 'lang'. If the environment variable - 'API_USGS_PAT' is set, its value is included as the 'X-Api-Key' header. - """ - headers = { - "Accept-Encoding": "compress, gzip", - "Accept": "application/json", - "User-Agent": f"python-dataretrieval/{__version__}", - "lang": "en-US", - } - token = os.getenv("API_USGS_PAT") - if token: - headers["X-Api-Key"] = token - return headers - - -def _check_ogc_requests( - endpoint: str = "daily", req_type: str = "queryables" -) -> dict[str, Any]: - """ - Sends an HTTP GET request to the specified OGC endpoint and request type, - returning the JSON response. - - Parameters - ---------- - endpoint : str, optional - The OGC collection endpoint to query (default is "daily"). - req_type : str, optional - The type of request to make. Must be either "queryables" or "schema" - (default is "queryables"). - - Returns - ------- - dict - The JSON response from the OGC endpoint. - - Raises - ------ - ValueError - If req_type is not "queryables" or "schema". - DataRetrievalError - From :func:`_raise_for_non_200` on any non-200 (the typed subclass for - the status) — same typed contract as the main data path so callers can - use one ``except`` clause everywhere. - """ - if req_type not in ("queryables", "schema"): - raise ValueError(f"req_type must be 'queryables' or 'schema', got {req_type!r}") - url = f"{OGC_API_URL}/collections/{endpoint}/{req_type}" - resp = _get(url, headers=_default_headers(), **HTTPX_DEFAULTS) - _raise_for_non_200(resp) - # ``Response.json`` is typed ``Any``; the OGC queryables/schema endpoints - # return a JSON object, and callers index it as a dict. - return cast("dict[str, Any]", resp.json()) - - -def _error_body(resp: httpx.Response) -> str: - """ - Build an informative error message from an HTTP response. 
- - Parameters - ---------- - resp : httpx.Response - The HTTP response object to extract the error message from. - - Returns - ------- - str - An error message string assembled per status code: - - * **429** — predefined message describing the rate-limit and pointing - at the API-token path; the response body is not consulted. - * **403** — predefined message describing the most common cause - (query exceeding server limits); the response body is not - consulted. - * **other statuses** — attempts ``resp.json()`` and renders - ``"<status>: <code>. <description>."`` from the JSON error - envelope. If the body is not JSON (e.g. an HTML 502 from a - gateway), falls back to ``"<status>: <reason>. <snippet>"`` with - the first 200 characters of ``resp.text``; an empty body - degrades to ``"<status>: <reason>."``. - """ - status = resp.status_code - if status == 429: - return ( - "429: Too many requests made. Please obtain an API token " - "or try again later." - ) - elif status == 403: - return ( - "403: Query request denied. Possible reasons include " - "query exceeding server limits." - ) - try: - j_txt = resp.json() - except ValueError: - snippet = (resp.text or "").strip()[:200] - reason = resp.reason_phrase or "Error" - if snippet: - return f"{status}: {reason}. {snippet}" - return f"{status}: {reason}." - return ( - f"{status}: {j_txt.get('code', 'Unknown type')}. " - f"{j_txt.get('description', 'No description provided')}." - ) - - -def _parse_retry_after(value: str | None) -> float | None: - """ - Parse a USGS ``Retry-After`` header into seconds. - - Parameters - ---------- - value : str or None - The raw header value, or ``None`` if absent. - - Returns - ------- - float or None - Non-negative delta-seconds, clamped at zero. ``None`` when the - header is absent or unparseable; ``ChunkedCall`` treats - ``None`` as "fall back to my own retry policy". - - Notes - ----- - USGS sends ``Retry-After`` as integer delta-seconds (empirically - verified — e.g. ``Retry-After: 2619``).
The HTTP spec also allows - HTTP-date form, but USGS doesn't use it, so this function doesn't - bother parsing it. - """ - if not value: - return None - try: - return max(0.0, float(value.strip())) - except ValueError: - return None - - -def _raise_for_non_200(resp: httpx.Response) -> None: - """ - Raise a typed exception for any non-200 response. - - Routes through :func:`_error_body` (USGS-API-aware: handles - 429/403 specially, extracts ``code``/``description`` from JSON - error bodies) rather than ``Response.raise_for_status``, which - raises ``HTTPStatusError`` with a generic message. - - Parameters - ---------- - resp : httpx.Response - The HTTP response to inspect. - - Raises - ------ - DataRetrievalError - The typed subclass for the status (see - :func:`dataretrieval.exceptions.error_for_status` for the mapping). The - transient types (:class:`~dataretrieval.exceptions.TransientError`) are - distinguished so ``ChunkedCall`` can wrap them as a resumable - :class:`~dataretrieval.waterdata.chunking.QuotaExhausted` / - :class:`~dataretrieval.waterdata.chunking.ServiceInterrupted`; a fatal - :class:`~dataretrieval.exceptions.HTTPError` (not a ``TransientError``) - the chunker won't resume. - """ - status = resp.status_code - if status < 400: - return - raise error_for_status( - status, - _error_body(resp), - retry_after=_parse_retry_after(resp.headers.get("Retry-After")), - ) - - -def _paginated_failure_message(pages_collected: int, cause: BaseException) -> str: - """ - Build a user-facing message for a mid-pagination failure. - - The API exposes no resume cursor, so the caller's only recovery is - to retry the whole call — the message lists the practical knobs, - tailored to whether the failure was rate-limit (429) or something - else. - - Parameters - ---------- - pages_collected : int - Number of pages successfully fetched before the failure. - cause : BaseException - The underlying exception that interrupted pagination. 
- - Returns - ------- - str - A message suitable for the ``DataRetrievalError`` that - ``_walk_pages`` and ``get_stats_data`` raise from the - original exception. - """ - cause_str = str(cause).removesuffix(".") - # Some ``httpx`` exceptions (e.g. ``TimeoutException()`` with no args) - # stringify to empty; fall back to the class name so the - # returned message is always informative. - if not cause_str.strip(): - cause_str = type(cause).__name__ - if isinstance(cause, RateLimited): - action = "wait for the rate-limit window to reset and retry" - else: - action = "retry the request (possibly after a short backoff)" - return ( - f"Paginated request failed after collecting {pages_collected} " - f"page(s): {cause_str}. To recover: {action}, reduce the " - f"request size (e.g. fewer locations, a shorter time range, or " - f"a smaller ``limit``), or obtain an API token." - ) - - -def _ogc_query_params( - params: dict[str, Any], - *, - properties: list[str] | None, - bbox: list[float] | None, - limit: int | None, - skip_geometry: bool | None, -) -> dict[str, Any]: - """Add the shared OGC query knobs to ``params`` (mutated in place). - - Factors out the ``skipGeometry``/``limit``/``bbox``/``properties`` block - common to every OGC request so the typed getters - (:func:`_construct_api_requests`) and the generalized CQL2 path - (:func:`_construct_cql_request`) build identical URL parameters. - - ``skip_geometry=None`` leaves ``skipGeometry`` unset (the server defaults to - including geometry); the typed getters always pass a bool, so their behavior - is unchanged. - """ - if skip_geometry is not None: - params["skipGeometry"] = skip_geometry - params["limit"] = 50000 if limit is None or limit > 50000 else limit - # `len()` instead of truthiness: a numpy ndarray would raise on `if bbox:`. 
- if bbox is not None and len(bbox) > 0: - params["bbox"] = ",".join(map(str, bbox)) - if properties: - params["properties"] = ",".join(properties) - return params - - -def _construct_api_requests( - service: str, - properties: list[str] | None = None, - bbox: list[float] | None = None, - limit: int | None = None, - skip_geometry: bool = False, - **kwargs: Any, -) -> httpx.Request: - """ - Constructs an HTTP request object for the specified water data API service. - - For most services, list parameters are comma-joined and sent as a single - GET request (e.g. ``parameter_code=["00060","00010"]`` becomes - ``parameter_code=00060,00010`` in the URL). For services that do not - support comma-separated values (currently only ``monitoring-locations``), - a POST request with CQL2 JSON is used instead. - - Parameters - ---------- - service : str - The name of the API service to query (e.g., "daily"). - properties : Optional[List[str]], optional - List of property names to include in the request. - bbox : Optional[List[float]], optional - Bounding box coordinates as a list of floats. - limit : Optional[int], optional - Maximum number of results to return per request. - skip_geometry : bool, optional - Whether to exclude geometry from the response (default is False). - **kwargs - Additional query parameters, including date/time filters and other - API-specific options. - - Returns - ------- - httpx.Request - The constructed HTTP request object ready to be sent. - - Notes - ----- - - Date/time parameters are automatically formatted to ISO8601. - """ - service_url = f"{OGC_API_URL}/collections/{service}/items" - - # Format date/time parameters to ISO8601 first — both routing paths need it. - for key in _DATE_RANGE_PARAMS: - if key in kwargs: - kwargs[key] = _format_api_dates( - kwargs[key], - date=(service == "daily" and key != "last_modified"), - ) - - if service in _CQL2_REQUIRED_SERVICES: - # POST with CQL2 JSON: multi-value params go in the request body. 
- # The date-range loop above has already collapsed any _DATE_RANGE_PARAMS - # value to a string, so the list/tuple check below cannot match them. - post_params = { - k: v - for k, v in kwargs.items() - if isinstance(v, (list, tuple)) and len(v) > 1 - } - params = {k: v for k, v in kwargs.items() if k not in post_params} - else: - # GET with comma-separated values: join list/tuple values into one string. - # Skip empty lists/tuples so they're omitted rather than emitted as a - # filterless ``&param=`` (which the server reads as "match empty"). - post_params = {} - params = { - k: ",".join(str(x) for x in v) if isinstance(v, (list, tuple)) else v - for k, v in kwargs.items() - if not (isinstance(v, (list, tuple)) and len(v) == 0) - } - - _ogc_query_params( - params, - properties=properties, - bbox=bbox, - limit=limit, - skip_geometry=skip_geometry, - ) - - # Translate CQL filter Python names to the hyphenated URL parameter that - # the OGC API expects. The Python kwarg is `filter_lang` because hyphens - # aren't valid in Python identifiers. - if "filter_lang" in params: - params["filter-lang"] = params.pop("filter_lang") - - headers = _default_headers() - - if post_params: - headers["Content-Type"] = "application/query-cql-json" - return httpx.Request( - method="POST", - url=service_url, - headers=headers, - content=_cql2_param(post_params), - params=params, - ) - return httpx.Request( - method="GET", - url=service_url, - headers=headers, - params=params, - ) - - -def _construct_cql_request( - service: str, - cql_body: str, - *, - properties: list[str] | None = None, - bbox: list[float] | None = None, - limit: int | None = None, - skip_geometry: bool | None = None, -) -> httpx.Request: - """Build a POST/CQL2 request from a verbatim CQL2 body.
- - The OGC-API counterpart to :func:`_construct_api_requests` for the - generalized :func:`~dataretrieval.waterdata.api.get_cql` path: the - caller supplies an already-serialized CQL2 JSON document (any predicate the - grammar allows), sent unchanged as the request body, while - ``properties``/``bbox``/``limit``/``skip_geometry`` go on the URL via the - shared :func:`_ogc_query_params` — so a generalized query and an equivalent - typed getter produce the same URL parameters. - - Parameters - ---------- - service : str - OGC collection name (e.g. ``"daily"``). - cql_body : str - Serialized CQL2 JSON document, sent as the POST body verbatim. - properties, bbox, limit, skip_geometry - See :func:`_ogc_query_params`. ``properties`` are wire-format - (``id``-translated) names. - - Returns - ------- - httpx.Request - A POST request with ``Content-Type: application/query-cql-json``. - """ - service_url = f"{OGC_API_URL}/collections/{service}/items" - params = _ogc_query_params( - {}, - properties=properties, - bbox=bbox, - limit=limit, - skip_geometry=skip_geometry, - ) - headers = _default_headers() - headers["Content-Type"] = "application/query-cql-json" - return httpx.Request( - method="POST", - url=service_url, - headers=headers, - content=cql_body, - params=params, - ) - - -def _next_req_url( - resp: httpx.Response, *, body: dict[str, Any] | None = None -) -> str | None: - """ - Extracts the URL for the next page of results from an HTTP response from a - water data endpoint. - - Parameters - ---------- - resp : httpx.Response - The HTTP response object containing JSON data and headers. - body : dict, optional - Pre-parsed JSON body for ``resp``. When provided, skips the - ``resp.json()`` call — useful when the caller has already - decoded the body for its own use (avoids a second parse pass). - - Returns - ------- - Optional[str] - The URL for the next page of results if available, otherwise None. 
- - Notes - ----- - - Returns None when the response carries no features. - - Expects the response JSON to contain a "links" list with objects having - "rel" and "href" keys. - - Checks for the "next" relation in the "links" to determine the next URL. - """ - if body is None: - body = resp.json() - if not body.get("numberReturned"): - return None - for link in body.get("links", []): - if link.get("rel") != "next": - continue - href = link.get("href") - if not href: - return None - # Refuse to follow a next-page link to a different host — - # the request's headers/auth were minted for the original - # host and shouldn't leak to whatever a poisoned response - # body might supply. Guarded against mock-shaped ``resp.url`` - # attributes (tests sometimes set strings or ``MagicMock``) - # by falling open when host extraction isn't reliable. - next_host: str | None - cur_host: str | None - try: - next_host = httpx.URL(href).host - resp_url = ( - resp.url - if isinstance(resp.url, httpx.URL) - else httpx.URL(str(resp.url)) - ) - cur_host = resp_url.host - except (httpx.InvalidURL, TypeError): - next_host = cur_host = None - if next_host and cur_host and next_host != cur_host: - raise RuntimeError( - f"Refusing to follow cross-host next-page URL: " - f"{next_host} != {cur_host}" - ) - # ``href`` comes from the JSON ``links`` array (typed ``Any``); the - # ``not href`` guard above already excluded empty/None, and it is a - # URL string (passed to ``httpx.URL`` above). - return cast("str", href) - return None - - -def _get_resp_data( - resp: httpx.Response, - geopd: bool, - *, - body: dict[str, Any] | None = None, -) -> pd.DataFrame: - """ - Extracts and normalizes data from an HTTP response containing GeoJSON features. - - Parameters - ---------- - resp : httpx.Response - The HTTP response object expected to contain a JSON body - with a "features" key. - geopd : bool - Indicates whether geopandas is installed and should be used to - handle geometries. 
- body : dict, optional - Pre-parsed JSON body for ``resp``. When provided, skips the - ``resp.json()`` call — useful when the caller has already - decoded the body for its own use (avoids a second parse pass). - - Returns - ------- - gpd.GeoDataFrame or pd.DataFrame - A ``GeoDataFrame`` when ``geopd`` is True; otherwise a plain - ``DataFrame`` carrying the feature properties plus an ``id`` - column (always present, possibly all-None) and a ``geometry`` - column (coordinates list) when at least one feature includes - geometry. Returns an empty ``DataFrame`` when no features are - returned. - - Notes - ----- - The non-geopandas branch builds the frame directly from each - feature's ``properties`` dict, plus the top-level ``id`` and - ``geometry.coordinates`` columns — the ``id`` column is always - added (so the downstream rename to the service-specific output id - works even on an all-None id), while the ``geometry`` column is - added only when at least one feature carries geometry. This skips - the GeoJSON envelope entirely, so - newly-added Feature-level fields (e.g. ``geometry.type`` after - USGS migrated to full GeoJSON geometry objects) can't leak into - the result frame; no reactive drop-list needs maintenance every - time the upstream schema grows. - """ - if body is None: - body = resp.json() - if not body.get("numberReturned"): - # Preserve the GeoDataFrame type on empty short-circuit so a - # downstream ``pd.concat([empty_page, geo_page])`` doesn't - # downgrade the geopd-installed user's result to a plain - # DataFrame (stripping geometry/CRS). - return gpd.GeoDataFrame() if geopd else pd.DataFrame() - - # Defensive: a 200 with ``numberReturned > 0`` but missing - # ``features`` is a real schema-drift shape (mirrors the guard in - # ``_handle_stats_nesting``). Treat as empty rather than crash with - # ``KeyError`` — the wrapped failure would otherwise look like a - # transient transport error to ``_paginate``'s exception handler. 
- features = body.get("features") or [] - if not features: - return gpd.GeoDataFrame() if geopd else pd.DataFrame() - - if not geopd: - df = pd.json_normalize([f.get("properties") or {} for f in features], sep="_") - # Always materialize the ``id`` column (may be all-None) so - # ``_arrange_cols``'s ``df.rename(columns={"id": output_id})`` - # produces the documented service-specific output_id column - # (daily_id, channel_measurements_id, …) even if the upstream - # response carried no feature-level id. - df["id"] = [f.get("id") for f in features] - geoms = [(f.get("geometry") or {}).get("coordinates") for f in features] - if any(g is not None for g in geoms): - df["geometry"] = geoms - return df - - # Organize json into geodataframe and make sure id column comes along. - df = gpd.GeoDataFrame.from_features(features) - # Mirror the non-geopandas branch's defensive ``f.get("id")`` so a feature - # missing a top-level ``id`` yields None rather than a KeyError. - df["id"] = [f.get("id") for f in features] - df = df[["id"] + [col for col in df.columns if col != "id"]] - - # If no geometry present, then return pandas dataframe. A geodataframe - # is not needed. - if df["geometry"].isnull().all(): - df = pd.DataFrame(df.drop(columns="geometry")) - - return df - - -@asynccontextmanager -async def _client_for( - client: httpx.AsyncClient | None, -) -> AsyncIterator[httpx.AsyncClient]: - """ - Yield a usable async client, picking the best available source. - - Resolution order: - - 1. ``client`` if the caller supplied one (borrowed; not closed - here — the caller owns its lifecycle). - 2. The chunker's shared async client if we're inside a - :class:`~dataretrieval.waterdata.chunking.ChunkedCall` run (per - :func:`chunking.get_active_client`). Borrowed; the chunker - closes it on exit. - 3. A fresh short-lived ``httpx.AsyncClient`` opened here and closed - on context exit. 
- - Parameters - ---------- - client : httpx.AsyncClient or None - A caller-owned client to borrow, or ``None`` to defer to the - chunker's shared client or a temporary one. - - Yields - ------ - httpx.AsyncClient - The chosen client. - """ - if client is not None: - yield client - return - shared = get_active_client() - if shared is not None: - yield shared - return - async with httpx.AsyncClient(**HTTPX_DEFAULTS) as new: - yield new - - -def _aggregate_paginated_response( - initial: httpx.Response, - last: httpx.Response, - total_elapsed: timedelta, -) -> httpx.Response: - """ - Build a single response covering a paginated call. - - Returns a shallow copy of ``initial`` with ``.headers`` set to the - LAST page's (so downstream sees current ``x-ratelimit-remaining``) - and ``.elapsed`` set to total wall-clock. The canonical - ``initial.url`` is preserved (it's the user's original query). - Both ``initial`` and ``last`` are left unmutated, mirroring the - convention of - :func:`dataretrieval.waterdata.chunking._combine_chunk_responses`. - - Parameters - ---------- - initial : httpx.Response - First-page response (the canonical one for ``md.url``). - last : httpx.Response - Last-page response — supplies the headers to copy over. - total_elapsed : datetime.timedelta - Cumulative wall-clock across every page, including ``initial``. - - Returns - ------- - httpx.Response - A shallow copy of ``initial`` with ``.headers`` set to a fresh - ``httpx.Headers`` and ``.elapsed`` set to the cumulative - wall-clock. ``initial.headers`` / ``initial.elapsed`` are - never mutated, so callers holding a pre-pagination reference - still see the original first-page values. - """ - final = copy.copy(initial) - final.headers = httpx.Headers(last.headers) - final.elapsed = total_elapsed - return final - - -_Cursor = TypeVar("_Cursor") - -# Optional cap on the total rows a single paginated call accumulates before it -# stops following ``next`` links. 
``None`` (the default the data getters use) -# means "no cap — fetch the whole series". Set via :func:`_row_cap` so the deep -# ``_paginate`` loop can honor it without threading the value through the -# generic chunker; this mirrors the ``_progress`` ambient-reporter pattern. -_row_cap_var: ContextVar[int | None] = ContextVar("waterdata_row_cap", default=None) - - -@contextmanager -def _row_cap(max_rows: int | None) -> Iterator[None]: - """Cap the rows any :func:`_paginate` under this context will - accumulate (``None`` = uncapped). Used by :func:`get_reference_table` - to preview large tables without downloading every page.""" - token = _row_cap_var.set(max_rows) - try: - yield - finally: - _row_cap_var.reset(token) - - -async def _paginate( - initial_req: httpx.Request, - *, - parse_response: Callable[[httpx.Response], tuple[pd.DataFrame, _Cursor | None]], - follow_up: Callable[[_Cursor, httpx.AsyncClient], Awaitable[httpx.Response]], - client: httpx.AsyncClient | None = None, -) -> tuple[pd.DataFrame, httpx.Response]: - """ - Drive a paginated request to completion over an - :class:`httpx.AsyncClient`. - - The common shape behind :func:`_walk_pages` and - :func:`get_stats_data`: send the initial request, then loop calling - ``follow_up`` until ``parse_response`` reports a ``None`` cursor, - accumulating frames and elapsed time. Any mid-pagination failure - raises ``DataRetrievalError`` wrapping the cause — the API exposes no - resume cursor, so the caller's only recovery is to retry the whole - call. Issuing HTTP asynchronously lets the multiple sub-requests of a - chunked call run concurrently under - :meth:`~dataretrieval.waterdata.chunking.ChunkedCall._run`. - - Parameters - ---------- - initial_req : httpx.Request - First-page request to send. - parse_response : callable - ``resp -> (df, next_cursor_or_None)``. Returns the page's - DataFrame and the cursor (URL, token, …) used to drive - ``follow_up`` for the next page; ``None`` terminates the loop. 
- follow_up : callable - ``(cursor, client) -> Awaitable[httpx.Response]``. Builds and - sends the next-page request. - client : httpx.AsyncClient, optional - Caller-borrowed client. ``None`` (default) means use the - chunker's shared client (if inside a chunked call) or open - a temporary one. - - Returns - ------- - df : pandas.DataFrame - Concatenation of every page's parsed frame. - response : httpx.Response - A shallow copy of the first-page response, with ``.headers`` - rebuilt as a fresh ``httpx.Headers`` reflecting the last page and - ``.elapsed`` set to cumulative wall-clock. The canonical URL is - preserved from the first page. The original first-page response - is not mutated. - - Raises - ------ - DataRetrievalError - On a non-200 initial response, the typed subclass for the status from - :func:`_raise_for_non_200` (a - :class:`~dataretrieval.exceptions.TransientError` for a retryable - 429 / 5xx, otherwise a fatal :class:`~dataretrieval.exceptions.HTTPError`); - or, on an initial-page parse failure or any subsequent-page failure, a - base ``DataRetrievalError`` wrapping the cause (built by - :func:`_paginated_failure_message`, original exception on ``__cause__``). - httpx.HTTPError - Network-level failures on the *initial* request (e.g. - ``ConnectError``, ``TimeoutException``) propagate unmodified - so callers can branch on the specific type; equivalent - failures on subsequent pages are wrapped per above. 
- """ - logger.debug("Requesting: %s", initial_req.url) - reporter = _progress.current() - async with _client_for(client) as sess: - resp = await sess.send(initial_req) - _raise_for_non_200(resp) - initial_response = resp - total_elapsed = _safe_elapsed(resp) - - try: - df, cursor = parse_response(resp) - except Exception as e: # noqa: BLE001 - # Initial-page parse failures (malformed JSON, missing - # ``features``, schema drift) get the same wrapped-message - # treatment as follow-up failures so callers see a consistent - # diagnostic regardless of which page broke. - logger.warning("Initial response parse failed.") - raise DataRetrievalError(_paginated_failure_message(0, e)) from e - dfs = [df] - # Stop following ``next`` links once the optional row cap is reached - # (see :func:`_row_cap`); ``None`` means uncapped. The concatenation - # is sliced to the cap below so a final over-budget page can't exceed it. - cap = _row_cap_var.get() - nrows = len(df) - if reporter is not None: - reporter.set_rate_remaining( - resp.headers.get(_QUOTA_HEADER), - limit=resp.headers.get("x-ratelimit-limit"), - ) - reporter.add_page(rows=len(df)) - while cursor is not None and (cap is None or nrows < cap): - try: - resp = await follow_up(cursor, sess) - _raise_for_non_200(resp) - df, cursor = parse_response(resp) - dfs.append(df) - nrows += len(df) - total_elapsed += _safe_elapsed(resp) - if reporter is not None: - reporter.set_rate_remaining( - resp.headers.get(_QUOTA_HEADER), - limit=resp.headers.get("x-ratelimit-limit"), - ) - reporter.add_page(rows=len(df)) - except Exception as e: # noqa: BLE001 - logger.warning( - "Request failed at cursor %r. 
Data download interrupted.", - cursor, - ) - raise DataRetrievalError(_paginated_failure_message(len(dfs), e)) from e - - # Aggregate headers / elapsed onto a COPY of the initial - # response so the user's caller never sees an in-place - # mutation of the response object they may have inspected - # mid-pagination via a hook or test fixture. - final_response = _aggregate_paginated_response( - initial_response, resp, total_elapsed - ) - result = pd.concat(dfs, ignore_index=True) - if cap is not None: - result = result.head(cap) - return result, final_response - - -def _ogc_parse_response( - resp: httpx.Response, *, geopd: bool -) -> tuple[pd.DataFrame, str | None]: - """Parse one OGC API page: extract the DataFrame and the next-page URL. - - The parse strategy :func:`_walk_pages` hands to - :func:`_paginate`. Coerces falsy cursors (empty href, etc.) to - ``None`` so the paginate loop's ``while cursor is not None`` - terminates instead of spinning on a meaningless value. - """ - body = resp.json() - return ( - _get_resp_data(resp, geopd=geopd, body=body), - _next_req_url(resp, body=body) or None, - ) - - -async def _walk_pages( - geopd: bool, - req: httpx.Request, - client: httpx.AsyncClient | None = None, -) -> tuple[pd.DataFrame, httpx.Response]: - """ - Iterate paginated OGC API responses asynchronously and aggregate - them into one DataFrame. - - Thin wrapper that hands off to :func:`_paginate` with - OGC-specific strategies: pages are parsed via :func:`_get_resp_data` - (through :func:`_ogc_parse_response`) and the next-page cursor is the - URL from the response's ``links`` array (per :func:`_next_req_url`). - - Parameters - ---------- - geopd : bool - Whether geopandas is installed (drives geometry handling). - req : httpx.Request - The initial HTTP request to send. - client : httpx.AsyncClient, optional - Caller-borrowed client; ``None`` defers client management to - :func:`_paginate`. 
- - Returns - ------- - pd.DataFrame - A DataFrame containing the aggregated results from all pages. - httpx.Response - Aggregated response — initial-request URL (for query identity), - final page's headers (so downstream sees current rate-limit - state), and cumulative ``elapsed`` summed across pages. - - Raises - ------ - DataRetrievalError - See :func:`_paginate`. - httpx.HTTPError - See :func:`_paginate`. - """ - method = req.method # ``httpx.Request.method`` is already upper-cased. - headers = req.headers - content = req.content if method == "POST" else None - - async def follow_up(cursor: str, sess: httpx.AsyncClient) -> httpx.Response: - return await sess.request(method, cursor, headers=headers, content=content) - - return await _paginate( - req, - parse_response=functools.partial(_ogc_parse_response, geopd=geopd), - follow_up=follow_up, - client=client, - ) - - -def _deal_with_empty( - return_list: pd.DataFrame, properties: list[str] | None, service: str -) -> pd.DataFrame: - """ - Handles empty DataFrame results by returning a DataFrame with appropriate columns. - - If `return_list` is empty, determines the column names to use: - - If `properties` is not provided or contains only NaN values, - retrieves schema properties from the specified service. - - Otherwise, uses the provided `properties` list as column names. - - Parameters - ---------- - return_list : pd.DataFrame - The DataFrame to check for emptiness. - properties : Optional[List[str]] - List of property names to use as columns, or None. - service : str - The service endpoint to query for schema properties if needed. - - Returns - ------- - pd.DataFrame - The original DataFrame if not empty, otherwise an empty - DataFrame with the appropriate columns. 
- """ - if return_list.empty: - if not properties or all(pd.isna(properties)): - schema = _check_ogc_requests(endpoint=service, req_type="schema") - properties = list(schema.get("properties", {}).keys()) - return pd.DataFrame(columns=properties) - return return_list +"""Water Data API layer over the generic OGC engine. + +The API-agnostic OGC machinery (request construction, pagination, response +shaping, the chunked ``get_ogc_data`` entry point) lives in +:mod:`dataretrieval.ogc.engine`. This module is the Water-Data-specific layer +on top of it: it supplies the service-to-id map, the CQL2/date-only dialect, +profile validation, and a thin ``get_ogc_data`` wrapper that injects the +Water Data defaults. (The statistics path lives in its own +:mod:`dataretrieval.waterdata.stats` module.) Every engine symbol the Water Data +getters (``api.py``, ``ratings.py``, ``nearest.py``) and the test suite import +from here is re-exported below. +""" +from __future__ import annotations -def _arrange_cols( - df: pd.DataFrame, properties: list[str] | None, output_id: str -) -> pd.DataFrame: - """ - Rearranges and renames columns in a DataFrame based on provided - properties and the service output id. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame whose columns are to be rearranged or renamed. - properties : Optional[List[str]] - A list of column names to possibly rename. If None or contains - only NaN, the function renames 'id' to output_id. - output_id : str - The name to which the 'id' column should be renamed if applicable. - - Returns - ------- - pd.DataFrame or gpd.GeoDataFrame - The DataFrame with columns rearranged and/or renamed according - to the specified properties and output_id. - """ - - # Rename id column to output_id - df = df.rename(columns={"id": output_id}) - - if properties and not all(pd.isna(properties)): - # Don't alias the caller's list — we mutate below. 
- local_properties = list(properties) - if "geometry" in df.columns and "geometry" not in local_properties: - local_properties.append("geometry") - # 'id' is a valid service column, but expose it under the - # service-specific output_id name instead. - if "id" in local_properties: - local_properties[local_properties.index("id")] = output_id - df = df.loc[:, [col for col in local_properties if col in df.columns]] - - # Move meaningless-to-user, extra id columns to the end - # of the dataframe, if they exist - extra_id_col = set(df.columns).intersection(_EXTRA_ID_COLS) - - # If the arbitrary id column is returned (either due to properties - # being none or NaN), then move it to the end of the dataframe, but - # if part of properties, keep in requested order - if extra_id_col and (properties is None or all(pd.isna(properties))): - id_col_order = [col for col in df.columns if col not in extra_id_col] + list( - extra_id_col - ) - df = df.loc[:, id_col_order] - - return df - - -def _type_cols(df: pd.DataFrame) -> pd.DataFrame: - """ - Casts columns into appropriate types. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing water data. +from typing import Any, get_args - Returns - ------- - pd.DataFrame - The DataFrame with columns cast to appropriate types. 
+import httpx +import pandas as pd - """ - cols = set(df.columns) - numerical_cols = [ - "altitude", - "altitude_accuracy", - "contributing_drainage_area", - "drainage_area", - "hole_constructed_depth", - "value", - "well_constructed_depth", - ] - time_cols = [ - "begin", - "begin_utc", - "construction_date", - "end", - "end_utc", - "last_modified", - "time", - ] +from dataretrieval.codes.states import to_state +from dataretrieval.ogc import engine +from dataretrieval.ogc.engine import ( + _DATE_RANGE_PARAMS, + _DURATION_RE, + BASE_URL, + GEOPANDAS, + OGC_API_URL, + OgcDialect, + _arrange_cols, + _as_str_list, + _check_id_format, + _check_monitoring_location_id, + _check_ogc_requests, + _construct_api_requests, + _construct_cql_request, + _deal_with_empty, + _default_headers, + _error_body, + _format_api_dates, + _get_resp_data, + _next_req_url, + _normalize_str_iterable, + _paginate, + _paginated_failure_message, + _parse_retry_after, + _raise_for_non_200, + _row_cap, + _run_sync, + _switch_properties_id, + _to_snake_case, + _walk_pages, +) +from dataretrieval.ogc.engine import ( + _get_args as _engine_get_args, +) +from dataretrieval.utils import BaseMetadata +from dataretrieval.waterdata.types import ( + PROFILE_LOOKUP, + PROFILES, + SERVICES, +) - for col in cols.intersection(time_cols): - df[col] = pd.to_datetime(df[col], errors="coerce") +SAMPLES_URL = f"{BASE_URL}/samples-data" - for col in cols.intersection(numerical_cols): - df[col] = pd.to_numeric(df[col], errors="coerce") +# Maps each OGC waterdata service to its user-facing ``id`` column (the name the +# typed getters rename the wire ``id`` to, e.g. ``daily`` -> ``daily_id``). +# ``get_cql`` validates its ``service`` argument against these keys and +# uses the value as the ``output_id`` for result shaping. Keep in sync with the +# ``types.WATERDATA_SERVICES`` Literal (same keys). 
+_OUTPUT_ID_BY_SERVICE: dict[str, str] = { + "channel-measurements": "channel_measurements_id", + "combined-metadata": "combined_meta_id", + "continuous": "continuous_id", + "daily": "daily_id", + "field-measurements": "field_measurement_id", + "field-measurements-metadata": "field_series_id", + "latest-continuous": "latest_continuous_id", + "latest-daily": "latest_daily_id", + "monitoring-locations": "monitoring_location_id", + "peaks": "peak_id", + "time-series-metadata": "time_series_id", +} - return df +# Every service's output id EXCEPT the two that are genuinely user-facing +# (``monitoring_location_id`` and ``time_series_id``). The rest are synthetic +# per-record ids that ``_arrange_cols`` moves to the end of a result frame. +# Derived from ``_OUTPUT_ID_BY_SERVICE`` so adding a service can't silently +# leave a stray id column at the front again. +_EXTRA_ID_COLS = frozenset( + set(_OUTPUT_ID_BY_SERVICE.values()) - {"monitoring_location_id", "time_series_id"} +) +# The Water Data API dialect: ``monitoring-locations`` doesn't accept +# comma-separated multi-value GET params (so it must POST CQL2 JSON), +# ``daily`` renders its time arguments date-only (``YYYY-MM-DD``), and the +# ``time_cols``/``numerical_cols``/``sort_cols`` are the Water-Data column +# vocabulary used to coerce datetime/numeric columns and to sort results. +WATERDATA_DIALECT = OgcDialect( + cql2_services=frozenset({"monitoring-locations"}), + date_only_services=frozenset({"daily"}), + time_cols=frozenset( + { + "begin", + "begin_utc", + "construction_date", + "end", + "end_utc", + "last_modified", + "time", + } + ), + numerical_cols=frozenset( + { + "altitude", + "altitude_accuracy", + "contributing_drainage_area", + "drainage_area", + "hole_constructed_depth", + "value", + "well_constructed_depth", + } + ), + sort_cols=("time", "monitoring_location_id"), +) -def _sort_rows(df: pd.DataFrame) -> pd.DataFrame: - """ - Sorts rows by 'time' and 'monitoring_location_id' columns if they - exist. 
+# Iterable-shaped params that ``_get_args`` must NOT push through +# ``_normalize_str_iterable`` (scalar non-string knobs are caught by runtime +# type, so only iterables with special handling need to be named here): +# - date-range params may contain ``pd.NaT``/None or interval strings +# - ``bbox``/``boundingBox`` are ``list[float]``, sometimes ``numpy.ndarray`` +# - ``get_peaks``'s int-valued filters (``water_year`` etc.) are ``list[int]`` +# - ``get_combined_metadata``'s ``thresholds`` is ``list[float]`` +_NO_NORMALIZE_PARAMS = _DATE_RANGE_PARAMS | { + "bbox", + "boundingBox", + "water_year", + "year", + "month", + "day", + "peak_since", + "thresholds", +} - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing water data. - Returns - ------- - pd.DataFrame - The DataFrame with rows ordered by time and site. +def _get_args( + local_vars: dict[str, Any], exclude: set[str] | None = None +) -> dict[str, Any]: + """Water-Data wrapper over :func:`engine._get_args`. + Supplies the Water Data API's extended ``no_normalize`` set (numeric + params such as ``water_year``, ``thresholds``, ``boundingBox``) so they + keep their element types. See :func:`engine._get_args` for the full + normalization contract. """ - if "time" in df.columns and "monitoring_location_id" in df.columns: - df = df.sort_values(by=["time", "monitoring_location_id"], ignore_index=True) - elif "time" in df.columns: - df = df.sort_values(by="time", ignore_index=True) - - return df - - -def _finalize_ogc( - frame: pd.DataFrame, - response: httpx.Response, - *, - properties: list[str] | None, - output_id: str, - convert_type: bool, - service: str, - max_rows: int | None = None, -) -> tuple[pd.DataFrame, BaseMetadata]: - """Shape a combined OGC result into the user-facing ``(df, md)``. 
+ return _engine_get_args(local_vars, exclude, no_normalize=_NO_NORMALIZE_PARAMS) - The single home for the OGC getters' result shaping: empties - normalized, types coerced (when ``convert_type``), the wire ``id`` - renamed and columns ordered, rows sorted, optionally truncated to - ``max_rows``, and the response wrapped as - :class:`~dataretrieval.utils.BaseMetadata`. - Injected into the chunker as its ``finalize`` hook (see - :data:`~dataretrieval.waterdata.chunking._Finalize`) so the - un-interrupted return *and* a resumed ``ChunkInterrupted.call.resume()`` - produce the same shape — closing the gap where resume used to hand back - the chunker's raw frame and bare ``httpx.Response``. +def _with_state(local_vars: dict[str, Any], *, to: str, into: str) -> dict[str, Any]: + """Resolve the unified ``state`` argument into an endpoint's native state + queryable, returning the (mutated) args mapping. - ``max_rows`` is applied here (after dedup/sort, on the *combined* frame) - rather than only per-sub-request, so a chunked call's total is bounded - to exactly ``max_rows`` and a resumed call honors the cap too — the - per-``_paginate`` ``_row_cap`` is only an early-stop download bound. + ``state`` is the canonical, format-flexible parameter (full name / postal / + FIPS); it is normalized via :func:`~dataretrieval.codes.states.to_state` to + the ``to`` representation and stored under ``into`` (the queryable this + endpoint actually filters on). It is additive sugar over the native + ``state_code`` / ``state_name`` parameters, which still accept the API's + raw values (e.g. non-US FIPS); passing ``state`` together with either + raises ``ValueError``. 
""" - frame = _deal_with_empty(frame, properties, service) - if convert_type: - frame = _type_cols(frame) - frame = _arrange_cols(frame, properties, output_id) - frame = _sort_rows(frame) - if max_rows is not None: - frame = frame.head(max_rows) - return frame, BaseMetadata(response) + state = local_vars.pop("state", None) + if state is None: + return local_vars + if any(local_vars.get(p) is not None for p in ("state_code", "state_name")): + raise ValueError("Pass `state`, or state_code/state_name, but not both.") + local_vars[into] = to_state(state, to) + return local_vars def get_ogc_data( @@ -1455,21 +183,20 @@ def get_ogc_data( output_id: str | None = None, max_rows: int | None = None, ) -> tuple[pd.DataFrame, BaseMetadata]: - """ - Retrieves OGC (Open Geospatial Consortium) data from a specified - endpoint and returns it as a pandas DataFrame with metadata. + """Water-Data wrapper over :func:`engine.get_ogc_data`. - This function prepares request arguments, constructs API requests, - handles pagination, processes the results, and formats output - according to the specified parameters. + Defaults ``output_id`` from the Water Data service map when not given, + and supplies the Water Data extra-id columns and dialect, so the typed + getters in ``api.py`` call this unchanged. (Sibling OGC APIs such as + NGWMN call ``engine.get_ogc_data`` directly with their own base URL and + dialect rather than going through this Water Data wrapper.) Parameters ---------- args : Dict[str, Any] Dictionary of request arguments for the OGC service. service : str - The OGC API collection name (e.g., ``"daily"``, - ``"monitoring-locations"``, ``"continuous"``). + The OGC API collection name (e.g., ``"daily"``). output_id : str, optional The user-facing id column the wire ``id`` is renamed to. 
Defaults to ``_OUTPUT_ID_BY_SERVICE[service]``; pass it explicitly only for @@ -1477,8 +204,7 @@ def get_ogc_data( max_rows : int, optional Stop paginating once this many rows have been collected and truncate the result to exactly ``max_rows``. ``None`` (default) - fetches the full result. Intended for cheap previews of large, - un-chunked tables (e.g. :func:`get_reference_table`). + fetches the full result. Returns ------- @@ -1486,360 +212,48 @@ def get_ogc_data( A DataFrame containing the retrieved and processed OGC data. BaseMetadata A metadata object containing request information including URL and query time. - - Notes - ----- - - The function does not mutate the input `args` dictionary. - - Handles optional arguments such as `convert_type`. - - Applies column cleanup and reordering based on service and properties. """ - # Enforce a genuine positive integer: a float (even ``10.0``) or ``bool`` - # would pass a bare ``< 1`` check and then crash deep in - # ``pd.DataFrame.head`` with an opaque ``TypeError`` after HTTP I/O has - # already fired. ``numbers.Integral`` (not ``int``) so numpy integers — - # e.g. ``max_rows`` derived from a numpy/pandas computation — are accepted; - # ``bool`` is an ``Integral`` subtype, so exclude it explicitly. - if max_rows is not None and ( - not isinstance(max_rows, numbers.Integral) - or isinstance(max_rows, bool) - or max_rows < 1 - ): - raise ValueError(f"max_rows must be a positive integer (got {max_rows!r}).") - - # Each service renames its wire ``id`` to a service-specific column; that - # name is derived from ``service`` via the canonical map so the getters - # don't each repeat it. Callers for collections outside the map (e.g. - # get_reference_table's metadata collections) pass output_id explicitly. 
if output_id is None: output_id = _OUTPUT_ID_BY_SERVICE[service] - - args = args.copy() - args["service"] = service - args = _switch_arg_id(args, id_name=output_id, service=service) - # Capture `properties` before the id-switch so post-processing sees - # the user-facing names, not the wire-format ones. - properties = args.get("properties") - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) - convert_type = args.pop("convert_type", False) - args = {k: v for k, v in args.items() if v is not None} - - # Post-processing is injected into the chunker rather than applied here, - # so it runs on *every* exit: the normal return AND a later - # ``exc.call.resume()`` after a ChunkInterrupted (which never re-enters - # this function). ``_finalize_ogc`` is the single source of result shape; - # it also applies ``max_rows`` to the *combined* frame so the cap is the - # exact total even when the plan chunks or the call is resumed, while - # ``_row_cap`` below only early-stops each sub-request's pagination. - finalize = functools.partial( - _finalize_ogc, - properties=properties, - output_id=output_id, - convert_type=convert_type, - service=service, + return engine.get_ogc_data( + args, + service, + output_id, max_rows=max_rows, + base_url=OGC_API_URL, + extra_id_cols=_EXTRA_ID_COLS, + dialect=WATERDATA_DIALECT, ) - with _progress.progress_context(service=service), _row_cap(max_rows): - return _fetch_once(args, finalize=finalize) - - -@chunking.multi_value_chunked(build_request=_construct_api_requests) -async def _fetch_once( - args: dict[str, Any], -) -> tuple[pd.DataFrame, httpx.Response]: - """Send one prepared-args OGC request asynchronously; return the - frame + response. - - ``@chunking.multi_value_chunked`` models every multi-value list - parameter and the cql-text filter as a chunkable axis, greedy-halves - the biggest chunk across all axes until each sub-request URL fits, - and iterates the cartesian product. 
With no chunkable inputs the - decorator passes args through unchanged. The decorator gathers every - sub-request over one shared :class:`httpx.AsyncClient` (concurrency - bounded by a semaphore, sized from ``API_USGS_CONCURRENT``) - and returns a *synchronous* wrapper, so ``get_ogc_data`` keeps calling - ``_fetch_once(args, finalize=...)`` synchronously. The return shape is - ``(frame, response)``. - """ - req = _construct_api_requests(**args) - return await _walk_pages(geopd=GEOPANDAS, req=req) - - -def _handle_stats_nesting( - body: dict[str, Any], - geopd: bool = False, -) -> pd.DataFrame: - """ - Takes nested json from stats service and flattens into a dataframe with - one row per monitoring location, parameter, and statistic. - - Parameters - ---------- - body : Dict[str, Any] - The JSON response body from the statistics service containing nested data. - geopd : bool, optional - Whether ``geopandas`` is available — when ``True`` the returned - frame is a ``GeoDataFrame``; when ``False`` (default) a plain - ``pd.DataFrame`` is returned with geometry flattened. - - Returns - ------- - pd.DataFrame - A DataFrame containing the flattened statistical data. - - Notes - ----- - The non-geopandas branch uses the same schema-aware extraction as - :func:`_get_resp_data`: it builds the per-feature outer frame - directly from each feature's ``properties`` (minus the nested - ``data`` field, which is unrolled separately below via the - ``record_path`` json_normalize), then adds ``geometry`` only when - present. Unlike :func:`_get_resp_data`, no top-level ``id`` column - is added — stats features don't carry one, so this matches the - geopandas branch. Skipping the GeoJSON envelope keeps newly-added - fields like ``geometry.type`` from leaking into the result. 
- """ - if body is None: - return gpd.GeoDataFrame() if geopd else pd.DataFrame() - - # An empty (or missing) features list — a real mid-pagination - # shape — would otherwise crash the downstream merge with - # ``KeyError: 'monitoring_location_id'`` because neither df nor - # dat would carry the merge key. Bail out with an empty frame — - # ``GeoDataFrame`` when geopd is available so the eventual - # ``pd.concat`` with non-empty geo pages doesn't downgrade to a - # plain DataFrame and strip geometry/CRS. - features = body.get("features") or [] - if not features: - return gpd.GeoDataFrame() if geopd else pd.DataFrame() - - # The geopd-missing warning is emitted once at import (see top of module); - # doing it here would log per page. - if not geopd: - outer_props = [ - {k: v for k, v in (f.get("properties") or {}).items() if k != "data"} - for f in features - ] - df = pd.json_normalize(outer_props, sep=".") - df.columns = df.columns.str.split(".").str[-1] - # Stats features don't carry a top-level ``id`` field — the - # geopandas branch (``GeoDataFrame.from_features``) doesn't - # surface one either, so the non-geopd branch stays - # consistent by NOT adding an id column. - geoms = [(f.get("geometry") or {}).get("coordinates") for f in features] - if any(g is not None for g in geoms): - df["geometry"] = geoms - else: - df = gpd.GeoDataFrame.from_features(features).drop( - columns=["data"], errors="ignore" - ) - - # Unnest json features, properties, data, and values while retaining necessary - # metadata to merge with main dataframe. 
- dat = pd.json_normalize( - body, - record_path=["features", "properties", "data", "values"], - meta=[ - ["features", "properties", "monitoring_location_id"], - ["features", "properties", "data", "parameter_code"], - ["features", "properties", "data", "unit_of_measure"], - ["features", "properties", "data", "parent_time_series_id"], - ], - meta_prefix="", - errors="ignore", - ) - dat.columns = dat.columns.str.split(".").str[-1] - - return df.merge(dat, on="monitoring_location_id", how="left") - - -def _expand_percentiles(df: pd.DataFrame) -> pd.DataFrame: - """ - Takes percentile value and thresholds columns containing lists - of values and turns each list element into its own row in the - original dataframe. Exploded ``'nan'`` values are dropped. If - no percentile data exist, it adds a percentile column and - populates it with the percentile assigned to min, max, and - median. - - Parameters - ---------- - df : pd.DataFrame - The dataframe returned from using one of the statistics services. - - Returns - ------- - pd.DataFrame - A DataFrame containing the flattened percentile data. 
- """ - if len(df) > 0: - if "percentile" in df["computation"].unique(): - # Explode percentile lists into rows called "value" and "percentile" - percentiles = df.loc[df["computation"] == "percentile"] - percentiles_explode = percentiles[ - ["computation_id", "values", "percentiles"] - ].explode(["values", "percentiles"], ignore_index=True) - percentiles_explode = percentiles_explode.loc[ - percentiles_explode["values"] != "nan" - ] - percentiles_explode["value"] = pd.to_numeric(percentiles_explode["values"]) - percentiles_explode["percentile"] = pd.to_numeric( - percentiles_explode["percentiles"] - ) - percentiles_explode = percentiles_explode.drop( - columns=["values", "percentiles"] - ) - - # Merge exploded values back to other metadata/geometry - percentiles = percentiles.drop( - columns=["values", "percentiles", "value"], errors="ignore" - ).merge(percentiles_explode, on="computation_id", how="left") - - # Concatenate back to original - dfs = pd.concat( - [df.loc[df["computation"] != "percentile"], percentiles] - ).drop(columns=["values", "percentiles"]) - else: - dfs = df - dfs["percentile"] = pd.NA - # Give min, max, median a percentile value - dfs.loc[dfs["computation"] == "maximum", "percentile"] = 100 - dfs.loc[dfs["computation"] == "minimum", "percentile"] = 0 - dfs.loc[dfs["computation"] == "median", "percentile"] = 50 - # Make sure numeric - dfs["percentile"] = pd.to_numeric(dfs["percentile"]) - - # Move percentile column - cols = dfs.columns.tolist() - cols.remove("percentile") - col_index = cols.index("value") + 1 - cols.insert(col_index, "percentile") - - return dfs[cols] - - else: - return df - - -def _run_sync( - make_coro: Callable[[], Awaitable[tuple[pd.DataFrame, httpx.Response]]], +def _finalize_ogc( + frame: pd.DataFrame, + response: httpx.Response, *, + properties: list[str] | None, + output_id: str, + convert_type: bool, service: str, -) -> tuple[pd.DataFrame, httpx.Response]: - """Drive an async OGC fetch to completion from synchronous 
code. - - Opens the service progress context and runs ``make_coro()`` through a - short-lived ``anyio`` blocking portal (a worker thread), so the - non-chunked getters work whether or not the caller is already inside an - event loop (Jupyter/async apps). The portal copies the calling context, - so the active progress reporter still reaches the sub-requests. - - Shared by the non-chunked fetch paths (:func:`get_stats_data`, - :func:`get_cql`); the chunked OGC getters drive their own portal - inside :meth:`chunking.ChunkedCall.resume`. - """ - with _progress.progress_context(service=service): - with start_blocking_portal() as portal: - try: - return portal.call(make_coro) - except httpx.TransportError as exc: - # The initial-request connection failure ``_paginate`` lets - # through raw; mid-pagination failures are already typed. - raise _network_error(OGC_API_URL, exc) from exc - - -def get_stats_data( - args: dict[str, Any], - service: str, - expand_percentiles: bool, - client: httpx.AsyncClient | None = None, + max_rows: int | None = None, ) -> tuple[pd.DataFrame, BaseMetadata]: - """ - Retrieves statistical data from a specified endpoint and returns it - as a pandas DataFrame with metadata. - - This function prepares request arguments, constructs API requests, - handles pagination, processes results, and formats output according - to the specified parameters. - - The stats path doesn't go through ``multi_value_chunked`` (its query - shape has no chunkable list axes), so it drives :func:`_paginate` - directly through an ``anyio`` blocking portal. The portal runs the - pagination loop in a short-lived worker thread, so this works whether - or not the caller is already inside an event loop. - - Parameters - ---------- - args : Dict[str, Any] - Dictionary of request arguments for the statistics service. - service : str - The statistics service type (for example, - "observationNormals" or "observationIntervals"). 
- expand_percentiles : bool - Determines whether the percentiles column is expanded so that - each percentile gets its own row in the returned dataframe. If - True and the user requests a computation_type other than - percentiles, a percentile column is still returned. - client : httpx.AsyncClient, optional - Caller-borrowed async client. ``None`` (default) opens a - temporary one inside the portal. Primarily a test seam. - - Returns - ------- - pd.DataFrame - A DataFrame containing the retrieved and processed statistical data. - BaseMetadata - A metadata object containing request information including URL and query time. + """Water-Data wrapper over :func:`engine._finalize_ogc`. - Raises - ------ - DataRetrievalError - The typed subclass for an HTTP error response (see :func:`_paginate`); - or :class:`~dataretrieval.exceptions.NetworkError` if the initial request - can't reach the service (timeout / DNS), the ``httpx`` exception chained - on ``__cause__``. + Injects the Water Data ``extra_id_cols`` and ``dialect`` so a direct + call (e.g. from ``get_cql``) orders synthetic id columns and coerces/ + sorts result columns identically to the typed getters. See + :func:`engine._finalize_ogc` for the full result-shaping contract. """ - - url = f"{STATISTICS_API_URL}/{service}" - req = httpx.Request( - method="GET", - url=url, - headers=_default_headers(), - params=args, + return engine._finalize_ogc( + frame, + response, + properties=properties, + output_id=output_id, + convert_type=convert_type, + service=service, + max_rows=max_rows, + extra_id_cols=_EXTRA_ID_COLS, + dialect=WATERDATA_DIALECT, ) - method = req.method - headers = req.headers - - def parse_response(resp: httpx.Response) -> tuple[pd.DataFrame, str | None]: - body = resp.json() - # Coerce falsy cursors ("", 0) to None so _paginate terminates. - # USGS uses "next": null at end-of-stream, but defensive coerce - # protects against any "" sentinel a future schema might use. 
- return _handle_stats_nesting(body, geopd=GEOPANDAS), body.get("next") or None - - async def follow_up(cursor: str, sess: httpx.AsyncClient) -> httpx.Response: - # Build a fresh params dict per page so the caller's ``args`` - # is never mutated. - return await sess.request( - method, url=url, params={**args, "next_token": cursor}, headers=headers - ) - - async def _run() -> tuple[pd.DataFrame, httpx.Response]: - return await _paginate( - req, - parse_response=parse_response, - follow_up=follow_up, - client=client, - ) - - df, response = _run_sync(_run, service=service) - - if expand_percentiles: - df = _expand_percentiles(df) - return df, BaseMetadata(response) def _check_profiles( @@ -1871,212 +285,42 @@ def _check_profiles( ) -_MONITORING_LOCATION_ID_RE = re.compile(r"[^-\s]+-[^-\s]+") - - -# Iterable-shaped params that ``_get_args`` must NOT push through -# ``_normalize_str_iterable`` (scalar non-string knobs are caught by runtime -# type, so only iterables with special handling need to be named here): -# - date-range params may contain ``pd.NaT``/None or interval strings -# - ``bbox``/``boundingBox`` are ``list[float]``, sometimes ``numpy.ndarray`` -# - ``get_peaks``'s int-valued filters (``water_year`` etc.) are ``list[int]`` -# - ``get_combined_metadata``'s ``thresholds`` is ``list[float]`` -_NO_NORMALIZE_PARAMS = _DATE_RANGE_PARAMS | { - "bbox", - "boundingBox", - "water_year", - "year", - "month", - "day", - "peak_since", - "thresholds", -} - - -def _normalize_str_iterable( - value: str | Iterable[str] | None, - param_name: str = "value", -) -> str | list[str] | None: - """Validate that ``value`` is None, a string, or an iterable of strings. - - Non-string iterables (``list``, ``tuple``, ``pandas.Series``, - ``pandas.Index``, ``numpy.ndarray``, generators) are materialized to a - ``list`` so downstream code that branches on ``isinstance(v, (list, - tuple))`` keeps working. 
``Mapping`` types are rejected because - iterating a mapping yields keys, not values. - - Parameters - ---------- - value : None, str, or iterable of str - param_name : str, optional - Used in error messages. Defaults to ``"value"``. - - Returns - ------- - None, str, or list of str - - Raises - ------ - TypeError - If the input isn't ``None``, ``str``, or a non-``Mapping`` - iterable; or if any iterable element isn't a string. - """ - if value is None: - return None - if isinstance(value, str): - return value - if isinstance(value, Mapping) or not isinstance(value, Iterable): - raise TypeError( - f"{param_name} must be a string or iterable of strings, " - f"not {type(value).__name__} (got {value!r})." - ) - values: list[str] = [] - for v in value: - if not isinstance(v, str): - raise TypeError( - f"{param_name} elements must be strings, " - f"not {type(v).__name__} (got {v!r})." - ) - values.append(v) - return values - - -def _as_str_list( - value: str | Iterable[str] | None, - param_name: str = "value", -) -> list[str] | None: - """Normalize ``value`` to ``list[str]`` (``None`` passes through). - - Wraps a bare ``str`` in a single-element list — so a later - ``",".join(...)`` doesn't iterate it character-by-character — and - materializes any other iterable via :func:`_normalize_str_iterable`. - """ - normalized = _normalize_str_iterable(value, param_name) - if isinstance(normalized, str): - return [normalized] - return normalized - - -def _check_monitoring_location_id( - monitoring_location_id: str | Iterable[str] | None, -) -> str | list[str] | None: - """Validate and normalize a ``monitoring_location_id`` value. - - Combines :func:`_normalize_str_iterable` with the AGENCY-ID format - check that is unique to ``monitoring_location_id`` (the OGC spec - requires a hyphen separator, e.g. ``USGS-01646500``). - - Parameters - ---------- - monitoring_location_id : None, str, or iterable of str - See :func:`_normalize_str_iterable`. 
Each string is additionally - required to match the AGENCY-ID hyphen-separated format. - - Returns - ------- - None, str, or list of str - - Raises - ------ - TypeError - If the input isn't ``None``, ``str``, or a non-``Mapping`` - iterable; or if any iterable element isn't a string. - ValueError - If any identifier doesn't contain a hyphen separator - (per the OGC API spec: AGENCY-ID format, e.g. ``USGS-01646500``). - """ - try: - value = _normalize_str_iterable( - monitoring_location_id, "monitoring_location_id" - ) - except TypeError as exc: - # Re-raise with the AGENCY-ID hint the generic helper doesn't carry. - raise TypeError( - f"{exc} Expected 'AGENCY-ID' format, e.g., 'USGS-01646500'." - ) from None - if value is None: - return None - for item in (value,) if isinstance(value, str) else value: - _check_id_format(item) - return value - - -def _check_id_format(value: str) -> None: - """Raise ``ValueError`` if ``value`` is not in ``AGENCY-ID`` format.""" - if not _MONITORING_LOCATION_ID_RE.fullmatch(value): - raise ValueError( - f"Invalid monitoring_location_id: {value!r}. " - f"Expected 'AGENCY-ID' format, e.g., 'USGS-01646500'." - ) - - -def _get_args( - local_vars: dict[str, Any], exclude: set[str] | None = None -) -> dict[str, Any]: - """ - Build the API-request kwargs dict from a getter's ``locals()``. - - Drops bookkeeping keys (``service``, ``output_id``, anything in - ``exclude``) and ``None``-valued kwargs, then normalizes the - remaining values: - - - ``monitoring_location_id`` is validated against the AGENCY-ID - format (per :func:`_check_monitoring_location_id`). - - ``properties`` is materialized to ``list[str]`` (a bare string - gets wrapped in a single-element list so downstream - ``",".join(properties)`` doesn't iterate per character). 
- - A non-string iterable in ``_NO_NORMALIZE_PARAMS`` (numeric params - such as ``water_year``, ``bbox``, ``thresholds``) is materialized - to a ``list`` with its element types preserved (no string - normalization), so the GET comma-join and the chunker — which test - ``list``/``tuple`` — handle it instead of ``str()``-ing the whole - array. - - Any other ``Iterable[str]`` (i.e. not in ``_NO_NORMALIZE_PARAMS``) - is materialized to ``list[str]`` via - :func:`_normalize_str_iterable` so downstream code that branches - on ``isinstance(v, (list, tuple))`` works for ``pandas.Series``, - ``numpy.ndarray``, generators, etc. - - Scalars and strings pass through unchanged. - - Parameters - ---------- - local_vars : dict[str, Any] - Dictionary of local variables, typically from ``locals()``. - exclude : set[str], optional - Additional keys to exclude from the resulting dictionary. - - Returns - ------- - dict[str, Any] - Filtered and normalized arguments for API requests. - """ - to_exclude = {"service", "output_id"} - if exclude: - to_exclude.update(exclude) - - args: dict[str, Any] = {} - for k, v in local_vars.items(): - if k in to_exclude or v is None: - continue - if k == "monitoring_location_id": - args[k] = _check_monitoring_location_id(v) - elif k == "properties": - args[k] = _as_str_list(v, k) - elif ( - k in _NO_NORMALIZE_PARAMS - and isinstance(v, Iterable) - and not isinstance(v, str) - ): - # Numeric params (water_year, bbox, thresholds, …) keep their - # element types — no string-normalization — but a non-string - # iterable (numpy array, pandas Series, generator) is materialized - # to a list so the GET comma-join and the chunker, which test - # ``list``/``tuple``, handle it instead of str()-ing the whole - # array. ``.tolist()`` yields native int/float; ``list()`` covers - # generators and other iterables. Scalars/strings fall through. 
- args[k] = v.tolist() if hasattr(v, "tolist") else list(v) - elif isinstance(v, str) or not isinstance(v, Iterable): - args[k] = v - else: - args[k] = _normalize_str_iterable(v, k) - return args +__all__ = [ + "BASE_URL", + "GEOPANDAS", + "OGC_API_URL", + "SAMPLES_URL", + "WATERDATA_DIALECT", + "_DATE_RANGE_PARAMS", + "_DURATION_RE", + "_EXTRA_ID_COLS", + "_NO_NORMALIZE_PARAMS", + "_OUTPUT_ID_BY_SERVICE", + "_arrange_cols", + "_as_str_list", + "_check_id_format", + "_check_monitoring_location_id", + "_check_ogc_requests", + "_check_profiles", + "_construct_api_requests", + "_construct_cql_request", + "_deal_with_empty", + "_default_headers", + "_error_body", + "_finalize_ogc", + "_format_api_dates", + "_get_args", + "_get_resp_data", + "_next_req_url", + "_normalize_str_iterable", + "_paginate", + "_paginated_failure_message", + "_parse_retry_after", + "_raise_for_non_200", + "_row_cap", + "_run_sync", + "_switch_properties_id", + "_to_snake_case", + "_walk_pages", + "get_ogc_data", +] diff --git a/demos/USGS_NGWMN_Examples.ipynb b/demos/USGS_NGWMN_Examples.ipynb new file mode 100644 index 00000000..98e6e227 --- /dev/null +++ b/demos/USGS_NGWMN_Examples.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# National Ground-Water Monitoring Network (NGWMN)\n", + "\n", + "The [National Ground-Water Monitoring Network](https://cida.usgs.gov/ngwmn/) (NGWMN)\n", + "brings groundwater data from many state, federal, and local agencies into a single\n", + "location. 
USGS exposes it through a dedicated OGC API\n", + "(`https://api.waterdata.usgs.gov/ngwmn/ogcapi`), which `dataretrieval` wraps in the\n", + "`dataretrieval.ngwmn` module — a sibling of `dataretrieval.waterdata` built on the\n", + "same shared OGC engine, so chunking, pagination, and result shaping behave the same.\n", + "\n", + "There are five getters:\n", + "\n", + "| Function | Description |\n", + "| --- | --- |\n", + "| `get_sites` | Monitoring-location (well) metadata |\n", + "| `get_water_level` | Water-level observations |\n", + "| `get_lithology` | Lithology (geologic material) logs |\n", + "| `get_well_construction` | Well-construction records |\n", + "| `get_providers` | Contributing data providers |\n", + "\n", + "Unlike the main Water Data collections, NGWMN aggregates locations from many\n", + "agencies, so `monitoring_location_id` values use agency prefixes besides `USGS-`\n", + "(e.g. `MBMG-702934`, `AKDNR-535134236016630`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "from dataretrieval import ngwmn" + ] + }, + { + "cell_type": "markdown", + "id": "2", + "metadata": {}, + "source": [ + "## Providers\n", + "\n", + "List the organizations contributing data, optionally filtered by state." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": {}, + "outputs": [], + "source": [ + "providers, md = ngwmn.get_providers(state=\"WI\")\n", + "print(f\"{len(providers)} providers in WI\")\n", + "providers.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4", + "metadata": {}, + "source": [ + "## Sites\n", + "\n", + "`get_sites` returns well metadata. Sites carry geometry by default, so the result is a\n", + "`GeoDataFrame`; pass `skip_geometry=True` to drop it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "sites, md = ngwmn.get_sites(state=\"Wisconsin\")\n", + "print(f\"{len(sites)} NGWMN sites in Wisconsin\")\n", + "sites[[\"monitoring_location_id\", \"monitoring_location_name\", \"national_aquifer_description\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": {}, + "source": [ + "## Water levels\n", + "\n", + "`get_water_level` returns the observations for one or more sites. A two-element\n", + "`datetime=[start, end]` restricts the record to a time window; a list of\n", + "`monitoring_location_id`s fans out across sites and is unioned." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "site = \"USGS-272838082142201\"\n", + "wl, md = ngwmn.get_water_level(monitoring_location_id=site)\n", + "print(f\"{len(wl)} water-level observations at {site}\")\n", + "\n", + "wl[\"sample_time\"] = pd.to_datetime(wl[\"sample_time\"], errors=\"coerce\", utc=True)\n", + "wl = wl.dropna(subset=[\"sample_time\"]).sort_values(\"sample_time\")\n", + "depth = pd.to_numeric(wl[\"water_depth_below_land_surface_ft\"], errors=\"coerce\")\n", + "\n", + "fig, ax = plt.subplots(figsize=(9, 4))\n", + "ax.plot(wl[\"sample_time\"], depth, lw=0.8)\n", + "ax.invert_yaxis() # depth increases downward\n", + "ax.set(xlabel=\"Date\", ylabel=\"Depth to water (ft below land surface)\",\n", + " title=f\"NGWMN water levels \\u2014 {site}\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": {}, + "source": [ + "Restrict to a date range, or query several sites at once (they fan out and\n", + "union):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "windowed, md = 
ngwmn.get_water_level(\n", + " monitoring_location_id=site, datetime=[\"2022-01-01\", \"2024-01-01\"]\n", + ")\n", + "print(f\"{len(windowed)} observations in 2022\\u20132024\")\n", + "\n", + "multi, md = ngwmn.get_water_level(\n", + " monitoring_location_id=[\"USGS-272838082142201\", \"USGS-404159100494601\"]\n", + ")\n", + "print(f\"{multi['monitoring_location_id'].nunique()} sites, {len(multi)} observations\")" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": {}, + "source": [ + "## Well construction and lithology\n", + "\n", + "Construction records describe a well's physical build-out; lithology logs describe the\n", + "geologic materials with depth." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "construction, md = ngwmn.get_well_construction(monitoring_location_id=site)\n", + "construction[[\"monitoring_location_obs_number\", \"type\", \"material\", \"depth_from\", \"depth_to\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "lithology, md = ngwmn.get_lithology(monitoring_location_id=\"AKDNR-535134236016630\")\n", + "lithology[[\"lithology_depth_from\", \"lithology_depth_to\", \"lithology_description\"]].head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demos/USGS_WaterData_ContinuousData_Examples.ipynb b/demos/USGS_WaterData_ContinuousData_Examples.ipynb index d843f336..8b2bafc5 100644 --- a/demos/USGS_WaterData_ContinuousData_Examples.ipynb +++ b/demos/USGS_WaterData_ContinuousData_Examples.ipynb @@ -125,7 +125,7 @@ "source": [ "import time\n", "\n", - "from dataretrieval.waterdata.chunking import ChunkInterrupted\n", + "from dataretrieval import ChunkInterrupted\n", "\n", "try:\n", " 
sensor_data, _ = waterdata.get_continuous(\n", @@ -236,7 +236,7 @@ "## More help\n", "\n", "- Documentation: \n", - "- Chunking and resume internals: `dataretrieval.waterdata.chunking`\n", + "- Chunking and resume internals: `dataretrieval.ogc.chunking`\n", "- Issues / questions: \n", "- Equivalent R article: [Continuous Data](https://doi-usgs.github.io/dataRetrieval/articles/continuous_pr.html)" ] diff --git a/docs/source/examples/USGS_NGWMN_Examples.nblink b/docs/source/examples/USGS_NGWMN_Examples.nblink new file mode 100644 index 00000000..1a5e6127 --- /dev/null +++ b/docs/source/examples/USGS_NGWMN_Examples.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../demos/USGS_NGWMN_Examples.ipynb" +} diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 91d7bd1f..e7c2deb8 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -18,7 +18,7 @@ covers a basic introduction to module functions and usage. USGS Water Data API vignettes ----------------------------- These notebooks are Python ports of the new USGS Water Data API vignettes from -the R `dataRetrieval`_ package. Each introduces a family of ``waterdata`` +the R `dataRetrieval`_ package. Each introduces a family of Water Data API functions and is executed against the live USGS Water Data API. .. _dataRetrieval: https://doi-usgs.github.io/dataRetrieval/ @@ -31,6 +31,7 @@ functions and is executed against the live USGS Water Data API. USGS_WaterData_DailyStatistics_Examples USGS_WaterData_ContinuousData_Examples USGS_WaterData_ReferenceLists_Examples + USGS_NGWMN_Examples Simple uses of the ``dataretrieval`` package -------------------------------------------- diff --git a/docs/source/reference/exceptions.rst b/docs/source/reference/exceptions.rst index 1d8de47e..4514ac43 100644 --- a/docs/source/reference/exceptions.rst +++ b/docs/source/reference/exceptions.rst @@ -6,3 +6,22 @@ dataretrieval.exceptions .. 
automodule:: dataretrieval.exceptions :members: :show-inheritance: + +Resumable chunk interruptions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These are raised when a transparently-chunked request is interrupted +mid-stream; the completed work is preserved and ``exc.call.resume()`` continues +it. They are defined in ``dataretrieval.ogc.chunking`` (they carry pandas/httpx +state) but are importable from the top level, e.g. +``from dataretrieval import ChunkInterrupted``. + +.. autoclass:: dataretrieval.ChunkInterrupted + :members: + :show-inheritance: + +.. autoclass:: dataretrieval.QuotaExhausted + :show-inheritance: + +.. autoclass:: dataretrieval.ServiceInterrupted + :show-inheritance: diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 43def275..48947ff8 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -9,6 +9,7 @@ API reference exceptions nadp + ngwmn nldi nwis streamstats diff --git a/docs/source/reference/ngwmn.rst b/docs/source/reference/ngwmn.rst new file mode 100644 index 00000000..90668f3b --- /dev/null +++ b/docs/source/reference/ngwmn.rst @@ -0,0 +1,8 @@ +.. _ngwmn: + +dataretrieval.ngwmn +------------------- + +.. automodule:: dataretrieval.ngwmn + :members: + :special-members: diff --git a/docs/source/userguide/errors.rst b/docs/source/userguide/errors.rst index cd81f546..e2dc3ef1 100644 --- a/docs/source/userguide/errors.rst +++ b/docs/source/userguide/errors.rst @@ -82,8 +82,8 @@ condition clears -- only the unfinished sub-requests are re-issued. .. 
code-block:: python import time + from dataretrieval import ChunkInterrupted from dataretrieval.waterdata import get_daily - from dataretrieval.waterdata.chunking import ChunkInterrupted try: df, md = get_daily(monitoring_location_id=long_list_of_sites) diff --git a/tests/ngwmn_test.py b/tests/ngwmn_test.py new file mode 100644 index 00000000..cd20daaa --- /dev/null +++ b/tests/ngwmn_test.py @@ -0,0 +1,161 @@ +"""Live tests for the NGWMN OGC getters (``dataretrieval.ngwmn``). + +These hit the live NGWMN OGC API (``api.waterdata.usgs.gov/ngwmn/ogcapi``), +mirroring the integration-test style of ``waterdata_test.py``. The +``flaky`` marker only retries transient transport errors, so a real +behavior change still fails on the first run. +""" + +import sys + +import pytest +from pandas import DataFrame + +if sys.version_info < (3, 10): + pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) + +from dataretrieval import ngwmn +from dataretrieval.utils import BaseMetadata + +pytestmark = pytest.mark.flaky( + reruns=2, + reruns_delay=5, + only_rerun=[ + r"(?:RateLimited|RuntimeError):\s*(?:429|5\d\d):", + r"Connect(ion)?Error", + r"ReadTimeout|ConnectTimeout|Timeout", + ], +) + +# A site with water-level, construction, and lithology records (per the R +# dataRetrieval NGWMN examples), plus a non-USGS-agency id to exercise the +# multi-agency identifier format NGWMN uses. +_SITE = "USGS-272838082142201" +_LITH_SITE = "AKDNR-535134236016630" + + +def test_get_sites(): + df, md = ngwmn.get_sites(state="Wisconsin", limit=10) + assert isinstance(df, DataFrame) + assert isinstance(md, BaseMetadata) + assert len(df) > 0 + assert "monitoring_location_id" in df.columns + # All returned sites are in the requested state. + assert df["state_name"].dropna().eq("Wisconsin").all() + # Sites carry geometry by default. 
+ assert "geometry" in df.columns + assert "ngwmn/ogcapi/collections/sites" in str(md.url) + + +def test_get_sites_skip_geometry(): + df, _ = ngwmn.get_sites(monitoring_location_id=_SITE, skip_geometry=True) + assert isinstance(df, DataFrame) + assert "geometry" not in df.columns + + +def test_get_water_level(): + df, md = ngwmn.get_water_level(monitoring_location_id=_SITE) + assert isinstance(df, DataFrame) + assert len(df) > 0 + assert "sample_time" in df.columns + assert (df["monitoring_location_id"] == _SITE).all() + + +def test_get_water_level_datetime_subsets(): + full, _ = ngwmn.get_water_level(monitoring_location_id=_SITE) + windowed, _ = ngwmn.get_water_level( + monitoring_location_id=_SITE, datetime=["2022-01-01", "2024-01-01"] + ) + # A bounded window returns a strict subset of the full record. + assert 0 < len(windowed) < len(full) + + +def test_get_providers(): + df, md = ngwmn.get_providers(state="WI") + assert isinstance(df, DataFrame) + assert len(df) > 0 + assert {"agency_code", "organization_type", "state"}.issubset(df.columns) + # Providers have no geometry. 
+ assert "geometry" not in df.columns + + +def test_get_sites_state_accepts_name_postal_or_fips(): + """The single ``state`` parameter accepts a full name, postal code, or FIPS + code; ``_resolve_state`` normalizes all three to the full ``state_name`` the + ``sites`` collection queries on, so every encoding returns the same sites.""" + by_name, _ = ngwmn.get_sites(state="Wisconsin", skip_geometry=True) + by_postal, _ = ngwmn.get_sites(state="WI", skip_geometry=True) + by_fips, _ = ngwmn.get_sites(state="55", skip_geometry=True) + assert len(by_name) > 0 + ids = set(by_name["monitoring_location_id"]) + assert set(by_postal["monitoring_location_id"]) == ids + assert set(by_fips["monitoring_location_id"]) == ids + + +def test_get_providers_state_accepts_name_postal_or_fips(): + """``get_providers`` likewise normalizes any encoding to the uppercase + postal code the ``providers`` collection queries on.""" + by_postal, _ = ngwmn.get_providers(state="WI") + by_name, _ = ngwmn.get_providers(state="Wisconsin") + by_fips, _ = ngwmn.get_providers(state="55") + assert len(by_postal) > 0 + agencies = set(by_postal["agency_code"]) + assert set(by_name["agency_code"]) == agencies + assert set(by_fips["agency_code"]) == agencies + + +def test_state_queryables_still_diverge_upstream(): + """The NGWMN ``sites`` and ``providers`` collections expose DIFFERENT state + queryables (``sites`` -> ``state_name`` full name; ``providers`` -> + ``state`` 2-letter code). The single-``state`` shim in + ``ngwmn._resolve_state`` exists ONLY to paper over that asymmetry. + + If this test fails, the upstream API has unified the two queryables and the + shim (``_resolve_state``) can be removed in favor of a single pass-through + parameter. 
+ """ + import httpx + + from dataretrieval.ngwmn import NGWMN_OGC_API_URL + from dataretrieval.ogc.engine import _default_headers + + headers = _default_headers() + + def queryables(collection): + resp = httpx.get( + f"{NGWMN_OGC_API_URL}/collections/{collection}/queryables", + headers=headers, + timeout=60, + ) + resp.raise_for_status() + return set(resp.json().get("properties") or {}) + + sites_q = queryables("sites") + providers_q = queryables("providers") + assert "state_name" in sites_q and "state" not in sites_q, sites_q + assert "state" in providers_q and "state_name" not in providers_q, providers_q + + +def test_get_lithology(): + df, _ = ngwmn.get_lithology(monitoring_location_id=_LITH_SITE) + assert isinstance(df, DataFrame) + assert len(df) > 0 + assert (df["monitoring_location_id"] == _LITH_SITE).all() + + +def test_get_well_construction(): + df, _ = ngwmn.get_well_construction(monitoring_location_id=_SITE) + assert isinstance(df, DataFrame) + assert len(df) > 0 + assert (df["monitoring_location_id"] == _SITE).all() + + +def test_multi_site_chunks_and_unions(): + """A multi-value ``monitoring_location_id`` fans out and unions the + per-site results (the comma-join multi-value path), returning at least + the single-site total.""" + one, _ = ngwmn.get_water_level(monitoring_location_id=_SITE) + many, _ = ngwmn.get_water_level( + monitoring_location_id=[_SITE, "USGS-404159100494601"] + ) + assert len(many) >= len(one) diff --git a/tests/utils_test.py b/tests/utils_test.py index d90821ae..81f82f45 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -173,7 +173,7 @@ def test_waterdata_exceptions_share_the_root(self): ``except`` clause spans the legacy and waterdata subsystems, and they slot under the shared family bases (``HTTPError`` / ``TransientError`` / ``RequestTooLarge``).""" - from dataretrieval.waterdata.chunking import ( + from dataretrieval.ogc.chunking import ( ChunkInterrupted, RateLimited, ServiceUnavailable, @@ -195,6 +195,25 @@ def 
test_base_exported_at_top_level(self): assert dataretrieval.DataRetrievalError is exceptions.DataRetrievalError + def test_chunk_interruptions_exported_at_top_level(self): + """The resumable chunk-interruption exceptions are reachable from the + top level (``from dataretrieval import ChunkInterrupted``) instead of + only the internal ``dataretrieval.ogc.chunking`` module, and resolve to + the same classes.""" + import dataretrieval + from dataretrieval.ogc import chunking + + for name in ("ChunkInterrupted", "QuotaExhausted", "ServiceInterrupted"): + assert getattr(dataretrieval, name) is getattr(chunking, name) + assert name in dataretrieval.__all__ + assert issubclass(dataretrieval.QuotaExhausted, dataretrieval.ChunkInterrupted) + assert issubclass( + dataretrieval.ServiceInterrupted, dataretrieval.ChunkInterrupted + ) + assert issubclass( + dataretrieval.ChunkInterrupted, dataretrieval.DataRetrievalError + ) + class Test_BaseMetadata: """Tests of BaseMetadata""" @@ -309,3 +328,53 @@ def test_existing_datetime_column_not_overwritten(self): ) df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"].tolist() == ["preexisting"] + + +class Test_to_state: + """Tests of the shared state normalizer in ``codes.states``.""" + + def test_accepts_every_encoding(self): + from dataretrieval.codes.states import to_state + + # name (any case), postal (any case), bare FIPS, and prefixed FIPS all + # resolve to the same canonical full name. + for value in ("Wisconsin", "wisconsin", "WI", "wi", "55", "US:55"): + assert to_state(value) == "Wisconsin" + + def test_converts_to_each_representation(self): + from dataretrieval.codes.states import to_state + + assert to_state("WI", "name") == "Wisconsin" + assert to_state("Wisconsin", "postal") == "WI" + assert to_state("Wisconsin", "fips") == "55" + assert to_state("Wisconsin", "fips_us") == "US:55" + # Conversion is independent of the input encoding. 
+ assert to_state("55", "postal") == "WI" + assert to_state("wi", "fips_us") == "US:55" + + def test_rejects_unrecognized_state(self): + from dataretrieval.codes.states import to_state + + for bad in ("XX", "99", "US:99", "Wisconson"): + with pytest.raises(ValueError, match="not a recognized US state"): + to_state(bad) + + def test_rejects_unknown_target(self): + from dataretrieval.codes.states import to_state + + with pytest.raises(ValueError, match="to must be"): + to_state("WI", "zipcode") + + def test_resolves_an_iterable_element_wise(self): + from dataretrieval.codes.states import to_state + + # An iterable of mixed encodings returns a list, converted element-wise. + assert to_state(["WI", "Minnesota", "39"]) == [ + "Wisconsin", + "Minnesota", + "Ohio", + ] + assert to_state(["WI", "CA"], "fips_us") == ["US:55", "US:06"] + # A bad element fails the whole call (fail-fast). + with pytest.raises(ValueError, match="not a recognized US state"): + to_state(["WI", "XX"]) diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py index 37e9b999..d024b704 100644 --- a/tests/waterdata_chunking_test.py +++ b/tests/waterdata_chunking_test.py @@ -1,4 +1,4 @@ -"""Tests for ``dataretrieval.waterdata.chunking``. +"""Tests for ``dataretrieval.ogc.chunking``. 
These tests exercise the joint planner with a fake ``build_request`` whose URL byte length is a deterministic function of its inputs: @@ -17,6 +17,7 @@ import asyncio import concurrent.futures +import contextvars import datetime import http.server import sys @@ -35,10 +36,8 @@ pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) from dataretrieval.exceptions import DataRetrievalError -from dataretrieval.utils import HTTPX_DEFAULTS -from dataretrieval.waterdata import chunking as _chunking -from dataretrieval.waterdata import utils as _utils -from dataretrieval.waterdata.chunking import ( +from dataretrieval.ogc import chunking as _chunking +from dataretrieval.ogc.chunking import ( _LIST_SEP, _NEVER_CHUNK, _OR_SEP, @@ -63,6 +62,8 @@ get_active_client, multi_value_chunked, ) +from dataretrieval.utils import HTTPX_DEFAULTS +from dataretrieval.waterdata import utils as _utils from dataretrieval.waterdata.utils import _DATE_RANGE_PARAMS, _construct_api_requests @@ -350,7 +351,7 @@ async def fetch(args): def test_multi_value_chunked_lazy_url_limit(monkeypatch): - """``url_limit=None`` → resolve chunking._WATERDATA_URL_BYTE_LIMIT at call + """``url_limit=None`` → resolve chunking._OGC_URL_BYTE_LIMIT at call time, so tests that patch the constant affect this decorator too.""" calls = [] @@ -361,7 +362,7 @@ async def fetch(args): elapsed=datetime.timedelta(seconds=0.1), headers={} ) - monkeypatch.setattr(_chunking, "_WATERDATA_URL_BYTE_LIMIT", 240) + monkeypatch.setattr(_chunking, "_OGC_URL_BYTE_LIMIT", 240) # 4 sites of 10 chars → exceeds 240 → planner splits. 
fetch({"sites": ["S" * 10 + str(i) for i in range(4)]}) assert len(calls) > 1, "patched constant should drive chunking" @@ -659,6 +660,55 @@ async def fetch(args): assert sorted(df_a["id"].tolist()) == sorted(sites) +def test_resume_rebuilds_in_captured_context(): + """Regression: sub-requests are rebuilt by reading ambient ContextVars + (the engine threads base URL / dialect / row cap that way). A + ``call.resume()`` fired AFTER the originating ``with`` block exits — + the documented recovery for a mid-stream 429 — must still observe the + values active when the call was *created*, not the process defaults. + ``ChunkedCall`` snapshots the context at construction and runs every + drive inside it; without that snapshot a resumed NGWMN call would + rebuild its sub-requests against the wrong (default Water Data) base.""" + var = contextvars.ContextVar("ctx_probe", default="DEFAULT") + observed: list[str] = [] + + state = {"calls": 0, "tripped": False} + + async def fetch(args): + state["calls"] += 1 + # The value visible at (re)build time — what _construct_api_requests + # would read from _ogc_base_url_var / _dialect_var in production. + observed.append(var.get()) + if state["calls"] == 3 and not state["tripped"]: + state["tripped"] = True + raise RateLimited("429: Too many requests made.") + sites = list(args["sites"]) + return (pd.DataFrame({"id": sites}), _quota_response(500)) + + sites = ["S" * 10 + str(i) for i in range(16)] + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + + # Create + drive the call INSIDE the context, so the snapshot captures "IN". + token = var.set("IN") + try: + with pytest.raises(QuotaExhausted) as excinfo: + decorated({"sites": sites}) + finally: + var.reset(token) + + # The originating context has exited — the bare var is back to default. + assert var.get() == "DEFAULT" + assert 0 < excinfo.value.completed_chunks < excinfo.value.total_chunks + + # Resume OUTSIDE the context. 
Every rebuilt sub-request must still see + # "IN" (the captured snapshot), never the leaked "DEFAULT". + observed.clear() + df, _ = excinfo.value.call.resume() + assert observed, "resume issued no sub-requests" + assert set(observed) == {"IN"}, observed + assert sorted(df["id"].tolist()) == sorted(sites) + + def test_chunker_passes_through_non_429_runtime_error(): """A non-429 ``RuntimeError`` (e.g. a 500) is not a quota signal; it must propagate unchanged so callers see the real cause.""" @@ -994,7 +1044,7 @@ def test_combine_chunk_responses_returns_independent_headers(): def test_paginate_terminates_on_empty_string_cursor(): """``_paginate``'s loop predicate is ``while cursor is not None``. - Parse-response wrappers in ``_walk_pages`` / ``get_stats_data`` + Parse-response wrappers in ``_walk_pages`` / ``stats.get_data`` coerce falsy non-None values to None so an empty-string next- cursor (a real-but-unusual end-of-stream sentinel some pagination APIs use) doesn't trap us in an infinite ``follow_up('')`` loop.""" diff --git a/tests/waterdata_filters_test.py b/tests/waterdata_filters_test.py index a447cada..b87ec272 100644 --- a/tests/waterdata_filters_test.py +++ b/tests/waterdata_filters_test.py @@ -6,11 +6,12 @@ import pandas as pd import pytest -from dataretrieval.waterdata import get_continuous -from dataretrieval.waterdata.filters import ( +from dataretrieval.ogc.filters import ( _check_numeric_filter_pitfall, + _quote_cql_str, _split_top_level_or, ) +from dataretrieval.waterdata import get_continuous from dataretrieval.waterdata.utils import _construct_api_requests @@ -32,6 +33,14 @@ def _fake_response(url="https://example.test", elapsed_ms=1): ) +def test_quote_cql_str_doubles_embedded_quotes(): + """The shared CQL-text escaper doubles ``'`` and leaves other input + untouched (the contract ``waterdata.ratings._build_filter`` relies on).""" + assert _quote_cql_str("O'Brien") == "O''Brien" + assert _quote_cql_str("USGS-01646500") == "USGS-01646500" + assert 
_quote_cql_str("a'b'c") == "a''b''c" + + def test_construct_filter_passthrough(): """`filter` is forwarded verbatim as a query parameter.""" expr = ( @@ -163,11 +172,11 @@ async def fake_walk_pages(*, geopd, req): with ( mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", + "dataretrieval.ogc.engine._construct_api_requests", side_effect=_filter_size_aware_build, ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", + "dataretrieval.ogc.engine._walk_pages", side_effect=fake_walk_pages, ), ): @@ -202,11 +211,11 @@ async def fake_walk_pages(*_args, **_kwargs): with ( mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", + "dataretrieval.ogc.engine._construct_api_requests", side_effect=_filter_size_aware_build, ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", + "dataretrieval.ogc.engine._walk_pages", side_effect=fake_walk_pages, ), ): @@ -249,11 +258,11 @@ async def fake_walk_pages(*_args, **_kwargs): with ( mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", + "dataretrieval.ogc.engine._construct_api_requests", side_effect=_filter_size_aware_build, ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", + "dataretrieval.ogc.engine._walk_pages", side_effect=fake_walk_pages, ), ): @@ -281,11 +290,11 @@ def fake_construct_api_requests(**kwargs): with ( mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", + "dataretrieval.ogc.engine._construct_api_requests", side_effect=fake_construct_api_requests, ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", + "dataretrieval.ogc.engine._walk_pages", new=mock.AsyncMock( return_value=( pd.DataFrame({"id": ["row-1"], "value": [1]}), @@ -426,7 +435,7 @@ def test_get_continuous_surfaces_pitfall_to_caller(): """End-to-end: the check runs at the ``get_continuous`` boundary, not as a deep internal-only protection, so callers see the error before any HTTP traffic.""" - with 
mock.patch("dataretrieval.waterdata.utils._construct_api_requests") as build: + with mock.patch("dataretrieval.ogc.engine._construct_api_requests") as build: with pytest.raises(ValueError, match="lexicographic"): get_continuous( monitoring_location_id="USGS-02238500", diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 08f6ca26..0ba801e1 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -17,13 +17,13 @@ import pandas as pd import pytest -from dataretrieval.waterdata import _progress -from dataretrieval.waterdata._progress import ( +from dataretrieval.ogc import progress as _progress +from dataretrieval.ogc.chunking import ChunkedCall, ChunkPlan +from dataretrieval.ogc.progress import ( ProgressReporter, current, progress_context, ) -from dataretrieval.waterdata.chunking import ChunkedCall, ChunkPlan from dataretrieval.waterdata.utils import _paginate, _walk_pages diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 34ccf4f2..a68033e3 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -11,6 +11,7 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) +from dataretrieval.ogc.engine import _dialect from dataretrieval.waterdata import ( get_channel, get_combined_metadata, @@ -31,6 +32,7 @@ get_time_series_metadata, ) from dataretrieval.waterdata.utils import ( + WATERDATA_DIALECT, _check_monitoring_location_id, _check_profiles, _construct_api_requests, @@ -60,6 +62,19 @@ ) +@pytest.fixture(autouse=True) +def _activate_waterdata_dialect(): + """Make the Water Data OGC dialect ambient for this module. + + The dialect (monitoring-locations -> POST/CQL2; daily -> date-only time + args) is normally set by ``get_ogc_data`` per call. The direct + ``_construct_api_requests`` unit tests here bypass it, so activate the + dialect module-wide so they exercise the real Water Data behavior. 
+ """ + with _dialect(WATERDATA_DIALECT): + yield + + def mock_request(httpx_mock, request_url, file_path): """Mock request code""" with open(file_path) as text: @@ -136,7 +151,7 @@ def test_get_samples_raises_typed_error_on_429(httpx_mock): """Non-200 from the Samples endpoint now raises the module's typed error (RateLimited on 429) — consistent with the OGC/stats path — instead of a bare httpx.HTTPStatusError.""" - from dataretrieval.waterdata.chunking import RateLimited + from dataretrieval.ogc.chunking import RateLimited httpx_mock.add_response(status_code=429, headers={"Retry-After": "30"}) with pytest.raises(RateLimited): @@ -149,7 +164,7 @@ def test_get_samples_raises_typed_error_on_429(httpx_mock): def test_get_samples_summary_raises_typed_error_on_5xx(httpx_mock): """A 5xx from the Samples summary endpoint raises ServiceUnavailable.""" - from dataretrieval.waterdata.chunking import ServiceUnavailable + from dataretrieval.ogc.chunking import ServiceUnavailable httpx_mock.add_response(status_code=503) with pytest.raises(ServiceUnavailable): diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 10733773..f68e1f7e 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -8,9 +8,13 @@ import pandas as pd import pytest +import dataretrieval.ogc.engine as _engine_module +import dataretrieval.waterdata.stats as _stats_module import dataretrieval.waterdata.utils as _utils_module from dataretrieval.exceptions import DataRetrievalError, HTTPError, TransientError -from dataretrieval.waterdata.chunking import RateLimited, ServiceUnavailable +from dataretrieval.ogc.chunking import RateLimited, ServiceUnavailable +from dataretrieval.waterdata import get_stats_date_range, get_stats_por +from dataretrieval.waterdata.stats import _handle_nesting, get_data from dataretrieval.waterdata.utils import ( OGC_API_URL, _arrange_cols, @@ -20,13 +24,12 @@ _format_api_dates, _get_args, _get_resp_data, - _handle_stats_nesting, 
_next_req_url, _parse_retry_after, _raise_for_non_200, _row_cap, + _to_snake_case, _walk_pages, - get_stats_data, ) _LOGGER_NAME = _utils_module.__name__ @@ -332,7 +335,7 @@ def test_get_resp_data_handles_missing_features_key(): """Regression: a 200 with ``numberReturned > 0`` but no ``features`` key (real schema-drift shape) used to crash ``_get_resp_data`` with ``KeyError`` — wrapped downstream by - ``_paginate`` as a generic transport error. ``_handle_stats_nesting`` + ``_paginate`` as a generic transport error. ``_handle_nesting`` was already hardened against this; ``_get_resp_data`` now mirrors that defensiveness and returns an empty frame instead.""" resp = mock.Mock() @@ -342,6 +345,32 @@ def test_get_resp_data_handles_missing_features_key(): assert isinstance(df, pd.DataFrame) +def test_next_req_url_follows_link_without_number_returned(): + """The NGWMN OGC API omits ``numberReturned`` from its page envelope, so + ``_next_req_url`` keys the ``next`` link off ``features`` (mirroring + ``_get_resp_data``) rather than that count -- otherwise a page that carries + features but no count stops pagination after page 1 and silently truncates + every multi-page result. A page that carries features still follows its + ``next`` link even when ``numberReturned`` is absent.""" + resp = mock.MagicMock() + resp.url = httpx.URL("https://example.com/page1") + body = { + # NGWMN shape: features present, NO numberReturned key. 
+ "features": [{"id": "1"}], + "links": [{"rel": "next", "href": "https://example.com/page2"}], + } + assert _next_req_url(resp, body=body) == "https://example.com/page2" + + +def test_next_req_url_stops_when_no_features(): + """A page with no features ends pagination regardless of any stray + ``next`` link (and regardless of ``numberReturned``).""" + resp = mock.MagicMock() + resp.url = httpx.URL("https://example.com/page1") + body = {"features": [], "links": [{"rel": "next", "href": "https://x/2"}]} + assert _next_req_url(resp, body=body) is None + + def test_walk_pages_does_not_mutate_initial_response(): """The aggregated response returned from ``_walk_pages`` is built via ``_aggregate_paginated_response``, which returns a fresh copy. @@ -413,15 +442,15 @@ def _stats_initial_ok(): return resp -def _run_get_stats_data_with_failure(failure_resp_or_exc, monkeypatch): - """Exercise get_stats_data where the initial response succeeds and the +def _run_get_data_with_failure(failure_resp_or_exc, monkeypatch): + """Exercise get_data where the initial response succeeds and the paginated follow-up fails as given. Mirrors _walk_pages_with_failure. 
- `monkeypatch` stubs ``_handle_stats_nesting`` so the synthetic minimal + `monkeypatch` stubs ``_handle_nesting`` so the synthetic minimal response body doesn't need to parse — these tests only assert on the pagination loop's error surfacing.""" monkeypatch.setattr( - _utils_module, - "_handle_stats_nesting", + _stats_module, + "_handle_nesting", mock.MagicMock(return_value=pd.DataFrame()), ) @@ -432,7 +461,7 @@ def _run_get_stats_data_with_failure(failure_resp_or_exc, monkeypatch): else: mock_client.request.return_value = failure_resp_or_exc - return get_stats_data( + return get_data( args={"monitoring_location_id": "USGS-1"}, service="observationNormals", expand_percentiles=False, @@ -440,14 +469,14 @@ def _run_get_stats_data_with_failure(failure_resp_or_exc, monkeypatch): ) -def test_get_stats_data_raises_on_mid_pagination_failure(monkeypatch): - """Wiring smoke: ``get_stats_data`` and ``_walk_pages`` share the +def test_get_data_raises_on_mid_pagination_failure(monkeypatch): + """Wiring smoke: ``get_data`` and ``_walk_pages`` share the same ``_paginate`` strategy helper, so error-routing behaviour is exercised by the ``_walk_pages`` triplet above. This single - ``get_stats_data`` mid-pagination case proves the stats-specific + ``get_data`` mid-pagination case proves the stats-specific follow-up callback is wired into ``_paginate`` correctly.""" with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: - _run_get_stats_data_with_failure( + _run_get_data_with_failure( httpx.ConnectError("stats-boom"), monkeypatch, ) @@ -456,7 +485,7 @@ def test_get_stats_data_raises_on_mid_pagination_failure(monkeypatch): assert "stats-boom" in str(excinfo.value) -def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): +def test_get_data_warning_includes_next_token(caplog, monkeypatch): """The pagination-failure warning includes the next_token so operators can identify which page in the sequence failed. 
(Addresses Copilot's PR #273 review note: the base URL alone drops cursor context.)""" @@ -470,17 +499,18 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): } with pytest.raises(DataRetrievalError): - _run_get_stats_data_with_failure(page2_503, monkeypatch) + _run_get_data_with_failure(page2_503, monkeypatch) warnings_ = [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] # The initial response from _stats_initial_ok carries next=tok2. assert any("tok2" in m for m in warnings_), warnings_ -def test_handle_stats_nesting_tolerates_missing_drop_columns(): - """If the upstream stats response shape ever changes such that one of - the columns we try to drop ("type", "properties.data") is absent, the - function should still return a DataFrame instead of raising KeyError. +def test_handle_nesting_tolerates_missing_drop_columns(): + """If the upstream stats response shape ever changes such that the nested + ``data`` column ``_handle_nesting`` drops is absent, the function should + still return a DataFrame instead of raising KeyError (the drop uses + ``errors="ignore"``). """ body = { "next": None, @@ -501,38 +531,40 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns(): ], } - df = _handle_stats_nesting(body, geopd=False) + df = _handle_nesting(body, geopd=False) assert len(df) == 1 assert df["monitoring_location_id"].iloc[0] == "USGS-12345" -def test_handle_stats_nesting_returns_empty_on_empty_features(): +def test_handle_nesting_returns_empty_on_empty_features(): """A mid-pagination empty page ({\"features\": [], \"next\": }) must not crash the downstream merge with ``KeyError: 'monitoring_location_id'``. 
The function short- circuits to an empty DataFrame so pagination can continue.""" - df = _handle_stats_nesting({"features": [], "next": None}, geopd=False) + df = _handle_nesting({"features": [], "next": None}, geopd=False) assert df.empty -def test_handle_stats_nesting_empty_preserves_geopd_type(): +def test_handle_nesting_empty_preserves_geopd_type(): """When geopandas is available, the empty-features short-circuit must return a ``GeoDataFrame`` rather than a plain ``DataFrame``. Otherwise a subsequent ``pd.concat([empty, geo_page])`` downgrades the final result to a plain ``DataFrame`` and strips geometry/CRS — a real regression for geopd-installed users on stats queries that hit an empty intermediate page.""" - # Monkeypatch a stub gpd into the utils module so the test runs - # whether or not geopandas is actually installed. + # Monkeypatch a stub gpd so the test runs whether or not geopandas is + # installed. The empty-page short-circuit delegates to the shared + # ``engine._empty_feature_frame``, which resolves ``gpd`` from the engine + # namespace — so patch it there, not in the stats module. fake_gpd = mock.MagicMock() class _Sentinel: pass fake_gpd.GeoDataFrame = lambda *a, **kw: _Sentinel() - with mock.patch.object(_utils_module, "gpd", fake_gpd, create=True): - result = _handle_stats_nesting({"features": []}, geopd=True) + with mock.patch.object(_engine_module, "gpd", fake_gpd, create=True): + result = _handle_nesting({"features": []}, geopd=True) assert isinstance(result, _Sentinel) @@ -551,17 +583,19 @@ class _Sentinel: resp = mock.MagicMock() resp.json.return_value = {"numberReturned": 0, "features": [], "links": []} - with mock.patch.object(_utils_module, "gpd", fake_gpd, create=True): + # ``_get_resp_data`` resolves ``gpd`` from the engine namespace -- patch + # it there, not in ``utils``. 
+ with mock.patch.object(_engine_module, "gpd", fake_gpd, create=True): result = _get_resp_data(resp, geopd=True) assert isinstance(result, _Sentinel) -def test_handle_stats_nesting_tolerates_missing_features_key(): +def test_handle_nesting_tolerates_missing_features_key(): """A 200 response with a body that doesn't carry ``features`` at all (rare but seen in error envelopes) must also short-circuit rather than KeyError before the schema-aware extraction even runs.""" - df = _handle_stats_nesting({}, geopd=False) + df = _handle_nesting({}, geopd=False) assert df.empty @@ -860,3 +894,117 @@ def test_check_ogc_requests_raises_typed_on_5xx(httpx_mock): ) with pytest.raises(ServiceUnavailable): _check_ogc_requests(endpoint="daily", req_type="schema") + + +@pytest.mark.parametrize( + "name, expected", + [ + ("waterLevelObs", "water_level_obs"), # camelCase -> snake_case + ("monitoring_location_id", "monitoring_location_id"), # already snake + ("value", "value"), # all-lowercase unchanged + ("navd88", "navd88"), # letter/digit boundary NOT split + ("someField", "some_field"), # simple camelCase + ("PascalCase", "pascal_case"), # leading capital + # Runs of capitals are best-effort: only the lower->Upper boundary + # before the run is split, so the acronym stays glued to the next word. + ("someXMLField", "some_xmlfield"), + ], +) +def test_to_snake_case(name, expected): + assert _to_snake_case(name) == expected + + +def test_get_stats_por_forwards_normal_type(monkeypatch): + """``normal_type`` reaches the observationNormals request (parity with R's + ``read_waterdata_stats_por``). Guards against the param being dropped from + the forwarded args (e.g. 
accidentally added to ``_get_args``'s exclude).""" + captured: dict = {} + + def fake_get_data(args, service, expand_percentiles, client=None): + captured.update(args=args, service=service) + return pd.DataFrame(), mock.Mock() + + monkeypatch.setattr(_stats_module, "get_data", fake_get_data) + get_stats_por(monitoring_location_id="USGS-1", normal_type="MOY") + assert captured["service"] == "observationNormals" + assert captured["args"].get("normal_type") == "MOY" + + +def test_get_stats_date_range_forwards_interval_type(monkeypatch): + """``interval_type`` (multi-value) reaches the observationIntervals request + (parity with R's ``read_waterdata_stats_daterange``).""" + captured: dict = {} + + def fake_get_data(args, service, expand_percentiles, client=None): + captured.update(args=args, service=service) + return pd.DataFrame(), mock.Mock() + + monkeypatch.setattr(_stats_module, "get_data", fake_get_data) + get_stats_date_range(monitoring_location_id="USGS-1", interval_type=["M", "CY"]) + assert captured["service"] == "observationIntervals" + assert captured["args"].get("interval_type") == ["M", "CY"] + + +def test_with_state_routes_into_native_queryable(): + """``_with_state`` resolves the canonical ``state`` argument into the + endpoint's native queryable (any encoding -> the requested representation) + and leaves args without ``state`` untouched.""" + assert _utils_module._with_state({"state": "WI"}, to="name", into="state_name") == { + "state_name": "Wisconsin" + } + assert _utils_module._with_state( + {"state": "Wisconsin"}, to="fips_us", into="state_code" + ) == {"state_code": "US:55"} + # Multi-value state fans out element-wise. + assert _utils_module._with_state( + {"state": ["WI", "55"]}, to="name", into="state_name" + ) == {"state_name": ["Wisconsin", "Wisconsin"]} + # No ``state`` -> mapping returned unchanged. 
+ assert _utils_module._with_state( + {"state_name": "Ohio"}, to="name", into="state_name" + ) == {"state_name": "Ohio"} + + +def test_with_state_conflict_raises(): + """Passing ``state`` together with a native ``state_code``/``state_name`` + is ambiguous and raises.""" + with pytest.raises(ValueError, match="not both"): + _utils_module._with_state( + {"state": "WI", "state_code": "55"}, to="name", into="state_name" + ) + with pytest.raises(ValueError, match="not both"): + _utils_module._with_state( + {"state": "WI", "state_name": "Wisconsin"}, to="name", into="state_name" + ) + + +def test_ogc_getter_resolves_state_at_getter_layer(monkeypatch): + """The OGC getters resolve the unified ``state`` into ``state_name`` + themselves (any encoding), so the shared ``get_ogc_data`` wrapper stays + state-agnostic.""" + import dataretrieval.waterdata.api as _api + + captured: dict = {} + + def fake_get_ogc_data(args, service, *a, **k): + captured.update(args=args, service=service) + return pd.DataFrame(), mock.Mock() + + monkeypatch.setattr(_api, "get_ogc_data", fake_get_ogc_data) + _api.get_monitoring_locations(state="55") # FIPS in -> full name out + assert captured["args"].get("state_name") == "Wisconsin" + assert "state" not in captured["args"] + + +def test_get_ogc_data_wrapper_does_not_touch_state(): + """``get_ogc_data`` no longer rewrites a ``state`` key, so a passthrough + query dict (e.g. from ``get_reference_table``) is forwarded untouched.""" + captured: dict = {} + + def fake_engine_get_ogc_data(args, service, output_id, **k): + captured["args"] = dict(args) + return pd.DataFrame(), mock.Mock() + + with mock.patch.object(_engine_module, "get_ogc_data", fake_engine_get_ogc_data): + _utils_module.get_ogc_data({"state": "WI"}, "monitoring-locations") + assert captured["args"] == {"state": "WI"}