From e3047631f9249ede41c4d400ae2f87be048e1d04 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 10:11:35 -0500 Subject: [PATCH 1/7] Add waterdata.get_ratings for USGS stage-discharge ratings via STAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the new Water Data STAC catalog endpoint (api.waterdata.usgs.gov/stac/v0/search) for stage-discharge rating curves. Lives in its own module (waterdata/ratings.py) because the transport layer — STAC search + per-feature RDB download — differs from the OGC collections used by the rest of the package. The function: - Composes a CQL filter from monitoring_location_id and (single-value) file_type, mirroring R's logic. Multi-type requests fetch all matches and filter URLs client-side. - Optionally downloads each matching .rdb asset to a user-supplied file_path (default: a fresh tempfile.mkdtemp), and parses with the existing nwis._read_rdb helper. - Returns a dict[id -> DataFrame] when download_and_parse=True, or the raw list of STAC features when False (cheap "what's available?" inspection). Surfaces a clear ValueError for invalid file_type values and for ISO 8601 durations in `datetime` (the rating-curve service rejects them). Mirrors R's read_waterdata_ratings in DOI-USGS/dataRetrieval; example sites and idioms come straight from the R doc. Tests: 9 unit tests covering filter composition, the two error paths, and end-to-end search-and-download via requests_mock (single-site, download_and_parse=False, and multi-type URL filtering). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/__init__.py | 2 + dataretrieval/waterdata/ratings.py | 260 ++++++++++++++++++++++++++++ tests/waterdata_ratings_test.py | 154 ++++++++++++++++ 4 files changed, 418 insertions(+) create mode 100644 dataretrieval/waterdata/ratings.py create mode 100644 tests/waterdata_ratings_test.py diff --git a/NEWS.md b/NEWS.md index beabe9d8..7413b820 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. + **05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`. **05/05/2026:** Added `waterdata.get_samples_summary(monitoringLocationIdentifier=...)` — wraps the Samples database `/summary/{id}` endpoint, returning per-characteristic result and activity counts plus first / most recent activity dates for a single monitoring location. Useful for taking inventory of available discrete-sample data before pulling observations with `get_samples`. 
diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 22fb7d38..b7ff474b 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -29,6 +29,7 @@ ) from .filters import FILTER_LANG from .nearest import get_nearest_continuous +from .ratings import get_ratings from .types import ( CODE_SERVICES, PROFILE_LOOKUP, @@ -52,6 +53,7 @@ "get_latest_daily", "get_monitoring_locations", "get_nearest_continuous", + "get_ratings", "get_reference_table", "get_samples", "get_samples_summary", diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py new file mode 100644 index 00000000..813fa592 --- /dev/null +++ b/dataretrieval/waterdata/ratings.py @@ -0,0 +1,260 @@ +"""USGS rating-curve retrieval via the Water Data STAC catalog. + +Wraps ``https://api.waterdata.usgs.gov/stac/v0/search`` and the per-feature +RDB downloads that follow. The STAC endpoint hosts standard NWIS rating +files (``exsa``, ``base``, ``corr``) for active streamgages — see the +service overview at https://api.waterdata.usgs.gov/docs/stac/ and the +WDFN announcement at https://waterdata.usgs.gov/blog/wdfn-rating-curves/. + +This is the discrete analogue to the OGC waterdata getters; it lives in +its own module because the transport layer (STAC search + RDB download) +differs from the OGC collections used by the rest of the package. + +The R analogue is ``read_waterdata_ratings`` in +https://github.com/DOI-USGS/dataRetrieval/. +""" + +from __future__ import annotations + +import logging +import os +import tempfile +from typing import Any + +import pandas as pd +import requests + +from dataretrieval.nwis import _read_rdb + +from .utils import BASE_URL, _default_headers, _format_api_dates + +logger = logging.getLogger(__name__) + +STAC_URL = f"{BASE_URL}/stac/v0" +_VALID_FILE_TYPES: tuple[str, ...] 
= ("exsa", "base", "corr") + + +def _build_filter( + monitoring_location_id: str | list[str] | None, + file_type: str | None, +) -> str | None: + """Compose the CQL filter sent to STAC ``/search``. + + Mirrors R's logic: only pin ``file_type`` when a single value was given, + so a multi-type request returns every matching site and the file-type + filtering happens client-side from the per-feature URLs. + """ + parts: list[str] = [] + if monitoring_location_id is not None: + ids = ( + [monitoring_location_id] + if isinstance(monitoring_location_id, str) + else list(monitoring_location_id) + ) + joined = "', '".join(ids) + parts.append(f"monitoring_location_id IN ('{joined}')") + if file_type is not None: + parts.append(f"file_type = '{file_type}'") + return " AND ".join(parts) if parts else None + + +def _search( + filter_str: str | None, + datetime_str: str | None, + bbox: list[float] | None, + limit: int, + ssl_check: bool, +) -> list[dict[str, Any]]: + """Run a single STAC ``/search`` request and return its features.""" + params: dict[str, Any] = {"limit": limit} + if filter_str is not None: + params["filter"] = filter_str + if datetime_str is not None: + params["datetime"] = datetime_str + if bbox is not None: + params["bbox"] = ",".join(str(b) for b in bbox) + + response = requests.get( + f"{STAC_URL}/search", + params=params, + headers=_default_headers(), + verify=ssl_check, + ) + response.raise_for_status() + return response.json().get("features", []) + + +def _download_and_parse( + feature: dict[str, Any], + file_path: str, + ssl_check: bool, +) -> pd.DataFrame: + """Fetch the feature's data asset, write it to ``file_path``, parse RDB.""" + url = feature["assets"]["data"]["href"] + fid = feature["id"] + target = os.path.join(file_path, fid) + + response = requests.get(url, headers=_default_headers(), verify=ssl_check) + response.raise_for_status() + + with open(target, "w") as f: + f.write(response.text) + + return _read_rdb(response.text) + + +def 
get_ratings( + monitoring_location_id: str | list[str] | None = None, + file_type: str | list[str] = "exsa", + file_path: str | None = None, + datetime: str | list[str] | None = None, + bbox: list[float] | None = None, + limit: int = 10000, + download_and_parse: bool = True, + ssl_check: bool = True, +) -> dict[str, pd.DataFrame] | list[dict[str, Any]]: + """Get USGS stage-discharge rating curves from the Water Data STAC catalog. + + Returns the current rating tables for one or more active USGS streamgages. + The catalog hosts three file types: + + - ``"exsa"`` — expanded shift-adjusted rating (default). Adds a ``SHIFT`` + column to ``"base"`` indicating the current shift for each ``INDEP``. + - ``"base"`` — three columns: ``INDEP`` (typically gage height, ft); + ``DEP`` (typically discharge, ft^3/s); ``STOR`` ("``*``" marks fixed + points of the rating). + - ``"corr"`` — three columns: ``INDEP``; ``CORR`` (correction for that + value); ``CORRINDEP`` (corrected INDEP). + + See https://api.waterdata.usgs.gov/docs/stac/ for the upstream service + docs and https://waterdata.usgs.gov/blog/wdfn-rating-curves/ for the + background announcement. The R analogue is ``read_waterdata_ratings`` + in https://github.com/DOI-USGS/dataRetrieval/. + + Parameters + ---------- + monitoring_location_id : string or list of strings, optional + One or more identifiers in ``AGENCY-ID`` form (e.g. + ``"USGS-01104475"``). If omitted, the spatial / temporal filters + determine the result set. + file_type : string or list of strings, default ``"exsa"`` + Which rating file(s) to request. One or more of ``"exsa"``, + ``"base"``, ``"corr"``. + file_path : string, optional + Directory the downloaded RDB files are written to. Defaults to a + per-call temporary directory created via :func:`tempfile.mkdtemp`. + datetime : string or list of strings, optional + STAC ``datetime`` filter — a single date / datetime, or an + interval (``"start/end"``, optionally half-bounded with ``..``). 
+ ISO 8601 *durations* (``"P1M"``, ``"PT36H"``, …) are **not** + supported by the rating-curve service; passing one raises + ``ValueError``. + bbox : list of numbers, optional + Only features whose geometry intersects the bounding box are + selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 + (longitude / latitude, west-south-east-north). + limit : int, default 10000 + Page size for the STAC ``/search`` request (capped at 10000). + download_and_parse : bool, default ``True`` + If ``True``, download every matching RDB file and parse it into a + ``DataFrame``. If ``False``, return the raw list of STAC feature + dicts so the caller can inspect what's available before pulling + bytes. + ssl_check : bool, default ``True`` + Verify the server's SSL certificate. + + Returns + ------- + dict[str, pandas.DataFrame] or list[dict] + When ``download_and_parse=True`` (the default), a dict keyed by + feature ID (e.g. ``"USGS-01104475.exsa.rdb"``) mapping to a parsed + ``DataFrame``. When ``download_and_parse=False``, the raw list of + STAC feature dicts as returned by the search endpoint. + + Raises + ------ + ValueError + For an unrecognized ``file_type`` value or an ISO 8601 duration in + ``datetime``. + + Examples + -------- + .. code:: + + >>> # Default exsa ratings for two sites + >>> ratings = dataretrieval.waterdata.get_ratings( + ... monitoring_location_id=["USGS-01104475", "USGS-01104460"], + ... file_type="exsa", + ... ) + >>> ratings["USGS-01104475.exsa.rdb"].head() + + >>> # Both exsa and corr files for the same two sites + >>> ratings = dataretrieval.waterdata.get_ratings( + ... monitoring_location_id=["USGS-01104475", "USGS-01104460"], + ... file_type=["exsa", "corr"], + ... ) + + >>> # Bounding-box query, listing what's available without downloading + >>> features = dataretrieval.waterdata.get_ratings( + ... bbox=[-95.0, 40.0, -92.0, 42.0], + ... download_and_parse=False, + ... 
) + + >>> # Restrict to features modified since seven days ago (no durations) + >>> features = dataretrieval.waterdata.get_ratings( + ... bbox=[-95.0, 40.0, -92.0, 42.0], + ... datetime=["2026-04-29", ".."], + ... download_and_parse=False, + ... ) + + """ + file_types = [file_type] if isinstance(file_type, str) else list(file_type) + invalid = [ft for ft in file_types if ft not in _VALID_FILE_TYPES] + if invalid: + raise ValueError( + f"Invalid file_type {invalid!r}. Valid options: {list(_VALID_FILE_TYPES)}." + ) + + if datetime is not None: + # The rating-curve STAC service rejects ISO 8601 durations; surface a + # clear error rather than letting the server return a confusing 4xx. + dt_values = datetime if isinstance(datetime, list) else [datetime] + if any(v is not None and "P" in str(v).upper() for v in dt_values): + raise ValueError( + "ISO 8601 durations (e.g. 'P7D') are not supported in " + "`datetime` for the rating-curve service. Provide a date or " + "interval instead." + ) + datetime_str = _format_api_dates(datetime, date=False) + else: + datetime_str = None + + # Mirror R: only pin file_type in the server-side filter when one type + # is requested. With multiple types, fetch all and filter URLs locally. + server_file_type = file_types[0] if len(file_types) == 1 else None + filter_str = _build_filter(monitoring_location_id, server_file_type) + + features = _search(filter_str, datetime_str, bbox, limit, ssl_check) + + if not download_and_parse: + return features + + if file_path is None: + file_path = tempfile.mkdtemp(prefix="dataretrieval-ratings-") + os.makedirs(file_path, exist_ok=True) + + out: dict[str, pd.DataFrame] = {} + for feature in features: + url = feature.get("assets", {}).get("data", {}).get("href", "") + # Skip features whose file type wasn't requested (only relevant when + # `file_type` is a list — single-type requests are already filtered + # server-side). 
+ if not any(ft in url for ft in file_types): + continue + fid = feature["id"] + try: + out[fid] = _download_and_parse(feature, file_path, ssl_check) + except Exception as e: + logger.warning("Failed to download / parse %s: %s", fid, e) + + return out diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py new file mode 100644 index 00000000..5962b363 --- /dev/null +++ b/tests/waterdata_ratings_test.py @@ -0,0 +1,154 @@ +import sys +from urllib.parse import parse_qs, urlsplit + +import pandas as pd +import pytest + +if sys.version_info < (3, 10): + pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) + +from dataretrieval.waterdata import get_ratings +from dataretrieval.waterdata.ratings import _build_filter + + +def test_build_filter_single_site_single_type(): + f = _build_filter("USGS-01104475", "exsa") + assert f == "monitoring_location_id IN ('USGS-01104475') AND file_type = 'exsa'" + + +def test_build_filter_multi_site_no_type(): + f = _build_filter(["USGS-A", "USGS-B"], None) + assert f == "monitoring_location_id IN ('USGS-A', 'USGS-B')" + + +def test_build_filter_no_site_single_type(): + f = _build_filter(None, "corr") + assert f == "file_type = 'corr'" + + +def test_build_filter_empty_returns_none(): + assert _build_filter(None, None) is None + + +def test_get_ratings_rejects_invalid_file_type(): + with pytest.raises(ValueError, match="Invalid file_type"): + get_ratings(monitoring_location_id="USGS-01104475", file_type="bogus") + + +def test_get_ratings_rejects_iso_8601_duration_in_datetime(): + """STAC ratings doesn't accept ISO 8601 durations; surface a clear error.""" + with pytest.raises(ValueError, match="durations.*not supported"): + get_ratings( + monitoring_location_id="USGS-01104475", + datetime="P7D", + ) + + +_SAMPLE_RDB = """\ +# header line one +# header line two +agency_cd\tsite_no\tINDEP\tDEP +5s\t15s\t10n\t10n +USGS\t01104475\t0.10\t0.0 +USGS\t01104475\t0.20\t0.5 +USGS\t01104475\t0.30\t1.2 +""" + 
+ +def _stub_search_response(): + return { + "features": [ + { + "id": "USGS-01104475.exsa.rdb", + "properties": {"file_type": "exsa"}, + "assets": { + "data": { + "href": "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb" + } + }, + } + ] + } + + +def test_get_ratings_mocked_search_and_download(requests_mock, tmp_path): + """End-to-end happy path with mocked STAC search + RDB download.""" + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json=_stub_search_response(), + ) + requests_mock.get( + "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb", + text=_SAMPLE_RDB, + ) + + out = get_ratings( + monitoring_location_id="USGS-01104475", + file_type="exsa", + file_path=str(tmp_path), + ) + assert "USGS-01104475.exsa.rdb" in out + df = out["USGS-01104475.exsa.rdb"] + assert isinstance(df, pd.DataFrame) + assert {"INDEP", "DEP"}.issubset(df.columns) + assert len(df) == 3 + + # Server-side filter should pin the single requested file_type. 
+ sent = requests_mock.request_history[0] + qs = parse_qs(urlsplit(sent.url).query) + assert "file_type = 'exsa'" in qs["filter"][0] + assert "monitoring_location_id IN ('USGS-01104475')" in qs["filter"][0] + + +def test_get_ratings_download_and_parse_false_returns_features(requests_mock): + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json=_stub_search_response(), + ) + features = get_ratings( + monitoring_location_id="USGS-01104475", + download_and_parse=False, + ) + assert isinstance(features, list) + assert features[0]["id"] == "USGS-01104475.exsa.rdb" + + +def test_get_ratings_multi_type_filters_urls_locally(requests_mock, tmp_path): + """File_type list: server filter omits it; URL filtering is local.""" + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json={ + "features": [ + { + "id": "USGS-X.exsa.rdb", + "properties": {"file_type": "exsa"}, + "assets": {"data": {"href": "https://x.example/X.exsa.rdb"}}, + }, + { + "id": "USGS-X.base.rdb", + "properties": {"file_type": "base"}, + "assets": {"data": {"href": "https://x.example/X.base.rdb"}}, + }, + { + "id": "USGS-X.corr.rdb", + "properties": {"file_type": "corr"}, + "assets": {"data": {"href": "https://x.example/X.corr.rdb"}}, + }, + ] + }, + ) + # Only mock the two URLs we expect to be downloaded. + requests_mock.get("https://x.example/X.exsa.rdb", text=_SAMPLE_RDB) + requests_mock.get("https://x.example/X.corr.rdb", text=_SAMPLE_RDB) + + out = get_ratings( + monitoring_location_id="USGS-X", + file_type=["exsa", "corr"], + file_path=str(tmp_path), + ) + assert set(out) == {"USGS-X.exsa.rdb", "USGS-X.corr.rdb"} + + # Server-side filter must NOT include file_type for multi-type requests. 
+ search_req = requests_mock.request_history[0] + qs = parse_qs(urlsplit(search_req.url).query) + assert "file_type" not in qs["filter"][0] From 630cc8c2d93df69d9bb46c8276bbb4422629506c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 10:26:21 -0500 Subject: [PATCH 2/7] get_ratings: surface RDB header as df.attrs and document the nwis dep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two non-functional follow-ups suggested during review of #269: (1) Document the cross-module reach into nwis._read_rdb. Rating files use the same USGS RDB shape as NWIS responses, so the parser is already reusable as-is — no refactor of the legacy nwis module is needed. Added a comment at the import site explaining why the private import is intentional and what to watch for if _read_rdb ever moves. (2) Surface the RDB #-prefixed header block. Each parsed rating frame now carries provenance in df.attrs: - df.attrs["comment"]: the list of "#"-prefixed header lines (rating id, parameter, expansion type, last-shifted timestamp, warnings, etc.). - df.attrs["url"]: the asset URL it was fetched from. R's read_waterdata_ratings exposes the comment block via comment(df); pandas's standard `attrs` dict is the Python equivalent. Done in ratings.py only — does not touch nwis. A live spot-check against api.waterdata.usgs.gov on USGS-01104475 exsa shows the 31-line USGS header survives intact (gauge name, parameter code, rating expansion, etc.). One new unit test pins the behavior. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/ratings.py | 30 +++++++++++++++++++++++++++--- tests/waterdata_ratings_test.py | 25 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index 813fa592..6e58f1e9 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -24,6 +24,11 @@ import pandas as pd import requests +# Rating files use the same USGS RDB shape as NWIS responses (comment +# block prefixed with ``#``, header row, format-spec row, then tab-separated +# data), so we reuse the parser already in ``nwis``. ``_read_rdb`` is private; +# if it ever moves or its contract changes we want a loud failure here, hence +# the explicit import rather than a copy. from dataretrieval.nwis import _read_rdb from .utils import BASE_URL, _default_headers, _format_api_dates @@ -84,6 +89,18 @@ def _search( return response.json().get("features", []) +def _extract_rdb_comment(rdb: str) -> list[str]: + """Return the RDB ``#``-prefixed comment block as a list of header lines. + + The comment block carries useful per-rating metadata — rating id, + parameter description, expansion type, last-shifted timestamp, etc. + R's ``read_waterdata_ratings`` exposes this via ``comment(df)``; we + attach it to ``df.attrs["comment"]`` so callers can inspect or log + provenance without re-reading the on-disk RDB. 
+ """ + return [line for line in rdb.splitlines() if line.startswith("#")] + + def _download_and_parse( feature: dict[str, Any], file_path: str, @@ -100,7 +117,10 @@ def _download_and_parse( with open(target, "w") as f: f.write(response.text) - return _read_rdb(response.text) + df = _read_rdb(response.text) + df.attrs["comment"] = _extract_rdb_comment(response.text) + df.attrs["url"] = url + return df def get_ratings( @@ -168,8 +188,12 @@ def get_ratings( dict[str, pandas.DataFrame] or list[dict] When ``download_and_parse=True`` (the default), a dict keyed by feature ID (e.g. ``"USGS-01104475.exsa.rdb"``) mapping to a parsed - ``DataFrame``. When ``download_and_parse=False``, the raw list of - STAC feature dicts as returned by the search endpoint. + ``DataFrame``. Each frame carries provenance in + ``df.attrs["comment"]`` (the RDB ``#``-prefixed header lines, like + rating id, parameter, last-shifted timestamp) and + ``df.attrs["url"]`` (the asset URL it was fetched from). When + ``download_and_parse=False``, the raw list of STAC feature dicts + as returned by the search endpoint. 
Raises ------ diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py index 5962b363..ae17c19b 100644 --- a/tests/waterdata_ratings_test.py +++ b/tests/waterdata_ratings_test.py @@ -100,6 +100,31 @@ def test_get_ratings_mocked_search_and_download(requests_mock, tmp_path): assert "monitoring_location_id IN ('USGS-01104475')" in qs["filter"][0] +def test_get_ratings_attaches_rdb_comment_and_url(requests_mock, tmp_path): + """Each parsed frame should carry its RDB header + source URL in df.attrs.""" + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json=_stub_search_response(), + ) + asset_url = ( + "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb" + ) + requests_mock.get(asset_url, text=_SAMPLE_RDB) + + out = get_ratings( + monitoring_location_id="USGS-01104475", + file_type="exsa", + file_path=str(tmp_path), + ) + df = out["USGS-01104475.exsa.rdb"] + # The fixture has two `# ...` lines at the top; both should land in attrs. + assert df.attrs["comment"] == [ + "# header line one", + "# header line two", + ] + assert df.attrs["url"] == asset_url + + def test_get_ratings_download_and_parse_false_returns_features(requests_mock): requests_mock.get( "https://api.waterdata.usgs.gov/stac/v0/search", From 2650f509e483d650671c0aea95d2a47177d32f9a Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 10:39:40 -0500 Subject: [PATCH 3/7] Address /simplify findings on get_ratings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six fixes from a multi-agent review pass on PR #269: 1. Narrow the per-feature except handler from a bare Exception to (RequestException, ValueError, OSError). The previous catch-all would swallow programming bugs (KeyError on a malformed feature, AttributeError, ...) and silently drop rows. 2. Escape single quotes in CQL string literals via the standard doubling rule. 
Most monitoring-location IDs cannot contain a quote, but the function takes arbitrary strings — defending against malformed filters and potential injection regardless. New _quote_cql_str helper, applied to monitoring_location_id and file_type. New unit test pins the behaviour. 3. Promote file_type to a Literal["exsa", "base", "corr"] and derive _VALID_FILE_TYPES from it via typing.get_args, so the runtime guard and the type hint can never drift apart. 4. Rename the `datetime` parameter to `time` to match the convention used by every sibling waterdata getter (and to stop shadowing the stdlib `datetime` module). The parameter is still passed through as the STAC `datetime` query string under the hood; that's now documented explicitly. 5. Switch the multi-type local filter from a substring check on the asset URL to feature["properties"]["file_type"]. Substring matching on URLs would false-match if a host or path ever contained one of the literal type names; STAC features carry the typed property already. 6. Skip the on-disk RDB write when file_path is None. tempfile.mkdtemp leaks (no automatic cleanup), and df.attrs["url"] already records the source — so by default we now return only the parsed frame. Users who want a local copy can pass file_path=...; the contract for that path is unchanged. While here, dropped a "discrete analogue to the OGC waterdata getters" sentence from the module docstring (it was internal-architecture WHAT-narrating, not user-relevant guidance). All 11 ratings tests pass (one new test for the CQL quote-escaping). Live verification on USGS-01104475 confirms file_path=None still returns the parsed frame with df.attrs populated, multi-type via property filter returns the expected 4 tables, and the renamed `time=` parameter still drives the STAC datetime filter. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/ratings.py | 110 ++++++++++++++++------------- tests/waterdata_ratings_test.py | 12 +++- 2 files changed, 69 insertions(+), 53 deletions(-) diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index 6e58f1e9..ecb7375f 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -6,10 +6,6 @@ service overview at https://api.waterdata.usgs.gov/docs/stac/ and the WDFN announcement at https://waterdata.usgs.gov/blog/wdfn-rating-curves/. -This is the discrete analogue to the OGC waterdata getters; it lives in -its own module because the transport layer (STAC search + RDB download) -differs from the OGC collections used by the rest of the package. - The R analogue is ``read_waterdata_ratings`` in https://github.com/DOI-USGS/dataRetrieval/. """ @@ -18,8 +14,7 @@ import logging import os -import tempfile -from typing import Any +from typing import Any, Literal, get_args import pandas as pd import requests @@ -36,7 +31,19 @@ logger = logging.getLogger(__name__) STAC_URL = f"{BASE_URL}/stac/v0" -_VALID_FILE_TYPES: tuple[str, ...] = ("exsa", "base", "corr") + +RATING_FILE_TYPE = Literal["exsa", "base", "corr"] +_VALID_FILE_TYPES = get_args(RATING_FILE_TYPE) + + +def _quote_cql_str(value: str) -> str: + """Escape a string for inclusion in a single-quoted CQL literal. + + CQL escapes a single quote by doubling it. Most monitoring-location IDs + can never contain a quote, but the function accepts arbitrary strings, + so we defend against malformed filters / injection regardless. + """ + return value.replace("'", "''") def _build_filter( @@ -47,7 +54,7 @@ def _build_filter( Mirrors R's logic: only pin ``file_type`` when a single value was given, so a multi-type request returns every matching site and the file-type - filtering happens client-side from the per-feature URLs. + filtering happens client-side from the per-feature properties. 
""" parts: list[str] = [] if monitoring_location_id is not None: @@ -56,16 +63,16 @@ def _build_filter( if isinstance(monitoring_location_id, str) else list(monitoring_location_id) ) - joined = "', '".join(ids) + joined = "', '".join(_quote_cql_str(i) for i in ids) parts.append(f"monitoring_location_id IN ('{joined}')") if file_type is not None: - parts.append(f"file_type = '{file_type}'") + parts.append(f"file_type = '{_quote_cql_str(file_type)}'") return " AND ".join(parts) if parts else None def _search( filter_str: str | None, - datetime_str: str | None, + time_str: str | None, bbox: list[float] | None, limit: int, ssl_check: bool, @@ -74,8 +81,8 @@ def _search( params: dict[str, Any] = {"limit": limit} if filter_str is not None: params["filter"] = filter_str - if datetime_str is not None: - params["datetime"] = datetime_str + if time_str is not None: + params["datetime"] = time_str if bbox is not None: params["bbox"] = ",".join(str(b) for b in bbox) @@ -103,19 +110,19 @@ def _extract_rdb_comment(rdb: str) -> list[str]: def _download_and_parse( feature: dict[str, Any], - file_path: str, + file_path: str | None, ssl_check: bool, ) -> pd.DataFrame: - """Fetch the feature's data asset, write it to ``file_path``, parse RDB.""" + """Fetch the feature's data asset, parse RDB, optionally persist to disk.""" url = feature["assets"]["data"]["href"] fid = feature["id"] - target = os.path.join(file_path, fid) response = requests.get(url, headers=_default_headers(), verify=ssl_check) response.raise_for_status() - with open(target, "w") as f: - f.write(response.text) + if file_path is not None: + with open(os.path.join(file_path, fid), "w") as f: + f.write(response.text) df = _read_rdb(response.text) df.attrs["comment"] = _extract_rdb_comment(response.text) @@ -125,9 +132,9 @@ def _download_and_parse( def get_ratings( monitoring_location_id: str | list[str] | None = None, - file_type: str | list[str] = "exsa", + file_type: RATING_FILE_TYPE | list[RATING_FILE_TYPE] = "exsa", 
file_path: str | None = None, - datetime: str | list[str] | None = None, + time: str | list[str] | None = None, bbox: list[float] | None = None, limit: int = 10000, download_and_parse: bool = True, @@ -157,18 +164,19 @@ def get_ratings( One or more identifiers in ``AGENCY-ID`` form (e.g. ``"USGS-01104475"``). If omitted, the spatial / temporal filters determine the result set. - file_type : string or list of strings, default ``"exsa"`` - Which rating file(s) to request. One or more of ``"exsa"``, - ``"base"``, ``"corr"``. + file_type : ``"exsa"``, ``"base"``, ``"corr"``, or a list, default ``"exsa"`` + Which rating file(s) to request. file_path : string, optional - Directory the downloaded RDB files are written to. Defaults to a - per-call temporary directory created via :func:`tempfile.mkdtemp`. - datetime : string or list of strings, optional - STAC ``datetime`` filter — a single date / datetime, or an - interval (``"start/end"``, optionally half-bounded with ``..``). - ISO 8601 *durations* (``"P1M"``, ``"PT36H"``, …) are **not** - supported by the rating-curve service; passing one raises - ``ValueError``. + Directory the downloaded RDB files are written to. If ``None`` + (the default), the parsed ``DataFrame`` is returned without + persisting the bytes to disk; ``df.attrs["url"]`` still records + where each rating came from. + time : string or list of strings, optional + STAC ``datetime`` filter (passed through verbatim under that name) + — a single date / datetime, or an interval (``"start/end"``, + optionally half-bounded with ``..``). ISO 8601 *durations* + (``"P1M"``, ``"PT36H"``, …) are **not** supported by the + rating-curve service; passing one raises ``ValueError``. bbox : list of numbers, optional Only features whose geometry intersects the bounding box are selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 @@ -199,7 +207,7 @@ def get_ratings( ------ ValueError For an unrecognized ``file_type`` value or an ISO 8601 duration in - ``datetime``. 
+ ``time``. Examples -------- @@ -224,10 +232,10 @@ def get_ratings( ... download_and_parse=False, ... ) - >>> # Restrict to features modified since seven days ago (no durations) + >>> # Restrict to features in a date range (durations not supported) >>> features = dataretrieval.waterdata.get_ratings( ... bbox=[-95.0, 40.0, -92.0, 42.0], - ... datetime=["2026-04-29", ".."], + ... time=["2026-04-29", ".."], ... download_and_parse=False, ... ) @@ -239,46 +247,46 @@ def get_ratings( f"Invalid file_type {invalid!r}. Valid options: {list(_VALID_FILE_TYPES)}." ) - if datetime is not None: + if time is not None: # The rating-curve STAC service rejects ISO 8601 durations; surface a # clear error rather than letting the server return a confusing 4xx. - dt_values = datetime if isinstance(datetime, list) else [datetime] - if any(v is not None and "P" in str(v).upper() for v in dt_values): + time_values = time if isinstance(time, list) else [time] + if any(v is not None and "P" in str(v).upper() for v in time_values): raise ValueError( "ISO 8601 durations (e.g. 'P7D') are not supported in " - "`datetime` for the rating-curve service. Provide a date or " + "`time` for the rating-curve service. Provide a date or " "interval instead." ) - datetime_str = _format_api_dates(datetime, date=False) + time_str = _format_api_dates(time, date=False) else: - datetime_str = None + time_str = None # Mirror R: only pin file_type in the server-side filter when one type - # is requested. With multiple types, fetch all and filter URLs locally. + # is requested. With multiple types, fetch all and filter locally. 
server_file_type = file_types[0] if len(file_types) == 1 else None filter_str = _build_filter(monitoring_location_id, server_file_type) - features = _search(filter_str, datetime_str, bbox, limit, ssl_check) + features = _search(filter_str, time_str, bbox, limit, ssl_check) if not download_and_parse: return features - if file_path is None: - file_path = tempfile.mkdtemp(prefix="dataretrieval-ratings-") - os.makedirs(file_path, exist_ok=True) + if file_path is not None: + os.makedirs(file_path, exist_ok=True) out: dict[str, pd.DataFrame] = {} + requested = set(file_types) for feature in features: - url = feature.get("assets", {}).get("data", {}).get("href", "") - # Skip features whose file type wasn't requested (only relevant when - # `file_type` is a list — single-type requests are already filtered - # server-side). - if not any(ft in url for ft in file_types): + # Multi-type requests skip the server-side file_type filter, so + # filter here on the per-feature property (more reliable than + # substring-matching the URL). 
+ feat_type = feature.get("properties", {}).get("file_type") + if feat_type not in requested: continue fid = feature["id"] try: out[fid] = _download_and_parse(feature, file_path, ssl_check) - except Exception as e: + except (requests.RequestException, ValueError, OSError) as e: logger.warning("Failed to download / parse %s: %s", fid, e) return out diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py index ae17c19b..096e9b61 100644 --- a/tests/waterdata_ratings_test.py +++ b/tests/waterdata_ratings_test.py @@ -35,15 +35,23 @@ def test_get_ratings_rejects_invalid_file_type(): get_ratings(monitoring_location_id="USGS-01104475", file_type="bogus") -def test_get_ratings_rejects_iso_8601_duration_in_datetime(): +def test_get_ratings_rejects_iso_8601_duration_in_time(): """STAC ratings doesn't accept ISO 8601 durations; surface a clear error.""" with pytest.raises(ValueError, match="durations.*not supported"): get_ratings( monitoring_location_id="USGS-01104475", - datetime="P7D", + time="P7D", ) +def test_build_filter_escapes_quotes(): + """Defends against malformed CQL or injection if an ID contains a quote.""" + from dataretrieval.waterdata.ratings import _build_filter + + f = _build_filter("USGS-x'-y", None) + assert f == "monitoring_location_id IN ('USGS-x''-y')" + + _SAMPLE_RDB = """\ # header line one # header line two From ad60f67c9928fee8527a681a10b5655f2e7a5c21 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 12:56:44 -0500 Subject: [PATCH 4/7] Tighten get_ratings: idioms, abstraction, readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eight small changes; net -16 lines. No behavior change other than the duration check now using the project's anchored regex instead of an ad-hoc substring search (small correctness improvement). 1. Reorder so the public `get_ratings` is at the top and helpers follow — matches the convention in `nearest.py`. 2. 
Reuse `_DURATION_RE` from `waterdata.utils` instead of the ad-hoc `"P" in str(v).upper()` check. The regex is anchored at start and case-correct; the substring check was both too loose (matched any string containing 'p'/'P') and case-wrong (ISO 8601 P/PT designators are case-sensitive — uppercasing the input turned any lowercase 'p' into a false-positive match). 3. Extract `_as_list(x)` for the `[x] if isinstance(x, str) else list(x)` pattern, which appeared three times (file_type normalization, monitoring_location_id in `_build_filter`, time values in the duration check). 4. Drop redundant `date=False` on the `_format_api_dates` call (it's the default). 5. Collapse the `if time is not None: ... else: time_str = None` four-line block into a single ternary. 6. Pre-filter `features` by file_type before the download loop, so the loop body is just download-and-handle-errors. The "skip features whose file type wasn't requested" comment goes away because the list-comprehension above is self-documenting. 7. Drop two narrating comments whose WHY is already captured by the helpers' docstrings (`_build_filter` and `_extract_rdb_comment`). 8. Rename `test_get_ratings_multi_type_filters_urls_locally` → `..._filters_via_property` — the test name still claimed URL filtering after the property-based switch in the previous /simplify pass. All 11 ratings tests pass; live verification on api.waterdata.usgs.gov confirms file_path=None still attaches df.attrs, multi-type filtering returns the expected four tables, and the duration check still rejects "P7D" while accepting "2026-04-29". 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/ratings.py | 237 +++++++++++++---------------- tests/waterdata_ratings_test.py | 4 +- 2 files changed, 112 insertions(+), 129 deletions(-) diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index ecb7375f..d40e7dcb 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -14,19 +14,17 @@ import logging import os -from typing import Any, Literal, get_args +from typing import Any, Iterable, Literal, get_args import pandas as pd import requests -# Rating files use the same USGS RDB shape as NWIS responses (comment -# block prefixed with ``#``, header row, format-spec row, then tab-separated -# data), so we reuse the parser already in ``nwis``. ``_read_rdb`` is private; -# if it ever moves or its contract changes we want a loud failure here, hence -# the explicit import rather than a copy. +# Rating files use the same USGS RDB shape as NWIS responses, so we reuse +# the parser already in ``nwis``. ``_read_rdb`` is private; if it ever +# moves we want a loud failure here, hence the explicit import. from dataretrieval.nwis import _read_rdb -from .utils import BASE_URL, _default_headers, _format_api_dates +from .utils import _DURATION_RE, BASE_URL, _default_headers, _format_api_dates logger = logging.getLogger(__name__) @@ -36,100 +34,6 @@ _VALID_FILE_TYPES = get_args(RATING_FILE_TYPE) -def _quote_cql_str(value: str) -> str: - """Escape a string for inclusion in a single-quoted CQL literal. - - CQL escapes a single quote by doubling it. Most monitoring-location IDs - can never contain a quote, but the function accepts arbitrary strings, - so we defend against malformed filters / injection regardless. - """ - return value.replace("'", "''") - - -def _build_filter( - monitoring_location_id: str | list[str] | None, - file_type: str | None, -) -> str | None: - """Compose the CQL filter sent to STAC ``/search``. 
- - Mirrors R's logic: only pin ``file_type`` when a single value was given, - so a multi-type request returns every matching site and the file-type - filtering happens client-side from the per-feature properties. - """ - parts: list[str] = [] - if monitoring_location_id is not None: - ids = ( - [monitoring_location_id] - if isinstance(monitoring_location_id, str) - else list(monitoring_location_id) - ) - joined = "', '".join(_quote_cql_str(i) for i in ids) - parts.append(f"monitoring_location_id IN ('{joined}')") - if file_type is not None: - parts.append(f"file_type = '{_quote_cql_str(file_type)}'") - return " AND ".join(parts) if parts else None - - -def _search( - filter_str: str | None, - time_str: str | None, - bbox: list[float] | None, - limit: int, - ssl_check: bool, -) -> list[dict[str, Any]]: - """Run a single STAC ``/search`` request and return its features.""" - params: dict[str, Any] = {"limit": limit} - if filter_str is not None: - params["filter"] = filter_str - if time_str is not None: - params["datetime"] = time_str - if bbox is not None: - params["bbox"] = ",".join(str(b) for b in bbox) - - response = requests.get( - f"{STAC_URL}/search", - params=params, - headers=_default_headers(), - verify=ssl_check, - ) - response.raise_for_status() - return response.json().get("features", []) - - -def _extract_rdb_comment(rdb: str) -> list[str]: - """Return the RDB ``#``-prefixed comment block as a list of header lines. - - The comment block carries useful per-rating metadata — rating id, - parameter description, expansion type, last-shifted timestamp, etc. - R's ``read_waterdata_ratings`` exposes this via ``comment(df)``; we - attach it to ``df.attrs["comment"]`` so callers can inspect or log - provenance without re-reading the on-disk RDB. 
- """ - return [line for line in rdb.splitlines() if line.startswith("#")] - - -def _download_and_parse( - feature: dict[str, Any], - file_path: str | None, - ssl_check: bool, -) -> pd.DataFrame: - """Fetch the feature's data asset, parse RDB, optionally persist to disk.""" - url = feature["assets"]["data"]["href"] - fid = feature["id"] - - response = requests.get(url, headers=_default_headers(), verify=ssl_check) - response.raise_for_status() - - if file_path is not None: - with open(os.path.join(file_path, fid), "w") as f: - f.write(response.text) - - df = _read_rdb(response.text) - df.attrs["comment"] = _extract_rdb_comment(response.text) - df.attrs["url"] = url - return df - - def get_ratings( monitoring_location_id: str | list[str] | None = None, file_type: RATING_FILE_TYPE | list[RATING_FILE_TYPE] = "exsa", @@ -240,29 +144,22 @@ def get_ratings( ... ) """ - file_types = [file_type] if isinstance(file_type, str) else list(file_type) + file_types = _as_list(file_type) invalid = [ft for ft in file_types if ft not in _VALID_FILE_TYPES] if invalid: raise ValueError( - f"Invalid file_type {invalid!r}. Valid options: {list(_VALID_FILE_TYPES)}." + f"Invalid file_type {invalid!r}; " + f"valid options are {list(_VALID_FILE_TYPES)}." + ) + + if time is not None and any(_DURATION_RE.match(str(v)) for v in _as_list(time)): + raise ValueError( + "ISO 8601 durations (e.g. 'P7D') are not supported in `time` " + "for the rating-curve service. Provide a date or interval instead." ) + time_str = _format_api_dates(time) if time is not None else None - if time is not None: - # The rating-curve STAC service rejects ISO 8601 durations; surface a - # clear error rather than letting the server return a confusing 4xx. - time_values = time if isinstance(time, list) else [time] - if any(v is not None and "P" in str(v).upper() for v in time_values): - raise ValueError( - "ISO 8601 durations (e.g. 'P7D') are not supported in " - "`time` for the rating-curve service. 
Provide a date or " - "interval instead." - ) - time_str = _format_api_dates(time, date=False) - else: - time_str = None - - # Mirror R: only pin file_type in the server-side filter when one type - # is requested. With multiple types, fetch all and filter locally. + # Mirror R: pin file_type server-side only when one type is requested. server_file_type = file_types[0] if len(file_types) == 1 else None filter_str = _build_filter(monitoring_location_id, server_file_type) @@ -271,18 +168,16 @@ def get_ratings( if not download_and_parse: return features + requested = set(file_types) + matching = [ + f for f in features if f.get("properties", {}).get("file_type") in requested + ] + if file_path is not None: os.makedirs(file_path, exist_ok=True) out: dict[str, pd.DataFrame] = {} - requested = set(file_types) - for feature in features: - # Multi-type requests skip the server-side file_type filter, so - # filter here on the per-feature property (more reliable than - # substring-matching the URL). - feat_type = feature.get("properties", {}).get("file_type") - if feat_type not in requested: - continue + for feature in matching: fid = feature["id"] try: out[fid] = _download_and_parse(feature, file_path, ssl_check) @@ -290,3 +185,91 @@ def get_ratings( logger.warning("Failed to download / parse %s: %s", fid, e) return out + + +def _as_list(x: str | Iterable[str]) -> list[str]: + """Normalize a string or iterable-of-strings to a list.""" + return [x] if isinstance(x, str) else list(x) + + +def _quote_cql_str(value: str) -> str: + """Escape a single-quoted CQL literal by doubling embedded quotes. + + Defends against malformed filters / injection on arbitrary user input, + even though valid USGS monitoring-location IDs cannot contain a quote. + """ + return value.replace("'", "''") + + +def _build_filter( + monitoring_location_id: str | list[str] | None, + file_type: str | None, +) -> str | None: + """Compose the CQL filter sent to STAC ``/search``. 
+ + Returns ``None`` when neither argument constrains the search. + """ + parts: list[str] = [] + if monitoring_location_id is not None: + ids = _as_list(monitoring_location_id) + joined = "', '".join(_quote_cql_str(i) for i in ids) + parts.append(f"monitoring_location_id IN ('{joined}')") + if file_type is not None: + parts.append(f"file_type = '{_quote_cql_str(file_type)}'") + return " AND ".join(parts) if parts else None + + +def _search( + filter_str: str | None, + time_str: str | None, + bbox: list[float] | None, + limit: int, + ssl_check: bool, +) -> list[dict[str, Any]]: + """Run a single STAC ``/search`` request and return its features.""" + params: dict[str, Any] = {"limit": limit} + if filter_str is not None: + params["filter"] = filter_str + if time_str is not None: + params["datetime"] = time_str + if bbox is not None: + params["bbox"] = ",".join(map(str, bbox)) + + response = requests.get( + f"{STAC_URL}/search", + params=params, + headers=_default_headers(), + verify=ssl_check, + ) + response.raise_for_status() + return response.json().get("features", []) + + +def _extract_rdb_comment(rdb: str) -> list[str]: + """Return the RDB ``#``-prefixed comment block as a list of header lines. + + Carries useful per-rating provenance — rating id, parameter description, + expansion type, last-shifted timestamp, etc. R's ``read_waterdata_ratings`` + exposes the equivalent via ``comment(df)``. 
+ """ + return [line for line in rdb.splitlines() if line.startswith("#")] + + +def _download_and_parse( + feature: dict[str, Any], + file_path: str | None, + ssl_check: bool, +) -> pd.DataFrame: + """Fetch the feature's data asset, parse RDB, optionally persist to disk.""" + url = feature["assets"]["data"]["href"] + response = requests.get(url, headers=_default_headers(), verify=ssl_check) + response.raise_for_status() + + if file_path is not None: + with open(os.path.join(file_path, feature["id"]), "w") as f: + f.write(response.text) + + df = _read_rdb(response.text) + df.attrs["comment"] = _extract_rdb_comment(response.text) + df.attrs["url"] = url + return df diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py index 096e9b61..88a0c639 100644 --- a/tests/waterdata_ratings_test.py +++ b/tests/waterdata_ratings_test.py @@ -146,8 +146,8 @@ def test_get_ratings_download_and_parse_false_returns_features(requests_mock): assert features[0]["id"] == "USGS-01104475.exsa.rdb" -def test_get_ratings_multi_type_filters_urls_locally(requests_mock, tmp_path): - """File_type list: server filter omits it; URL filtering is local.""" +def test_get_ratings_multi_type_filters_via_property(requests_mock, tmp_path): + """File_type list: server filter omits it; local filter reads the property.""" requests_mock.get( "https://api.waterdata.usgs.gov/stac/v0/search", json={ From 6dc6b64dfe496dd0f6c33938d84aafd3ebb1da70 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 13:01:44 -0500 Subject: [PATCH 5/7] Address /simplify (third pass) ruff advisory findings on PR #269 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two test-only fixes from a broader-ruff (--select ALL) pass. Both already-passed-CI changes are advisory under the project's narrower rule set (F, E, W, I, UP, B, Q, SIM, TID, C90, E501) but worth applying as cheap correctness/cleanliness wins. 1. RUF043: prefix the pytest.raises(match=...) 
pattern with `r` so the `.*` metacharacters are explicit. Functionally equivalent here (pytest re.searches the message), but the raw string makes the regex intent unambiguous. 2. PLC0415: remove a redundant inline `from dataretrieval.waterdata.ratings import _build_filter` inside test_build_filter_escapes_quotes — `_build_filter` is already imported at module top (left over from before the top-level import landed in an earlier commit on this branch). Other broader-ruff findings reviewed and intentionally skipped as either project-wide patterns or non-actionable: - S113 (`requests.get` without `timeout=`): no other call in the codebase uses one — applying here only would be inconsistent. - PLW1514 (`open(..., "w")` without `encoding=`): same project- wide situation. RDB content is ASCII. - PLR0913 / PLR0917 (8 args on get_ratings): mirrors R's read_waterdata_ratings signature; can't reduce without breaking parity. - FBT001 / FBT002 (boolean positional args download_and_parse / ssl_check): match the rest of waterdata's public surface; reordering would be a breaking API change. - FURB103 / PTH103 / PTH118 / PTH123 (pathlib over os.path): the rest of the codebase uses os.path; consistency wins. - EM101 / EM102 / TRY003 (raise ValueError(msg) with msg pre-assigned): every other `raise ValueError` in waterdata uses the inline f-string; consistency wins. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_ratings_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py index 88a0c639..fcead65d 100644 --- a/tests/waterdata_ratings_test.py +++ b/tests/waterdata_ratings_test.py @@ -37,7 +37,7 @@ def test_get_ratings_rejects_invalid_file_type(): def test_get_ratings_rejects_iso_8601_duration_in_time(): """STAC ratings doesn't accept ISO 8601 durations; surface a clear error.""" - with pytest.raises(ValueError, match="durations.*not supported"): + with pytest.raises(ValueError, match=r"durations.*not supported"): get_ratings( monitoring_location_id="USGS-01104475", time="P7D", @@ -46,8 +46,6 @@ def test_get_ratings_rejects_iso_8601_duration_in_time(): def test_build_filter_escapes_quotes(): """Defends against malformed CQL or injection if an ID contains a quote.""" - from dataretrieval.waterdata.ratings import _build_filter - f = _build_filter("USGS-x'-y", None) assert f == "monitoring_location_id IN ('USGS-x''-y')" From ec3b21581788e27b7d5966dda8de4fc415abd69d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 13:40:45 -0500 Subject: [PATCH 6/7] Extract RDB parsing into a shared dataretrieval.rdb module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The USGS RDB tab-separated format is used both by NWIS web services and by the new Water Data STAC catalog's rating-curve assets. Until now, only nwis.py had a parser, and waterdata/ratings.py reached across to import the private nwis._read_rdb. That smell is gone: - New top-level module dataretrieval.rdb with two public helpers: - read_rdb(text, dtypes=None): pure parser. Strips the comment block, tab-parses the header row, skips the format-spec row, and returns a DataFrame. 
Forwards optional caller-supplied dtype hints to pandas.read_csv (unknown column names are silently ignored, so dicts are safe to pass on any RDB). Raises ValueError on HTML responses. - extract_rdb_comment(rdb): returns the #-prefixed header block as a list of lines (the equivalent of R's comment(df)). - nwis._read_rdb is now a thin wrapper around rdb.read_rdb that applies the NWIS-specific dtype hints (site_no, dec_long_va, ...) and runs format_response() for datetime indexing. The private symbol's contract is unchanged; existing 30 nwis tests pass. Dropped the now-unused StringIO import. - waterdata.ratings switches its import from nwis._read_rdb to rdb.read_rdb / extract_rdb_comment, dropping the local _extract_rdb_comment helper. The cross-module-private-import comment block goes away — the dependency is now a clean same-package import. 7 new unit tests pin the shared parser's behavior: basic shape parsing, format-spec-row skipping, dtype passthrough, all-comments empty-result, HTML rejection, and the comment extractor. Net stat: +177 / -72. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/nwis.py | 76 +++++++------------------ dataretrieval/rdb.py | 89 ++++++++++++++++++++++++++++++ dataretrieval/waterdata/ratings.py | 19 +------ tests/rdb_test.py | 65 ++++++++++++++++++++++ 4 files changed, 177 insertions(+), 72 deletions(-) create mode 100644 dataretrieval/rdb.py create mode 100644 tests/rdb_test.py diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 0bcb1d68..b9d7fdd5 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -7,12 +7,12 @@ from __future__ import annotations import warnings -from io import StringIO from json import JSONDecodeError import pandas as pd import requests +from dataretrieval.rdb import read_rdb as _read_rdb_text from dataretrieval.utils import BaseMetadata from .utils import query @@ -1017,65 +1017,29 @@ def _read_json(json): return merged_df -def _read_rdb(rdb): - """ - Convert NWIS rdb table into a ``pandas.dataframe``. +# NWIS-specific column dtype hints; pandas silently ignores unknown +# names, so passing the dict to read_rdb is safe even on responses +# whose columns don't include any of these. +_NWIS_RDB_DTYPES = { + "site_no": str, + "dec_long_va": float, + "dec_lat_va": float, + "parm_cd": str, + "parameter_cd": str, +} - Parameters - ---------- - rdb: string - A string representation of an rdb table - Returns - ------- - df: ``pandas.dataframe`` - A formatted pandas data frame +def _read_rdb(rdb): + """Parse an NWIS RDB response and apply NWIS-specific post-processing. + Thin wrapper around :func:`dataretrieval.rdb.read_rdb` that adds the + NWIS column-dtype hints and runs :func:`format_response` (datetime + index, multi-site MultiIndex, optional GeoDataFrame). """ - if "" in rdb.lower() or "" in rdb.lower(): - raise ValueError( - "Received HTML response instead of RDB. This often indicates " - "that the service has been moved or is currently unavailable." 
- ) - - count = 0 - lines = rdb.splitlines() - - for line in lines: - # ignore comment lines - if line.startswith("#"): - count = count + 1 - - else: - break - - if count >= len(lines): - # All lines are comments — the service returned no data rows (e.g. - # "No sites found matching all criteria"). This is a legitimate empty - # result, so return an empty DataFrame rather than raising. - return pd.DataFrame() - - fields = lines[count].split("\t") - fields = [field.replace(",", "").strip() for field in fields if field.strip()] - dtypes = { - "site_no": str, - "dec_long_va": float, - "dec_lat_va": float, - "parm_cd": str, - "parameter_cd": str, - } - - df = pd.read_csv( - StringIO(rdb), - delimiter="\t", - skiprows=count + 2, - names=fields, - na_values="NaN", - dtype=dtypes, - ) - - df = format_response(df) - return df + df = _read_rdb_text(rdb, dtypes=_NWIS_RDB_DTYPES) + if df.empty: + return df + return format_response(df) def _check_sites_value_types(sites): diff --git a/dataretrieval/rdb.py b/dataretrieval/rdb.py new file mode 100644 index 00000000..a8cbf48d --- /dev/null +++ b/dataretrieval/rdb.py @@ -0,0 +1,89 @@ +"""Format-agnostic parser for the USGS RDB tab-separated text format. + +RDB (Relational DataBase) is a USGS-internal text format used by NWIS web +services and by the Water Data STAC catalog's rating-curve assets. Every +RDB file has the same shape: + +- One or more ``#``-prefixed comment lines carrying provenance metadata + (data source, retrieval timestamp, station name, parameter codes, etc.). +- A tab-separated header row naming each column. +- A second tab-separated row giving column format specs (e.g. ``5s 15s``); + it is informational only and skipped during parsing. +- Tab-separated data rows. + +This module exposes the parsing primitives that both ``dataretrieval.nwis`` +and ``dataretrieval.waterdata.ratings`` use. Callers layer their own +post-processing (NWIS-specific datetime indexing, ratings-specific +``df.attrs`` provenance, etc.) 
on top of the raw frame. +""" + +from __future__ import annotations + +from io import StringIO + +import pandas as pd + + +def read_rdb(rdb: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: + """Parse an RDB text response into a ``pandas.DataFrame``. + + Parameters + ---------- + rdb : str + The RDB text response from a USGS web service. + dtypes : dict[str, type] or None, optional + Optional column-name to dtype hints, forwarded to + ``pandas.read_csv``. Unknown column names are silently ignored, so + callers may safely pass a dict of all columns they might be + interested in. + + Returns + ------- + pandas.DataFrame + The parsed data. An RDB consisting only of comment lines (e.g. a + "no sites found" response) returns an empty DataFrame rather than + raising. + + Raises + ------ + ValueError + If the response body looks like HTML, which usually means the + service has been moved, is degraded, or returned an error page. + """ + if "" in rdb.lower() or "" in rdb.lower(): + raise ValueError( + "Received HTML response instead of RDB. This often indicates " + "that the service has been moved or is currently unavailable." + ) + + lines = rdb.splitlines() + header_idx = next( + (i for i, line in enumerate(lines) if not line.startswith("#")), + len(lines), + ) + if header_idx >= len(lines): + # All lines are comments — a legitimate empty result. + return pd.DataFrame() + + fields = lines[header_idx].split("\t") + fields = [f.replace(",", "").strip() for f in fields if f.strip()] + + return pd.read_csv( + StringIO(rdb), + delimiter="\t", + skiprows=header_idx + 2, # +1 for header, +1 for the format-spec row + names=fields, + na_values="NaN", + dtype=dtypes, + ) + + +def extract_rdb_comment(rdb: str) -> list[str]: + """Return the RDB ``#``-prefixed comment block as a list of header lines. 
+ + The comment block carries provenance metadata that is otherwise lost + during parsing — data source, retrieval timestamp, parameter codes, + rating id and last-shifted timestamp for ratings, etc. R's + ``dataRetrieval`` exposes the equivalent via ``comment(df)``. + """ + return [line for line in rdb.splitlines() if line.startswith("#")] diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index d40e7dcb..a1d0a3bb 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -19,10 +19,7 @@ import pandas as pd import requests -# Rating files use the same USGS RDB shape as NWIS responses, so we reuse -# the parser already in ``nwis``. ``_read_rdb`` is private; if it ever -# moves we want a loud failure here, hence the explicit import. -from dataretrieval.nwis import _read_rdb +from dataretrieval.rdb import extract_rdb_comment, read_rdb from .utils import _DURATION_RE, BASE_URL, _default_headers, _format_api_dates @@ -245,16 +242,6 @@ def _search( return response.json().get("features", []) -def _extract_rdb_comment(rdb: str) -> list[str]: - """Return the RDB ``#``-prefixed comment block as a list of header lines. - - Carries useful per-rating provenance — rating id, parameter description, - expansion type, last-shifted timestamp, etc. R's ``read_waterdata_ratings`` - exposes the equivalent via ``comment(df)``. 
- """ - return [line for line in rdb.splitlines() if line.startswith("#")] - - def _download_and_parse( feature: dict[str, Any], file_path: str | None, @@ -269,7 +256,7 @@ def _download_and_parse( with open(os.path.join(file_path, feature["id"]), "w") as f: f.write(response.text) - df = _read_rdb(response.text) - df.attrs["comment"] = _extract_rdb_comment(response.text) + df = read_rdb(response.text) + df.attrs["comment"] = extract_rdb_comment(response.text) df.attrs["url"] = url return df diff --git a/tests/rdb_test.py b/tests/rdb_test.py new file mode 100644 index 00000000..99f46ff5 --- /dev/null +++ b/tests/rdb_test.py @@ -0,0 +1,65 @@ +import pandas as pd +import pytest + +from dataretrieval.rdb import extract_rdb_comment, read_rdb + +# A minimally complete RDB: comment block, header row, format-spec row, +# data rows. Both NWIS responses and ratings RDBs share this shape. +_BASIC_RDB = """\ +# header line one +# header line two +agency_cd\tsite_no\tINDEP\tDEP +5s\t15s\t10n\t10n +USGS\t01104475\t0.10\t0.0 +USGS\t01104475\t0.20\t0.5 +USGS\t01104475\t0.30\t1.2 +""" + + +def test_read_rdb_parses_basic_shape(): + df = read_rdb(_BASIC_RDB) + assert list(df.columns) == ["agency_cd", "site_no", "INDEP", "DEP"] + assert len(df) == 3 + assert df["INDEP"].tolist() == [0.10, 0.20, 0.30] + + +def test_read_rdb_skips_format_spec_row(): + """The "5s 15s 10n 10n" row is metadata, not data.""" + df = read_rdb(_BASIC_RDB) + # If the format-spec row had been treated as data, df would have 4 rows + # and "5s" / "15s" would appear in the parsed values. + assert "5s" not in df["agency_cd"].tolist() + + +def test_read_rdb_dtype_hints_applied(): + """Caller-supplied dtype hints are forwarded to pandas; unknown names ignored.""" + df = read_rdb(_BASIC_RDB, dtypes={"site_no": str, "DEP": float, "unknown": int}) + # Without the str hint, pandas would parse "01104475" as int and drop the + # leading zero. 
Check the values, not the dtype name (which varies across + # pandas versions: object, StringDtype, etc.). + assert df["site_no"].iloc[0] == "01104475" + assert df["DEP"].dtype == float + + +def test_read_rdb_empty_when_only_comments(): + """All-comments input is a legitimate "no data" response, not an error.""" + df = read_rdb("# only a comment\n# and another\n") + assert isinstance(df, pd.DataFrame) + assert df.empty + + +def test_read_rdb_raises_on_html_response(): + """If the service returns an HTML error page, surface it loudly.""" + with pytest.raises(ValueError, match="HTML"): + read_rdb("Service Unavailable") + with pytest.raises(ValueError, match="HTML"): + read_rdb("\n...") + + +def test_extract_rdb_comment_returns_only_hash_lines(): + comments = extract_rdb_comment(_BASIC_RDB) + assert comments == ["# header line one", "# header line two"] + + +def test_extract_rdb_comment_empty_when_no_comments(): + assert extract_rdb_comment("a\tb\n1\t2\n") == [] From ad1486a28fd33ce1ae91ac958e1bcdfb4ac96384 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 6 May 2026 13:50:48 -0500 Subject: [PATCH 7/7] Address /simplify findings on the rdb refactor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aggregated quality-review fixes; no behavior change. Tests: 7 rdb + 11 ratings + 28 nwis pass (was 30 nwis; -2 are pure-duplicate coverage now lived in rdb_test.py). Source: - dataretrieval/rdb.py: rename `rdb` parameter -> `text` on both read_rdb and extract_rdb_comment ("rdb" is the format, not the thing; reads better as `read_rdb(text=...)`). Drop "USGS-internal" from the module docstring (the format is publicly served). Folded the two-pass `[f.replace ... for f in fields if f.strip()]` into one comprehension + a single empty-string filter. Replaced the defensive `>=` on the all-comments guard with `==` (it's exact after `next(..., len(lines))`). 
Spelled out in extract_rdb_comment's docstring that the returned lines retain their leading `#` and whitespace, matching R. - dataretrieval/nwis.py: drop the `read_rdb as _read_rdb_text` import alias — the local name is too close to the wrapper `_read_rdb` and confused readers. Move `_NWIS_RDB_DTYPES` from near the bottom of the file to the top alongside `_CRS`, where module-level constants live. Drop the narrating "pandas silently ignores unknown names..." comment (already in the read_rdb docstring). Drop the `if df.empty: return df` guard in the wrapper — `format_response()` already short-circuits cleanly on an empty frame (no `dec_lat_va` -> skip GeoDataFrame branch; no `datetime` -> early return). Tests: - tests/nwis_test.py: drop the two pure-duplicate cases (`test_valid_rdb_returns_dataframe` and `test_all_comments_returns_empty_dataframe`) whose coverage now lives in tests/rdb_test.py. Keep and refocus the remaining `test_no_sites_flows_through_format_response` to exercise the wrapper-specific contract (now load-bearing after dropping the empty-frame guard): an empty parser result must still flow through format_response without raising. Issue-#171 regression intent preserved in the docstring. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/nwis.py | 27 +++++++++--------------- dataretrieval/rdb.py | 39 +++++++++++++++++----------------- tests/nwis_test.py | 49 +++++++++++++------------------------------ 3 files changed, 45 insertions(+), 70 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index b9d7fdd5..ec8d2537 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -12,7 +12,7 @@ import pandas as pd import requests -from dataretrieval.rdb import read_rdb as _read_rdb_text +from dataretrieval.rdb import read_rdb from dataretrieval.utils import BaseMetadata from .utils import query @@ -44,6 +44,14 @@ # NAD83 _CRS = "EPSG:4269" +_NWIS_RDB_DTYPES = { + "site_no": str, + "dec_long_va": float, + "dec_lat_va": float, + "parm_cd": str, + "parameter_cd": str, +} + def _parse_json_or_raise(response: requests.Response) -> pd.DataFrame: """Parse a JSON NWIS response, raising a helpful error on HTML responses.""" @@ -1017,18 +1025,6 @@ def _read_json(json): return merged_df -# NWIS-specific column dtype hints; pandas silently ignores unknown -# names, so passing the dict to read_rdb is safe even on responses -# whose columns don't include any of these. -_NWIS_RDB_DTYPES = { - "site_no": str, - "dec_long_va": float, - "dec_lat_va": float, - "parm_cd": str, - "parameter_cd": str, -} - - def _read_rdb(rdb): """Parse an NWIS RDB response and apply NWIS-specific post-processing. @@ -1036,10 +1032,7 @@ def _read_rdb(rdb): NWIS column-dtype hints and runs :func:`format_response` (datetime index, multi-site MultiIndex, optional GeoDataFrame). 
""" - df = _read_rdb_text(rdb, dtypes=_NWIS_RDB_DTYPES) - if df.empty: - return df - return format_response(df) + return format_response(read_rdb(rdb, dtypes=_NWIS_RDB_DTYPES)) def _check_sites_value_types(sites): diff --git a/dataretrieval/rdb.py b/dataretrieval/rdb.py index a8cbf48d..2b52656b 100644 --- a/dataretrieval/rdb.py +++ b/dataretrieval/rdb.py @@ -1,8 +1,8 @@ -"""Format-agnostic parser for the USGS RDB tab-separated text format. +"""Parser for the USGS RDB tab-separated text format. -RDB (Relational DataBase) is a USGS-internal text format used by NWIS web -services and by the Water Data STAC catalog's rating-curve assets. Every -RDB file has the same shape: +RDB (Relational DataBase) is the text format used by NWIS web services +and by the Water Data STAC catalog's rating-curve assets. Every RDB +file has the same shape: - One or more ``#``-prefixed comment lines carrying provenance metadata (data source, retrieval timestamp, station name, parameter codes, etc.). @@ -24,12 +24,12 @@ import pandas as pd -def read_rdb(rdb: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: +def read_rdb(text: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: """Parse an RDB text response into a ``pandas.DataFrame``. Parameters ---------- - rdb : str + text : str The RDB text response from a USGS web service. dtypes : dict[str, type] or None, optional Optional column-name to dtype hints, forwarded to @@ -50,26 +50,26 @@ def read_rdb(rdb: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: If the response body looks like HTML, which usually means the service has been moved, is degraded, or returned an error page. """ - if "" in rdb.lower() or "" in rdb.lower(): + if "" in text.lower() or "" in text.lower(): raise ValueError( "Received HTML response instead of RDB. This often indicates " "that the service has been moved or is currently unavailable." 
) - lines = rdb.splitlines() + lines = text.splitlines() header_idx = next( (i for i, line in enumerate(lines) if not line.startswith("#")), len(lines), ) - if header_idx >= len(lines): + if header_idx == len(lines): # All lines are comments — a legitimate empty result. return pd.DataFrame() - fields = lines[header_idx].split("\t") - fields = [f.replace(",", "").strip() for f in fields if f.strip()] + fields = [f.replace(",", "").strip() for f in lines[header_idx].split("\t")] + fields = [f for f in fields if f] return pd.read_csv( - StringIO(rdb), + StringIO(text), delimiter="\t", skiprows=header_idx + 2, # +1 for header, +1 for the format-spec row names=fields, @@ -78,12 +78,13 @@ def read_rdb(rdb: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: ) -def extract_rdb_comment(rdb: str) -> list[str]: - """Return the RDB ``#``-prefixed comment block as a list of header lines. +def extract_rdb_comment(text: str) -> list[str]: + """Return the RDB ``#``-prefixed comment block, raw and in original order. - The comment block carries provenance metadata that is otherwise lost - during parsing — data source, retrieval timestamp, parameter codes, - rating id and last-shifted timestamp for ratings, etc. R's - ``dataRetrieval`` exposes the equivalent via ``comment(df)``. + Each entry includes its leading ``#`` and any whitespace, matching what + R's ``dataRetrieval`` returns from ``comment(df)``. The comment block + carries provenance metadata that is otherwise lost during parsing — + data source, retrieval timestamp, parameter codes, rating id and + last-shifted timestamp for ratings, etc. 
""" - return [line for line in rdb.splitlines() if line.startswith("#")] + return [line for line in text.splitlines() if line.startswith("#")] diff --git a/tests/nwis_test.py b/tests/nwis_test.py index c52775a4..a42ba509 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -325,43 +325,24 @@ def test_variable_info_deprecated(self): class TestReadRdb: - """Tests for the _read_rdb helper. + """Tests for the NWIS-specific _read_rdb wrapper. - Notes - ----- - Related to GitHub Issue #171. + The format-agnostic parser is exercised in tests/rdb_test.py; this + class pins the wrapper-specific contract — that an empty parser + result flows through format_response without crashing (issue #171). """ - # Minimal valid RDB response with one data row - _VALID_RDB = "# comment\nsite_no\tvalue\n5s\t10n\n01491000\t42\n" - - # NWIS response when no sites match the query criteria - _NO_SITES_RDB = ( - "# //Output-Format: RDB\n" - "# //Response-Status: OK\n" - "# //Response-Message: No sites found matching all criteria\n" - ) - - def test_valid_rdb_returns_dataframe(self): - """_read_rdb returns a DataFrame for a well-formed RDB response.""" - df = _read_rdb(self._VALID_RDB) - assert isinstance(df, pd.DataFrame) - assert "site_no" in df.columns - - def test_no_sites_returns_empty_dataframe(self): - """_read_rdb returns an empty DataFrame when NWIS finds no matching sites. - - A "No sites found" response is a legitimate empty result, not an error, - so callers can check ``df.empty`` rather than catching an exception. - Regression test for issue #171 (previously raised IndexError). + def test_no_sites_flows_through_format_response(self): + """A "No sites found" response is a legitimate empty result, not an + error, so callers can check ``df.empty`` rather than catching an + exception. Regression for issue #171 (previously raised IndexError), + which now also covers the empty-frame path through ``format_response``. 
""" - df = _read_rdb(self._NO_SITES_RDB) - assert isinstance(df, pd.DataFrame) - assert df.empty - - def test_all_comments_returns_empty_dataframe(self): - """_read_rdb returns an empty DataFrame when the response has only comments.""" - rdb = "# just a comment\n# another comment\n" - df = _read_rdb(rdb) + no_sites_rdb = ( + "# //Output-Format: RDB\n" + "# //Response-Status: OK\n" + "# //Response-Message: No sites found matching all criteria\n" + ) + df = _read_rdb(no_sites_rdb) assert isinstance(df, pd.DataFrame) assert df.empty