Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved.

**05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**.

**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
Expand Down
32 changes: 21 additions & 11 deletions dataretrieval/waterdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,17 @@ def _error_body(resp: requests.Response):
)


def _raise_for_non_200(resp: requests.Response) -> None:
    """Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200.

    Used by both the initial-request gate and the pagination loops; without
    the loop-side check, a 5xx body lacking ``numberReturned`` would be
    silently treated as an empty page and pagination would stop quietly.
    """
    # Success path: nothing to do for a 200.
    if resp.status_code == 200:
        return
    raise RuntimeError(_error_body(resp))


def _construct_api_requests(
service: str,
properties: list[str] | None = None,
Expand Down Expand Up @@ -612,8 +623,7 @@ def _walk_pages(
client = client or requests.Session()
try:
resp = client.send(req)
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))
_raise_for_non_200(resp)

# Store the initial response for metadata
initial_response = resp
Expand All @@ -635,11 +645,12 @@ def _walk_pages(
headers=headers,
data=content if method == "POST" else None,
)
_raise_for_non_200(resp)
dfs.append(_get_resp_data(resp, geopd=geopd))
curr_url = _next_req_url(resp)
except Exception: # noqa: BLE001
error_text = _error_body(resp)
logger.error("Request incomplete. %s", error_text)
except Exception as e: # noqa: BLE001
# `resp` may be stale (prior page) if client.request() raised.
logger.error("Request incomplete: %s", e)
logger.warning(
"Request failed for URL: %s. Data download interrupted.", curr_url
)
Expand Down Expand Up @@ -1072,8 +1083,7 @@ def get_stats_data(

try:
resp = client.send(req)
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))
_raise_for_non_200(resp)

# Store the initial response for metadata
initial_response = resp
Expand All @@ -1099,14 +1109,14 @@ def get_stats_data(
params=args,
headers=headers,
)
_raise_for_non_200(resp)
body = resp.json()
all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
next_token = body["next"]
except Exception: # noqa: BLE001
error_text = _error_body(resp)
logger.error("Request incomplete. %s", error_text)
except Exception as e: # noqa: BLE001
logger.error("Request incomplete: %s", e)
logger.warning(
"Request failed for URL: %s. Data download interrupted.", resp.url
"Request failed for URL: %s. Data download interrupted.", url
)
next_token = None

Expand Down
78 changes: 78 additions & 0 deletions tests/waterdata_utils_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from unittest import mock

import pandas as pd
Expand Down Expand Up @@ -86,6 +87,83 @@ def test_walk_pages_multiple_mocked():
assert mock_client.request.call_args[0][1] == "https://example.com/page2"


def _resp_ok(features):
"""Build a 200-OK mock response carrying the given features list."""
resp = mock.MagicMock()
resp.json.return_value = {
"numberReturned": len(features),
"features": features,
"links": [{"rel": "next", "href": "https://example.com/page2"}]
if features
else [],
}
resp.headers = {}
resp.links = {"next": {"url": "https://example.com/page2"}} if features else {}
resp.status_code = 200
resp.url = "https://example.com/page1"
return resp


def _walk_pages_with_failure(failure_resp_or_exc):
    """Run _walk_pages where page 1 succeeds and page 2 fails as given."""
    first_page = _resp_ok([{"id": "1", "properties": {"val": "a"}}])

    client = mock.MagicMock(spec=requests.Session)
    client.send.return_value = first_page
    # An exception instance is raised by client.request(); anything else
    # (a mock response) is returned from it.
    if isinstance(failure_resp_or_exc, BaseException):
        client.request.side_effect = failure_resp_or_exc
    else:
        client.request.return_value = failure_resp_or_exc

    prepared = mock.MagicMock(spec=requests.PreparedRequest)
    prepared.method = "GET"
    prepared.headers = {}
    prepared.url = "https://example.com/page1"

    return _walk_pages(geopd=False, req=prepared, client=client)


def test_walk_pages_logs_actual_exception_when_request_raises(caplog):
    """Exception from client.request() must be logged with its actual message."""
    caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils")

    df, _ = _walk_pages_with_failure(requests.ConnectionError("boom"))

    # Best-effort behavior: the successfully fetched first page survives.
    assert list(df["val"]) == ["a"]

    logged = [
        rec.getMessage() for rec in caplog.records if rec.levelno >= logging.ERROR
    ]
    # The real ConnectionError text must appear, not a stale page body.
    assert any("boom" in msg for msg in logged), logged


def test_walk_pages_surfaces_5xx_mid_pagination(caplog):
    """A non-200 mid-pagination response must be logged, not silently swallowed."""
    caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils")

    bad_page = mock.MagicMock()
    bad_page.status_code = 503
    bad_page.url = "https://example.com/page2"
    bad_page.json.return_value = {
        "code": "ServiceUnavailable",
        "description": "upstream timeout",
    }

    df, _ = _walk_pages_with_failure(bad_page)

    # First page's data is still returned (best-effort behavior).
    assert list(df["val"]) == ["a"]

    logged = [
        rec.getMessage() for rec in caplog.records if rec.levelno >= logging.ERROR
    ]
    # The 5xx is now visible in the error log (it would previously have
    # been silently swallowed because _get_resp_data returned an empty
    # frame and the loop stopped quietly).
    assert any("503" in msg or "ServiceUnavailable" in msg for msg in logged), logged


def test_handle_stats_nesting_tolerates_missing_drop_columns():
"""If the upstream stats response shape ever changes such that one of
the columns we try to drop ("type", "properties.data") is absent, the
Expand Down