From 13b90322b279cf80924788b850208c04b5af7569 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 8 May 2026 16:44:10 -0500 Subject: [PATCH 1/2] Surface real failures in paginated waterdata responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in _walk_pages (and the parallel get_stats_data pagination loop) caused silent or misleading failures when a paginated request was interrupted mid-walk: 1. The except handler called _error_body(resp) — but when client.request() itself raised (ConnectionError, Timeout, etc.), `resp` still pointed at the *previous successful page*. The "incomplete" log message therefore described the wrong request, and on a non-JSON 200 body would itself raise inside resp.json() and bury the original failure. 2. No status-code check was performed on paginated responses. The initial request guards `if resp.status_code != 200`, but the loop doesn't. A 5xx body that didn't include "numberReturned" was silently turned into an empty DataFrame by _get_resp_data, the `next` link wasn't found, and the loop quietly exited — handing the user truncated data with no error logged anywhere. This affects every paginated waterdata getter (get_daily, get_continuous, get_monitoring_locations, …). Fix: - Guard the loop body with `if resp.status_code != 200: raise RuntimeError(_error_body(resp))`, mirroring the initial request. - Capture the exception with `as e` and log `e` instead of touching the stale `resp`. Same change in get_stats_data. Behavior preserved: on failure the loop still logs and returns whatever pages were collected ("best effort"). The change is purely to make the failure observable and the log message accurate. Two new tests cover (a) network-exception mid-pagination and (b) 5xx mid-pagination, asserting that the actual failure surfaces in the error log. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/utils.py | 22 +++++--- tests/waterdata_utils_test.py | 87 ++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2faaeb42..548d4878 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. + **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 784f2969..7756e101 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -635,11 +635,18 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) + # Mirror the initial-request check; otherwise a 5xx body + # without "numberReturned" silently yields an empty frame + # and the loop quietly stops. + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + # Report the *actual* failure — not _error_body(resp) on a + # stale prior-page response (which would describe the wrong + # request and can itself raise on non-JSON bodies). + logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url ) @@ -1099,14 +1106,15 @@ def get_stats_data( params=args, headers=headers, ) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + logger.error("Request incomplete: %s", e) logger.warning( - "Request failed for URL: %s. Data download interrupted.", resp.url + "Request failed for URL: %s. Data download interrupted.", url ) next_token = None diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 8151ab37..3008be15 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -86,6 +86,93 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" +def _resp_ok(features): + """Build a 200-OK mock response carrying the given features list.""" + resp = mock.MagicMock() + resp.json.return_value = { + "numberReturned": len(features), + "features": features, + "links": [{"rel": "next", "href": "https://example.com/page2"}] + if features + else [], + } + resp.headers = {} + resp.links = {"next": {"url": "https://example.com/page2"}} if features else {} + resp.status_code = 200 + resp.url = "https://example.com/page1" + return resp + + +def _walk_pages_with_failure(failure_resp_or_exc): + """Run _walk_pages where page 1 succeeds and page 2 fails as given.""" + resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}]) + + mock_client = mock.MagicMock(spec=requests.Session) + mock_client.send.return_value = resp1 + if isinstance(failure_resp_or_exc, BaseException): + mock_client.request.side_effect = failure_resp_or_exc + else: + mock_client.request.return_value = failure_resp_or_exc + + mock_req = mock.MagicMock(spec=requests.PreparedRequest) + mock_req.method = "GET" + mock_req.headers = {} + mock_req.url = "https://example.com/page1" + + return _walk_pages(geopd=False, req=mock_req, client=mock_client) + + +def test_walk_pages_logs_actual_exception_when_request_raises(caplog): + """When client.request() itself raises (e.g. a connection error), the + logged failure must reflect that exception — not _error_body() on the + stale prior-page response, which described the wrong request and could + itself crash on non-JSON bodies.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) + + # First page's data is preserved (best-effort behavior). + assert list(df["val"]) == ["a"] + # Logged error mentions the actual ConnectionError, not a stale page body. + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + assert any("boom" in m for m in error_messages), error_messages + + +def test_walk_pages_surfaces_5xx_mid_pagination(caplog): + """A non-200 response on a paginated page must be detected; previously + the loop happily called _get_resp_data() on the 5xx body, which — + lacking 'numberReturned' — silently produced an empty DataFrame and + the loop quietly stopped with no error logged.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + page2_500 = mock.MagicMock() + page2_500.status_code = 503 + page2_500.json.return_value = { + "code": "ServiceUnavailable", + "description": "upstream timeout", + } + page2_500.url = "https://example.com/page2" + + df, _ = _walk_pages_with_failure(page2_500) + + assert list(df["val"]) == ["a"] + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + # The 5xx is now visible in the error log (it would previously have + # been silently swallowed because _get_resp_data returned an empty + # frame and the loop stopped quietly). + assert any("503" in m or "ServiceUnavailable" in m for m in error_messages), ( + error_messages + ) + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the From 13a63fc4e030cb4b4c9d2106f75caf36debd6869 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 12 May 2026 19:48:48 -0500 Subject: [PATCH 2/2] Extract _raise_for_non_200 helper and trim PR #273 comments The status-code guard appeared four times in waterdata/utils.py (initial-request and pagination paths in both _walk_pages and get_stats_data) with bit-identical bodies. Extract into a single named helper; the helper's docstring carries the "silent-empty-frame" WHY that the inline comment was apologizing for. Also: - trim the second multi-line comment in _walk_pages to a single line noting the actual constraint (`resp` may be stale) - hoist `import logging` to module level in waterdata_utils_test.py - condense the two new test docstrings to one-line contract statements; the bug history lives in the commit message Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 30 ++++++++++++++++-------------- tests/waterdata_utils_test.py | 15 +++------------ 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7756e101..c36bbe33 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -368,6 +368,17 @@ def _error_body(resp: requests.Response): ) +def _raise_for_non_200(resp: requests.Response) -> None: + """Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200. + + Used by both the initial-request gate and the pagination loops; without + the loop-side check, a 5xx body lacking ``numberReturned`` would be + silently treated as an empty page and pagination would stop quietly. + """ + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) + + def _construct_api_requests( service: str, properties: list[str] | None = None, @@ -612,8 +623,7 @@ def _walk_pages( client = client or requests.Session() try: resp = client.send(req) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) # Store the initial response for metadata initial_response = resp @@ -635,17 +645,11 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) - # Mirror the initial-request check; otherwise a 5xx body - # without "numberReturned" silently yields an empty frame - # and the loop quietly stops. - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 - # Report the *actual* failure — not _error_body(resp) on a - # stale prior-page response (which would describe the wrong - # request and can itself raise on non-JSON bodies). + # `resp` may be stale (prior page) if client.request() raised. logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url @@ -1079,8 +1083,7 @@ def get_stats_data( try: resp = client.send(req) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) # Store the initial response for metadata initial_response = resp @@ -1106,8 +1109,7 @@ def get_stats_data( params=args, headers=headers, ) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 3008be15..37d97c43 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,3 +1,4 @@ +import logging from unittest import mock import pandas as pd @@ -123,12 +124,7 @@ def _walk_pages_with_failure(failure_resp_or_exc): def test_walk_pages_logs_actual_exception_when_request_raises(caplog): - """When client.request() itself raises (e.g. a connection error), the - logged failure must reflect that exception — not _error_body() on the - stale prior-page response, which described the wrong request and could - itself crash on non-JSON bodies.""" - import logging - + """Exception from client.request() must be logged with its actual message.""" caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) @@ -143,12 +139,7 @@ def test_walk_pages_logs_actual_exception_when_request_raises(caplog): def test_walk_pages_surfaces_5xx_mid_pagination(caplog): - """A non-200 response on a paginated page must be detected; previously - the loop happily called _get_resp_data() on the 5xx body, which — - lacking 'numberReturned' — silently produced an empty DataFrame and - the loop quietly stopped with no error logged.""" - import logging - + """A non-200 mid-pagination response must be logged, not silently swallowed.""" caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") page2_500 = mock.MagicMock()