From 4442ed3c3fad00e2a4d5cc84d34cbe452aa3a9fd Mon Sep 17 00:00:00 2001 From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com> Date: Tue, 12 May 2026 20:38:20 +1000 Subject: [PATCH] Silence autoscaler empty-prom error --- CHANGELOG.md | 18 ++++++++++++++++++ fly.autoscaler-analysis.toml | 3 ++- fly.autoscaler-worker.toml | 14 +++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf8aa426..2717f014 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,24 @@ On merge, CI will: ## [Unreleased] +### Fixed + +- `fly-autoscaler` no longer logs + `metrics collection failed: empty prometheus result` once a minute on both + `hover-autoscaler-worker` and `hover-autoscaler-analysis`. The broker gauges + (`bee_broker_stream_length`, `bee_broker_scheduled_zset_depth`) are + synchronous OTel `Int64Gauge`s, which only emit when `Record()` lands inside a + collect interval; during idle the series goes stale in Fly's managed + Prometheus and the autoscaler's PromQL returns no result. The autoscaler + queries now wrap with `or on() vector(0)` so an empty result collapses to zero + rather than erroring. Scaling behaviour is unchanged at idle (the existing + `max(1, …)` floor already keeps a single machine running). Trade-off documented + inline: a true Redis outage now reads `0` instead of producing a series gap, + so the autoscaler scales to `MIN=1` rather than holding count — acceptable + because idle workers can't crawl during an outage anyway and restart cleanly + once Redis recovers. The full fix (async observable gauges) is tracked in a + follow-up issue. + ### Security - Bump `github.com/jackc/pgx/v5` from v5.7.6 to v5.9.2 to resolve a diff --git a/fly.autoscaler-analysis.toml b/fly.autoscaler-analysis.toml index 430d3b58..9e34d876 100644 --- a/fly.autoscaler-analysis.toml +++ b/fly.autoscaler-analysis.toml @@ -21,7 +21,8 @@ primary_region = "syd" # metric is emitted with app=hover-worker. 
We're scaling hover-analysis # based on metrics emitted by hover-worker — that's intentional, and the # filter must match the emitter, not the target. - FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"lighthouse\"})" + # `or on() vector(0)` — see fly.autoscaler-worker.toml for rationale. + FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"lighthouse\"}) or on() vector(0)" # 25 lighthouse tasks per machine before a scale-up trigger. Cap at 10. # Sized off observed audit durations (p50 ~30s, p90 ~65s) so a single diff --git a/fly.autoscaler-worker.toml b/fly.autoscaler-worker.toml index 8d292146..f2258cd7 100644 --- a/fly.autoscaler-worker.toml +++ b/fly.autoscaler-worker.toml @@ -29,7 +29,19 @@ primary_region = "syd" # [metrics] block in fly.worker.toml. Token is FlyV1 readonly. FAS_PROMETHEUS_ADDRESS = "https://api.fly.io/prometheus/personal" FAS_PROMETHEUS_METRIC_NAME = "worker_backlog" - FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"worker\"}) + sum(bee_broker_scheduled_zset_depth{app=\"hover-worker\"})" + # `or on() vector(0)` collapses an empty result to zero so fly-autoscaler + # doesn't log `empty prometheus result` whenever the broker gauges go + # stale (sync OTel Int64Gauges only emit on Record, so idle ticks produce + # series gaps). The trade-off: a real Redis outage now reads 0 instead of + # gapping — autoscaler scales to MIN=1 rather than holding count. Idle + # workers can't crawl during an outage anyway, and they restart cleanly + # once Redis recovers. + # NOTE(review): PromQL `+` returns empty when either operand is empty, so + # if only one gauge went stale this whole expression would read 0 and drop + # the live gauge's value. Both gauges presumably emit on the same collect + # tick and go stale together — TODO confirm; if that ever diverges, wrap + # each sum in its own `or on() vector(0)` instead. + FAS_PROMETHEUS_QUERY = "(sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"worker\"}) + sum(bee_broker_scheduled_zset_depth{app=\"hover-worker\"})) or on() vector(0)" # Worker autoscaling is plumbed but effectively dormant. The crawl # workers are I/O-bound and per-job concurrency is bounded by