From 4442ed3c3fad00e2a4d5cc84d34cbe452aa3a9fd Mon Sep 17 00:00:00 2001 From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com> Date: Tue, 12 May 2026 20:38:20 +1000 Subject: [PATCH] Silence autoscaler empty-prom error --- CHANGELOG.md | 18 ++++++++++++++++++ fly.autoscaler-analysis.toml | 3 ++- fly.autoscaler-worker.toml | 14 +++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf8aa426..2717f014 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,24 @@ On merge, CI will: ## [Unreleased] +### Fixed + +- `fly-autoscaler` no longer logs + `metrics collection failed: empty prometheus result` once a minute on both + `hover-autoscaler-worker` and `hover-autoscaler-analysis`. The broker gauges + (`bee_broker_stream_length`, `bee_broker_scheduled_zset_depth`) are + synchronous OTel `Int64Gauge`s, which only emit when `Record()` lands inside a + collect interval; during idle the series goes stale in Fly's managed + Prometheus and the autoscaler's PromQL returns no result. The autoscaler + queries now wrap with `or on() vector(0)` so an empty result collapses to zero + rather than erroring. Scaling behaviour is unchanged at idle (the existing + `max(1, …)` floor already keeps a single machine running). Trade-off documented + inline: a true Redis outage now reads `0` instead of producing a series gap, + so the autoscaler scales to `MIN=1` rather than holding count — acceptable + because idle workers can't crawl during an outage anyway and restart cleanly + once Redis recovers. The full fix (async observable gauges) is tracked in a + follow-up issue. + ### Security - Bump `github.com/jackc/pgx/v5` from v5.7.6 to v5.9.2 to resolve a diff --git a/fly.autoscaler-analysis.toml b/fly.autoscaler-analysis.toml index 430d3b58..9e34d876 100644 --- a/fly.autoscaler-analysis.toml +++ b/fly.autoscaler-analysis.toml @@ -21,7 +21,8 @@ primary_region = "syd" # metric is emitted with app=hover-worker. 
We're scaling hover-analysis # based on metrics emitted by hover-worker — that's intentional, and the # filter must match the emitter, not the target. - FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"lighthouse\"})" + # `or on() vector(0)` — see fly.autoscaler-worker.toml for rationale. + FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"lighthouse\"}) or on() vector(0)" # 25 lighthouse tasks per machine before a scale-up trigger. Cap at 10. # Sized off observed audit durations (p50 ~30s, p90 ~65s) so a single diff --git a/fly.autoscaler-worker.toml b/fly.autoscaler-worker.toml index 8d292146..f2258cd7 100644 --- a/fly.autoscaler-worker.toml +++ b/fly.autoscaler-worker.toml @@ -29,7 +29,19 @@ primary_region = "syd" # [metrics] block in fly.worker.toml. Token is FlyV1 readonly. FAS_PROMETHEUS_ADDRESS = "https://api.fly.io/prometheus/personal" FAS_PROMETHEUS_METRIC_NAME = "worker_backlog" - FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"worker\"}) + sum(bee_broker_scheduled_zset_depth{app=\"hover-worker\"})" + # `or on() vector(0)` collapses an empty result to zero so fly-autoscaler + # doesn't log `empty prometheus result` whenever the broker gauges go + # stale (sync OTel Int64Gauges only emit on Record, so idle ticks produce + # series gaps). The trade-off: a real Redis outage now reads 0 instead of + # gapping — autoscaler scales to MIN=1 rather than holding count. Idle + # workers can't crawl during an outage anyway, and they restart cleanly + # once Redis recovers. + # NOTE(review): PromQL `+` returns empty when either operand is empty, so + # if only one gauge went stale this whole expression would read 0 and drop + # the live gauge's value. Both gauges presumably emit on the same collect + # tick and go stale together — TODO confirm; if that ever diverges, wrap + # each sum in its own `or on() vector(0)` instead. + FAS_PROMETHEUS_QUERY = "(sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"worker\"}) + sum(bee_broker_scheduled_zset_depth{app=\"hover-worker\"})) or on() vector(0)" # Worker autoscaling is plumbed but effectively dormant. The crawl # workers are I/O-bound and per-job concurrency is bounded by