diff --git a/CHANGELOG.md b/CHANGELOG.md index e5009349..2717f014 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,24 @@ On merge, CI will: ## [Unreleased] +### Fixed + +- `fly-autoscaler` no longer logs + `metrics collection failed: empty prometheus result` once a minute on both + `hover-autoscaler-worker` and `hover-autoscaler-analysis`. The broker gauges + (`bee_broker_stream_length`, `bee_broker_scheduled_zset_depth`) are + synchronous OTel `Int64Gauge`s, which only emit when `Record()` lands inside a + collect interval; during idle the series goes stale in Fly's managed + Prometheus and the autoscaler's PromQL returns no result. The autoscaler + queries now wrap with `or on() vector(0)` so an empty result collapses to zero + rather than erroring. Scaling behaviour is unchanged at idle (the existing + `max(1, …)` floor already kept a single machine running). Trade-off documented + inline: a true Redis outage now reads `0` instead of producing a series gap, + so the autoscaler scales to `MIN=1` rather than holding count — acceptable + because idle workers can't crawl during an outage anyway and restart cleanly + once Redis recovers. The full fix (async observable gauges) is tracked in a + follow-up issue. + ### Security - Bump `github.com/jackc/pgx/v5` from v5.7.6 to v5.9.2 to resolve a @@ -39,6 +57,19 @@ On merge, CI will: ## Full changelog history +## [0.34.13] – 2026-05-12 + +### Fixed + +- App, worker, and analysis binaries no longer `Fatal` on the first Redis `PING` + failure at startup. The ping is now wrapped in a bounded retry loop (30 s + total, 3 s per attempt, capped exponential backoff) so the binary rides out + the Upstash-on-Fly cold-start window that briefly closes connections with EOF + on freshly-provisioned review apps. Production behaviour is unchanged — a + healthy Redis still succeeds on the first attempt and persistent + misconfiguration still fails fast. 
Resolves the recurring EOF burst on every + PR preview deploy (Sentry: HOVER-JX, HOVER-MD, HOVER-JZ). + ## [0.34.12] – 2026-05-12 ### Changed diff --git a/cmd/analysis/main.go b/cmd/analysis/main.go index edb96626..102c98fc 100644 --- a/cmd/analysis/main.go +++ b/cmd/analysis/main.go @@ -95,7 +95,7 @@ func main() { analysisLog.Fatal("failed to create Redis client", "error", err) } defer redisClient.Close() - if err := redisClient.Ping(context.Background()); err != nil { + if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil { analysisLog.Fatal("failed to ping Redis", "error", err) } analysisLog.Info("connected to Redis") diff --git a/cmd/app/main.go b/cmd/app/main.go index fcd9105c..355a554c 100644 --- a/cmd/app/main.go +++ b/cmd/app/main.go @@ -547,7 +547,7 @@ func main() { redisClient = client defer redisClient.Close() - if err := redisClient.Ping(context.Background()); err != nil { + if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil { startupLog.Fatal("failed to ping Redis", "error", err) } startupLog.Info("connected to Redis") diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 811eb629..d1c5d29d 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -115,7 +115,7 @@ func main() { } defer redisClient.Close() - if err := redisClient.Ping(context.Background()); err != nil { + if err := redisClient.PingWithRetry(context.Background(), 30*time.Second, 3*time.Second); err != nil { workerLog.Fatal("failed to ping Redis", "error", err) } workerLog.Info("connected to Redis") diff --git a/fly.autoscaler-analysis.toml b/fly.autoscaler-analysis.toml index 430d3b58..9e34d876 100644 --- a/fly.autoscaler-analysis.toml +++ b/fly.autoscaler-analysis.toml @@ -21,7 +21,8 @@ primary_region = "syd" # metric is emitted with app=hover-worker. 
We're scaling hover-analysis # based on metrics emitted by hover-worker — that's intentional, and the # filter must match the emitter, not the target. - FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"lighthouse\"})" + # `or on() vector(0)` — see fly.autoscaler-worker.toml for rationale. + FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"lighthouse\"}) or on() vector(0)" # 25 lighthouse tasks per machine before a scale-up trigger. Cap at 10. # Sized off observed audit durations (p50 ~30s, p90 ~65s) so a single diff --git a/fly.autoscaler-worker.toml b/fly.autoscaler-worker.toml index 8d292146..f2258cd7 100644 --- a/fly.autoscaler-worker.toml +++ b/fly.autoscaler-worker.toml @@ -29,7 +29,14 @@ primary_region = "syd" # [metrics] block in fly.worker.toml. Token is FlyV1 readonly. FAS_PROMETHEUS_ADDRESS = "https://api.fly.io/prometheus/personal" FAS_PROMETHEUS_METRIC_NAME = "worker_backlog" - FAS_PROMETHEUS_QUERY = "sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"worker\"}) + sum(bee_broker_scheduled_zset_depth{app=\"hover-worker\"})" + # `or on() vector(0)` collapses an empty result to zero so fly-autoscaler + # doesn't log `empty prometheus result` whenever the broker gauges go + # stale (sync OTel Int64Gauges only emit on Record, so idle ticks produce + # series gaps). The trade-off: a real Redis outage now reads 0 instead of + # gapping — autoscaler scales to MIN=1 rather than holding count. Idle + # workers can't crawl during an outage anyway, and they restart cleanly + # once Redis recovers. + FAS_PROMETHEUS_QUERY = "(sum(bee_broker_stream_length{app=\"hover-worker\",stream_type=\"worker\"}) + sum(bee_broker_scheduled_zset_depth{app=\"hover-worker\"})) or on() vector(0)" # Worker autoscaling is plumbed but effectively dormant. 
The crawl
 # workers are I/O-bound and per-job concurrency is bounded by
diff --git a/internal/broker/redis.go b/internal/broker/redis.go
index 053a9dad..f14d92d2 100644
--- a/internal/broker/redis.go
+++ b/internal/broker/redis.go
@@ -77,6 +77,68 @@ func (c *Client) Ping(ctx context.Context) error {
 	return c.rdb.Ping(ctx).Err()
 }
 
+// PingWithRetry retries Ping with capped exponential backoff until the
+// total budget elapses. Each attempt uses its own perAttempt timeout so
+// a hung server can't stall the loop. Returns nil on first success or
+// the last error if the budget is exhausted. Exists because Upstash-on-Fly
+// review apps occasionally close the first PING with EOF during cold
+// start, which used to Fatal the binary on boot (see HOVER-JX/MD/JZ).
+func (c *Client) PingWithRetry(ctx context.Context, total, perAttempt time.Duration) error {
+	return pingWithRetry(ctx, total, perAttempt, c.Ping)
+}
+
+func pingWithRetry(ctx context.Context, total, perAttempt time.Duration, ping func(context.Context) error) error {
+	deadline := time.Now().Add(total)
+	backoff := 100 * time.Millisecond
+	const maxBackoff = 2 * time.Second
+
+	var lastErr error
+	for {
+		remaining := time.Until(deadline)
+		if remaining <= 0 {
+			if lastErr != nil {
+				return lastErr
+			}
+			return context.DeadlineExceeded
+		}
+
+		attemptTimeout := perAttempt
+		if attemptTimeout > remaining {
+			attemptTimeout = remaining
+		}
+		attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout)
+		err := ping(attemptCtx)
+		cancel()
+		if err == nil {
+			return nil
+		}
+		lastErr = err
+
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+
+		sleep := backoff
+		if remaining := time.Until(deadline); sleep > remaining {
+			sleep = remaining
+		}
+		if sleep <= 0 {
+			return lastErr
+		}
+
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(sleep):
+		}
+
+		backoff *= 2
+		if backoff > maxBackoff {
+			backoff = maxBackoff
+		}
+	}
+}
+
 func (c *Client) Close() error {
 	return c.rdb.Close()
 }
diff --git 
a/internal/broker/redis_test.go b/internal/broker/redis_test.go
index 82c8bb75..cd72bfcc 100644
--- a/internal/broker/redis_test.go
+++ b/internal/broker/redis_test.go
@@ -2,14 +2,74 @@ package broker
 
 import (
 	"context"
+	"errors"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/redis/go-redis/v9"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 
+// TestPingWithRetry covers the helper that wraps Ping with a bounded
+// retry loop. It uses a stub ping function so the test stays
+// hermetic and fast — no miniredis, no sleeps longer than the
+// helper's first backoff (100 ms).
+func TestPingWithRetry(t *testing.T) {
+	t.Run("immediate success", func(t *testing.T) {
+		var calls int
+		err := pingWithRetry(context.Background(), time.Second, 100*time.Millisecond,
+			func(context.Context) error { calls++; return nil })
+		require.NoError(t, err)
+		assert.Equal(t, 1, calls)
+	})
+
+	t.Run("succeeds after transient errors", func(t *testing.T) {
+		var calls int
+		err := pingWithRetry(context.Background(), 2*time.Second, 100*time.Millisecond,
+			func(context.Context) error {
+				calls++
+				if calls < 3 {
+					return errors.New("EOF")
+				}
+				return nil
+			})
+		require.NoError(t, err)
+		assert.Equal(t, 3, calls)
+	})
+
+	t.Run("exhausts budget returning last error", func(t *testing.T) {
+		want := errors.New("still EOF")
+		err := pingWithRetry(context.Background(), 250*time.Millisecond, 50*time.Millisecond,
+			func(context.Context) error { return want })
+		require.ErrorIs(t, err, want)
+	})
+
+	t.Run("aborts on context cancellation", func(t *testing.T) {
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+		err := pingWithRetry(ctx, time.Second, 100*time.Millisecond,
+			func(context.Context) error { return errors.New("boom") })
+		require.ErrorIs(t, err, context.Canceled)
+	})
+
+	// Guards against a regression where perAttempt > total let the
+	// final attempt run past the overall budget. The ping function
+	// blocks until its per-attempt context expires, so without the
+	// clamp the call would take perAttempt rather than total.
+	t.Run("clamps attempt timeout to remaining budget", func(t *testing.T) {
+		start := time.Now()
+		err := pingWithRetry(context.Background(), 80*time.Millisecond, time.Second,
+			func(ctx context.Context) error {
+				<-ctx.Done()
+				return ctx.Err()
+			})
+		require.Error(t, err)
+		assert.LessOrEqual(t, time.Since(start), 250*time.Millisecond)
+	})
+}
+
 // TestClient_ClearAll seeds every prefix the broker owns plus an
 // unrelated key, then asserts ClearAll wipes only the hover:* keys.
 func TestClient_ClearAll(t *testing.T) {