From 0e79222dd795e2980d0f3f6889a3b06c390b0b8c Mon Sep 17 00:00:00 2001
From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com>
Date: Tue, 12 May 2026 08:33:10 +1000
Subject: [PATCH 1/3] Cache robots.txt rules per domain

---
 CHANGELOG.md                       |   8 +-
 internal/jobs/manager.go           |  89 ++++++++++++-
 internal/jobs/robots_cache_test.go | 201 +++++++++++++++++++++++++++++
 3 files changed, 290 insertions(+), 8 deletions(-)
 create mode 100644 internal/jobs/robots_cache_test.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa274966..5e97614c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,7 +28,13 @@ On merge, CI will:
 
 ## [Unreleased]
 
-_Add unreleased changes here._
+### Changed
+
+- `JobManager.GetRobotsRules` now caches results per normalised domain (1h
+  positive TTL, 60s negative TTL), and collapses concurrent misses onto a single
+  origin fetch via singleflight. A long crawl previously refetched `/robots.txt`
+  every five minutes (stream worker's job-info TTL) and a 429 on `/robots.txt`
+  returned on the next read; both are now bounded.
 
 ## Full changelog history
 
diff --git a/internal/jobs/manager.go b/internal/jobs/manager.go
index c1c10504..1c5af21f 100644
--- a/internal/jobs/manager.go
+++ b/internal/jobs/manager.go
@@ -21,6 +21,7 @@ import (
 	"github.com/good-native/hover/internal/util"
 	"github.com/google/uuid"
 	"github.com/lib/pq"
+	"golang.org/x/sync/singleflight"
 )
 
 var jobsLog = logging.Component("jobs")
@@ -96,12 +97,39 @@ type JobManager struct {
 	pagesMutex     sync.RWMutex
 
 	sitemapSem chan struct{} // caps concurrent sitemap batch insertions across all jobs
+
+	// In-memory cache for robots.txt rules. Previously every job-info
+	// cache miss in the stream worker (5 min TTL) refetched /robots.txt;
+	// a long crawl hammered the origin and produced repeat 429s. Negative
+	// entries are kept too so a 429 on /robots.txt doesn't get re-hit
+	// every Release. Single-flight collapses concurrent misses for the
+	// same domain into one origin fetch.
+	robotsCache  map[string]robotsCacheEntry
+	robotsMutex  sync.RWMutex
+	robotsGroup  singleflight.Group
+	robotsTTLPos time.Duration // success TTL
+	robotsTTLNeg time.Duration // failure / nil-rules TTL
+}
+
+type robotsCacheEntry struct {
+	rules     *crawler.RobotsRules
+	err       error
+	expiresAt time.Time
 }
 
 type ProgressMilestoneCallback func(ctx context.Context, jobID string, oldPct, newPct int)
 
 type JobTerminatedCallback func(ctx context.Context, jobID string)
 
+// Defaults sized to swallow the stream worker's 5-minute job-info refresh
+// without re-hitting the origin: 1h success TTL means a long crawl fetches
+// /robots.txt at most a handful of times. Negative TTL is short so a
+// transient 429 on /robots.txt unblocks within ~60s.
+const (
+	defaultRobotsTTLPositive = time.Hour
+	defaultRobotsTTLNegative = 60 * time.Second
+)
+
 func NewJobManager(db *sql.DB, dbQueue DbQueueProvider, crawler CrawlerInterface) *JobManager {
 	sitemapConcurrency := sitemapInsertConcurrency()
 	return &JobManager{
@@ -111,6 +139,9 @@ func NewJobManager(db *sql.DB, dbQueue DbQueueProvider, crawler CrawlerInterface
 		processedPages:     make(map[string]struct{}),
 		lastMilestoneFired: make(map[string]int),
 		sitemapSem:         make(chan struct{}, sitemapConcurrency),
+		robotsCache:        make(map[string]robotsCacheEntry),
+		robotsTTLPos:       defaultRobotsTTLPositive,
+		robotsTTLNeg:       defaultRobotsTTLNegative,
 	}
 }
 
@@ -368,19 +399,63 @@ func (jm *JobManager) setupJobDatabase(ctx context.Context, job *Job, normalised
 const wafCacheTTL = 24 * time.Hour
 
 // Returns nil (no error) when the crawler is unavailable; callers treat
-// nil rules as "no restriction".
+// nil rules as "no restriction". Results are cached per normalised domain:
+// successful fetches for robotsTTLPos, errors for robotsTTLNeg. Concurrent
+// misses for the same domain collapse onto a single origin fetch via
+// singleflight so a job-info burst can't trigger N parallel /robots.txt
+// requests.
 func (jm *JobManager) GetRobotsRules(ctx context.Context, domain string) (*crawler.RobotsRules, error) {
 	if jm.crawler == nil {
 		return nil, nil
 	}
-	result, err := jm.crawler.DiscoverSitemapsAndRobots(ctx, domain)
-	if err != nil {
-		return nil, fmt.Errorf("jobs: fetch robots for %s: %w", domain, err)
+
+	// util.NormaliseDomain strips scheme/www/trailing slash but does not
+	// lowercase, so do that here to collapse case-variant inputs onto
+	// one cache entry.
+	key := strings.ToLower(util.NormaliseDomain(domain))
+	now := time.Now()
+
+	jm.robotsMutex.RLock()
+	if entry, ok := jm.robotsCache[key]; ok && now.Before(entry.expiresAt) {
+		jm.robotsMutex.RUnlock()
+		return entry.rules, entry.err
 	}
-	if result == nil {
-		return nil, nil
+	jm.robotsMutex.RUnlock()
+
+	type robotsResult struct {
+		rules *crawler.RobotsRules
+		err   error
 	}
-	return result.RobotsRules, nil
+	val, _, _ := jm.robotsGroup.Do(key, func() (any, error) {
+		result, fetchErr := jm.crawler.DiscoverSitemapsAndRobots(ctx, domain)
+		out := robotsResult{}
+		if fetchErr != nil {
+			out.err = fmt.Errorf("jobs: fetch robots for %s: %w", domain, fetchErr)
+		} else if result != nil {
+			out.rules = result.RobotsRules
+		}
+
+		ttl := jm.robotsTTLPos
+		if out.err != nil {
+			ttl = jm.robotsTTLNeg
+		}
+
+		jm.robotsMutex.Lock()
+		jm.robotsCache[key] = robotsCacheEntry{
+			rules:     out.rules,
+			err:       out.err,
+			expiresAt: time.Now().Add(ttl),
+		}
+		jm.robotsMutex.Unlock()
+
+		// singleflight.Do returns the inner error to all sharers; return
+		// nil here and propagate err via the typed payload so a context
+		// cancellation in one caller doesn't poison the cached failure.
+		return out, nil
+	})
+
+	res := val.(robotsResult)
+	return res.rules, res.err
 }
 
 func (jm *JobManager) validateRootURLAccess(ctx context.Context, job *Job, normalisedDomain string, rootPath string) (*crawler.RobotsRules, error) {
diff --git a/internal/jobs/robots_cache_test.go b/internal/jobs/robots_cache_test.go
new file mode 100644
index 00000000..72a0f60d
--- /dev/null
+++ b/internal/jobs/robots_cache_test.go
@@ -0,0 +1,201 @@
+package jobs
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/good-native/hover/internal/crawler"
+)
+
+// stubRobotsCrawler implements CrawlerInterface but only the
+// DiscoverSitemapsAndRobots method is exercised by the cache. Everything
+// else returns zero values; tests must not call them.
+type stubRobotsCrawler struct {
+	discoverFn func(ctx context.Context, domain string) (*crawler.SitemapDiscoveryResult, error)
+	calls      atomic.Int32
+}
+
+func (s *stubRobotsCrawler) DiscoverSitemapsAndRobots(ctx context.Context, domain string) (*crawler.SitemapDiscoveryResult, error) {
+	s.calls.Add(1)
+	return s.discoverFn(ctx, domain)
+}
+
+func (s *stubRobotsCrawler) WarmURL(context.Context, string, bool) (*crawler.CrawlResult, error) {
+	return nil, nil
+}
+func (s *stubRobotsCrawler) ParseSitemap(context.Context, string) ([]string, error) {
+	return nil, nil
+}
+func (s *stubRobotsCrawler) FilterURLs(urls []string, _, _ []string) []string { return urls }
+func (s *stubRobotsCrawler) GetUserAgent() string                             { return "test" }
+func (s *stubRobotsCrawler) Probe(context.Context, string) (crawler.WAFDetection, error) {
+	return crawler.WAFDetection{}, nil
+}
+
+func newJobManagerWithCrawler(c CrawlerInterface) *JobManager {
+	return &JobManager{
+		crawler:      c,
+		robotsCache:  make(map[string]robotsCacheEntry),
+		robotsTTLPos: defaultRobotsTTLPositive,
+		robotsTTLNeg: defaultRobotsTTLNegative,
+	}
+}
+
+func TestGetRobotsRules_CachesSuccessForPositiveTTL(t *testing.T) {
+	rules := &crawler.RobotsRules{CrawlDelay: 5}
+	stub := &stubRobotsCrawler{
+		discoverFn: func(context.Context, string) (*crawler.SitemapDiscoveryResult, error) {
+			return &crawler.SitemapDiscoveryResult{RobotsRules: rules}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	ctx := context.Background()
+
+	for i := 0; i < 10; i++ {
+		got, err := jm.GetRobotsRules(ctx, "example.com")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != rules {
+			t.Fatalf("expected cached rules pointer, got %v", got)
+		}
+	}
+	if c := stub.calls.Load(); c != 1 {
+		t.Fatalf("expected one origin fetch under positive cache, got %d", c)
+	}
+}
+
+func TestGetRobotsRules_CachesErrorForNegativeTTL(t *testing.T) {
+	fetchErr := errors.New("429 Too Many Requests")
+	stub := &stubRobotsCrawler{
+		discoverFn: func(context.Context, string) (*crawler.SitemapDiscoveryResult, error) {
+			return nil, fetchErr
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	ctx := context.Background()
+
+	for i := 0; i < 5; i++ {
+		_, err := jm.GetRobotsRules(ctx, "throttled.com")
+		if err == nil {
+			t.Fatalf("expected wrapped fetch error, got nil")
+		}
+		if !errors.Is(err, fetchErr) {
+			t.Fatalf("expected wrapped 429 error, got %v", err)
+		}
+	}
+	if c := stub.calls.Load(); c != 1 {
+		t.Fatalf("expected one origin fetch under negative cache, got %d", c)
+	}
+}
+
+func TestGetRobotsRules_RefetchesAfterPositiveTTL(t *testing.T) {
+	rules := &crawler.RobotsRules{}
+	stub := &stubRobotsCrawler{
+		discoverFn: func(context.Context, string) (*crawler.SitemapDiscoveryResult, error) {
+			return &crawler.SitemapDiscoveryResult{RobotsRules: rules}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	jm.robotsTTLPos = 0 // every read is a miss
+	ctx := context.Background()
+
+	for i := 0; i < 3; i++ {
+		if _, err := jm.GetRobotsRules(ctx, "expiring.com"); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+	}
+	if c := stub.calls.Load(); c != 3 {
+		t.Fatalf("expected 3 origin fetches with zero TTL, got %d", c)
+	}
+}
+
+func TestGetRobotsRules_RefetchesAfterNegativeTTLExpires(t *testing.T) {
+	var attempts atomic.Int32
+	stub := &stubRobotsCrawler{
+		discoverFn: func(context.Context, string) (*crawler.SitemapDiscoveryResult, error) {
+			if attempts.Add(1) == 1 {
+				return nil, errors.New("transient 429")
+			}
+			return &crawler.SitemapDiscoveryResult{RobotsRules: &crawler.RobotsRules{}}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	jm.robotsTTLNeg = 0 // failure entry expires immediately
+	ctx := context.Background()
+
+	if _, err := jm.GetRobotsRules(ctx, "recover.com"); err == nil {
+		t.Fatalf("expected initial failure")
+	}
+	// Negative entry is already expired — second call must refetch and succeed.
+	if _, err := jm.GetRobotsRules(ctx, "recover.com"); err != nil {
+		t.Fatalf("expected recovery on refetch, got %v", err)
+	}
+	if c := stub.calls.Load(); c != 2 {
+		t.Fatalf("expected 2 origin fetches after negative TTL expiry, got %d", c)
+	}
+}
+
+func TestGetRobotsRules_NormalisesDomain(t *testing.T) {
+	rules := &crawler.RobotsRules{}
+	stub := &stubRobotsCrawler{
+		discoverFn: func(context.Context, string) (*crawler.SitemapDiscoveryResult, error) {
+			return &crawler.SitemapDiscoveryResult{RobotsRules: rules}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	ctx := context.Background()
+
+	if _, err := jm.GetRobotsRules(ctx, "https://Example.com/"); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if _, err := jm.GetRobotsRules(ctx, "example.com"); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if c := stub.calls.Load(); c != 1 {
+		t.Fatalf("normalised keys should share cache; got %d origin fetches", c)
+	}
+}
+
+func TestGetRobotsRules_CollapsesConcurrentMisses(t *testing.T) {
+	gate := make(chan struct{})
+	stub := &stubRobotsCrawler{
+		discoverFn: func(context.Context, string) (*crawler.SitemapDiscoveryResult, error) {
+			<-gate
+			return &crawler.SitemapDiscoveryResult{RobotsRules: &crawler.RobotsRules{}}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	ctx := context.Background()
+
+	const fanout = 20
+	var wg sync.WaitGroup
+	wg.Add(fanout)
+	for i := 0; i < fanout; i++ {
+		go func() {
+			defer wg.Done()
+			_, _ = jm.GetRobotsRules(ctx, "swarm.com")
+		}()
+	}
+
+	// Wait until the discoverFn is parked on the gate, then release.
+	deadline := time.Now().Add(time.Second)
+	for time.Now().Before(deadline) && stub.calls.Load() == 0 {
+		time.Sleep(2 * time.Millisecond)
+	}
+	close(gate)
+	wg.Wait()
+
+	if c := stub.calls.Load(); c != 1 {
+		t.Fatalf("singleflight should collapse %d concurrent misses to one fetch, got %d", fanout, c)
+	}
+}
+
+// Compile-time guard that the stub satisfies CrawlerInterface; if a method
+// is added to the interface, this assertion breaks at build rather than
+// at test time.
+var _ CrawlerInterface = (*stubRobotsCrawler)(nil)

From d04f5c7b42665ef18f6a401dcedd44fed883573f Mon Sep 17 00:00:00 2001
From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com>
Date: Tue, 12 May 2026 08:54:19 +1000
Subject: [PATCH 2/3] Address CodeRabbit robots cache feedback

---
 internal/jobs/manager.go           | 43 +++++++++++++++++-----------
 internal/jobs/robots_cache_test.go | 46 ++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/internal/jobs/manager.go b/internal/jobs/manager.go
index 1c5af21f..48c0531d 100644
--- a/internal/jobs/manager.go
+++ b/internal/jobs/manager.go
@@ -409,10 +409,10 @@ func (jm *JobManager) GetRobotsRules(ctx context.Context, domain string) (*crawl
 		return nil, nil
 	}
 
-	// util.NormaliseDomain strips scheme/www/trailing slash but does not
-	// lowercase, so do that here to collapse case-variant inputs onto
-	// one cache entry.
-	key := strings.ToLower(util.NormaliseDomain(domain))
+	// Lowercase first: util.NormaliseDomain uses case-sensitive
+	// TrimPrefix for the scheme/www strips, so an upper-case "HTTPS://"
+	// would otherwise survive and split the cache.
+	key := util.NormaliseDomain(strings.ToLower(domain))
 	now := time.Now()
 
 	jm.robotsMutex.RLock()
@@ -427,26 +427,37 @@ func (jm *JobManager) GetRobotsRules(ctx context.Context, domain string) (*crawl
 		err   error
 	}
 	val, _, _ := jm.robotsGroup.Do(key, func() (any, error) {
-		result, fetchErr := jm.crawler.DiscoverSitemapsAndRobots(ctx, domain)
+		// Fetch with the canonical key so the request and the cache
+		// entry agree on the origin string. Without this, a scheme- or
+		// case-variant input would fetch under the raw form while the
+		// failure outcome got stored against the normalised key.
+		result, fetchErr := jm.crawler.DiscoverSitemapsAndRobots(ctx, key)
 		out := robotsResult{}
 		if fetchErr != nil {
-			out.err = fmt.Errorf("jobs: fetch robots for %s: %w", domain, fetchErr)
+			out.err = fmt.Errorf("jobs: fetch robots for %s: %w", key, fetchErr)
 		} else if result != nil {
 			out.rules = result.RobotsRules
 		}
 
-		ttl := jm.robotsTTLPos
-		if out.err != nil {
-			ttl = jm.robotsTTLNeg
-		}
+		// Don't negative-cache caller-side cancellations: one timed-out
+		// caller would otherwise poison the shared cache for everyone
+		// else for `robotsTTLNeg`.
+		transient := out.err != nil && (errors.Is(fetchErr, context.Canceled) ||
+			errors.Is(fetchErr, context.DeadlineExceeded))
 
-		jm.robotsMutex.Lock()
-		jm.robotsCache[key] = robotsCacheEntry{
-			rules:     out.rules,
-			err:       out.err,
-			expiresAt: time.Now().Add(ttl),
+		if !transient {
+			ttl := jm.robotsTTLPos
+			if out.err != nil {
+				ttl = jm.robotsTTLNeg
+			}
+			jm.robotsMutex.Lock()
+			jm.robotsCache[key] = robotsCacheEntry{
+				rules:     out.rules,
+				err:       out.err,
+				expiresAt: time.Now().Add(ttl),
+			}
+			jm.robotsMutex.Unlock()
 		}
-		jm.robotsMutex.Unlock()
 
 		// singleflight.Do returns the inner error to all sharers; return
 		// nil here and propagate err via the typed payload so a context
diff --git a/internal/jobs/robots_cache_test.go b/internal/jobs/robots_cache_test.go
index 72a0f60d..5a6e9ea7 100644
--- a/internal/jobs/robots_cache_test.go
+++ b/internal/jobs/robots_cache_test.go
@@ -195,6 +195,52 @@ func TestGetRobotsRules_CollapsesConcurrentMisses(t *testing.T) {
 	}
 }
 
+func TestGetRobotsRules_DoesNotCacheContextCancellation(t *testing.T) {
+	var attempts atomic.Int32
+	stub := &stubRobotsCrawler{
+		discoverFn: func(_ context.Context, _ string) (*crawler.SitemapDiscoveryResult, error) {
+			if attempts.Add(1) == 1 {
+				return nil, context.Canceled
+			}
+			return &crawler.SitemapDiscoveryResult{RobotsRules: &crawler.RobotsRules{}}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+	jm.robotsTTLNeg = time.Hour // would normally suppress a refetch
+
+	// First call surfaces the cancellation error but must NOT cache it.
+	if _, err := jm.GetRobotsRules(context.Background(), "cancel.com"); !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected wrapped context.Canceled, got %v", err)
+	}
+
+	// Second call must refetch and succeed — a transient cancel cannot
+	// poison the shared cache.
+	if _, err := jm.GetRobotsRules(context.Background(), "cancel.com"); err != nil {
+		t.Fatalf("expected recovery on refetch, got %v", err)
+	}
+	if c := stub.calls.Load(); c != 2 {
+		t.Fatalf("expected 2 origin fetches when cancellation is not cached, got %d", c)
+	}
+}
+
+func TestGetRobotsRules_FetchesUnderCanonicalDomain(t *testing.T) {
+	var seen string
+	stub := &stubRobotsCrawler{
+		discoverFn: func(_ context.Context, domain string) (*crawler.SitemapDiscoveryResult, error) {
+			seen = domain
+			return &crawler.SitemapDiscoveryResult{RobotsRules: &crawler.RobotsRules{}}, nil
+		},
+	}
+	jm := newJobManagerWithCrawler(stub)
+
+	if _, err := jm.GetRobotsRules(context.Background(), "HTTPS://www.Example.COM/"); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if seen != "example.com" {
+		t.Fatalf("origin fetch must use the canonical key; got %q", seen)
+	}
+}
+
 // Compile-time guard that the stub satisfies CrawlerInterface; if a method
 // is added to the interface, this assertion breaks at build rather than
 // at test time.

From 8c961059e460538d4ab526a62da0f9cefc8eb689 Mon Sep 17 00:00:00 2001
From: Simon Smallchua <40650011+simonsmallchua@users.noreply.github.com>
Date: Tue, 12 May 2026 08:58:49 +1000
Subject: [PATCH 3/3] Assert concurrent caller errors in cache test

---
 internal/jobs/robots_cache_test.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/internal/jobs/robots_cache_test.go b/internal/jobs/robots_cache_test.go
index 5a6e9ea7..2cc4c42f 100644
--- a/internal/jobs/robots_cache_test.go
+++ b/internal/jobs/robots_cache_test.go
@@ -174,11 +174,14 @@ func TestGetRobotsRules_CollapsesConcurrentMisses(t *testing.T) {
 
 	const fanout = 20
 	var wg sync.WaitGroup
+	errCh := make(chan error, fanout)
 	wg.Add(fanout)
 	for i := 0; i < fanout; i++ {
 		go func() {
 			defer wg.Done()
-			_, _ = jm.GetRobotsRules(ctx, "swarm.com")
+			if _, err := jm.GetRobotsRules(ctx, "swarm.com"); err != nil {
+				errCh <- err
+			}
 		}()
 	}
 
@@ -189,6 +192,10 @@ func TestGetRobotsRules_CollapsesConcurrentMisses(t *testing.T) {
 	}
 	close(gate)
 	wg.Wait()
+	close(errCh)
+	for err := range errCh {
+		t.Fatalf("unexpected error from concurrent caller: %v", err)
+	}
 
 	if c := stub.calls.Load(); c != 1 {
 		t.Fatalf("singleflight should collapse %d concurrent misses to one fetch, got %d", fanout, c)