From 7d657f6aa28e61d726c95f8d9979bdff1623be47 Mon Sep 17 00:00:00 2001
From: Todd Short
Date: Thu, 30 Apr 2026 10:14:12 -0400
Subject: [PATCH] fix(e2e): use per-component namespaces in HA and upgrade steps

The e2e test infrastructure assumed all OLM components (operator-controller
and catalogd) live in the same namespace, but some distributions deploy them
in separate namespaces. This caused the HA scenario to fail because
ComponentIsReadyToReconcile searched for the catalogd deployment in the
operator-controller namespace.

detectOLMDeployments now returns both deployments from a single API call, and
BeforeSuite populates a componentNamespaces map so each component's operations
(deployment lookup, rollout, lease, logs, pod fetch) use the correct namespace.

Co-Authored-By: Claude Sonnet 4.6
Signed-off-by: Todd Short
---
 test/e2e/steps/ha_steps.go      |  4 +--
 test/e2e/steps/hooks.go         | 57 ++++++++++++++++++++-------------
 test/e2e/steps/steps.go         | 17 ++++++++--
 test/e2e/steps/upgrade_steps.go | 31 ++++++++++--------
 4 files changed, 68 insertions(+), 41 deletions(-)

diff --git a/test/e2e/steps/ha_steps.go b/test/e2e/steps/ha_steps.go
index 285917850..1d5d9844c 100644
--- a/test/e2e/steps/ha_steps.go
+++ b/test/e2e/steps/ha_steps.go
@@ -31,7 +31,7 @@ func CatalogdLeaderPodIsForceDeleted(ctx context.Context) error {
 	}
 
 	logger.Info("Force-deleting catalogd leader pod", "pod", leaderPod)
-	if _, err := k8sClient("delete", "pod", leaderPod, "-n", olmNamespace,
+	if _, err := k8sClient("delete", "pod", leaderPod, "-n", namespaceForComponent("catalogd"),
 		"--force", "--grace-period=0"); err != nil {
 		return fmt.Errorf("failed to force-delete catalogd leader pod %q: %w", leaderPod, err)
 	}
@@ -46,7 +46,7 @@ func NewCatalogdLeaderIsElected(ctx context.Context) error {
 	oldLeader := sc.leaderPods["catalogd"]
 
 	waitFor(ctx, func() bool {
-		holder, err := k8sClient("get", "lease", leaseNames["catalogd"], "-n", olmNamespace,
+		holder, err := k8sClient("get", "lease", leaseNames["catalogd"], "-n", namespaceForComponent("catalogd"),
 			"-o", "jsonpath={.spec.holderIdentity}")
 		if err != nil || holder == "" {
 			return false
diff --git a/test/e2e/steps/hooks.go b/test/e2e/steps/hooks.go
index b7062ba89..d34ccd31e 100644
--- a/test/e2e/steps/hooks.go
+++ b/test/e2e/steps/hooks.go
@@ -108,22 +108,33 @@ func RegisterHooks(sc *godog.ScenarioContext) {
 	sc.After(ScenarioCleanup)
 }
 
-func detectOLMDeployment() (*appsv1.Deployment, error) {
+// detectOLMDeployments returns the operator-controller deployment (first) and the catalogd
+// deployment (second) found via the app.kubernetes.io/part-of=olm label across all namespaces.
+// The catalogd return value may be nil when OLM is not yet installed (upgrade scenarios
+// install it in a Background step).
+func detectOLMDeployments() (*appsv1.Deployment, *appsv1.Deployment, error) {
 	raw, err := k8sClient("get", "deployments", "-A", "-l", "app.kubernetes.io/part-of=olm",
 		"-o", "jsonpath={.items}")
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	dl := []appsv1.Deployment{}
 	if err := json.Unmarshal([]byte(raw), &dl); err != nil {
-		return nil, fmt.Errorf("failed to unmarshal OLM deployments: %v", err)
+		return nil, nil, fmt.Errorf("failed to unmarshal OLM deployments: %v", err)
 	}
-	for _, d := range dl {
-		if d.Name == olmDeploymentName {
-			return &d, nil
+	var operatorController, catalogd *appsv1.Deployment
+	for i := range dl {
+		switch dl[i].Name {
+		case olmDeploymentName:
+			operatorController = &dl[i]
+		case catalogdDeploymentName:
+			catalogd = &dl[i]
 		}
 	}
-	return nil, fmt.Errorf("failed to detect OLM Deployment")
+	if operatorController == nil {
+		return nil, nil, fmt.Errorf("failed to detect OLM Deployment")
+	}
+	return operatorController, catalogd, nil
 }
 
 func BeforeSuite() {
@@ -141,27 +152,29 @@ func BeforeSuite() {
 		featureGates[catalogdHAFeature] = true
 	}
 
-	olm, err := detectOLMDeployment()
+	olm, catalogdDep, err := detectOLMDeployments()
 	if err != nil {
 		logger.Info("OLM deployments not found; skipping feature gate detection (upgrade scenarios will install OLM in Background)")
 		return
 	}
 	olmNamespace = olm.Namespace
+	componentNamespaces["operator-controller"] = olmNamespace
 
-	// Refine CatalogdHA based on actual catalogd replica count now that
-	// olmNamespace is known. The node-count check above can fire on any
-	// multi-node cluster even when catalogd runs with only 1 replica.
-	// Override the gate: HA scenarios require ≥2 catalogd replicas.
-	// If the deployment is not found (kubectl error), or the replica
-	// count is empty/non-numeric, the gate keeps whatever the node-count
-	// check set.
-	if repOut, err := k8sClient("get", "deployments", "-n", olmNamespace,
-		"-l", "app.kubernetes.io/name=catalogd",
-		"-o", "jsonpath={.items[0].spec.replicas}"); err == nil {
-		if repOut = strings.TrimSpace(repOut); repOut != "" {
-			if replicas, err := strconv.Atoi(repOut); err == nil {
-				featureGates[catalogdHAFeature] = replicas >= 2
-			}
+	// Catalogd may be in a different namespace than operator-controller.
+	catalogdNS := olmNamespace
+	if catalogdDep != nil {
+		catalogdNS = catalogdDep.Namespace
+	}
+	componentNamespaces["catalogd"] = catalogdNS
+
+	// Refine CatalogdHA based on actual catalogd replica count now that catalogdNS is
+	// known. The node-count check above can fire on any multi-node cluster even when
+	// catalogd runs with only 1 replica. Override the gate: HA scenarios require ≥2
+	// catalogd replicas. Fall back to whatever the node-count check set when catalogd
+	// was not found or its replica count is unset.
+	if catalogdDep != nil {
+		if replicas := catalogdDep.Spec.Replicas; replicas != nil {
+			featureGates[catalogdHAFeature] = *replicas >= 2
 		}
 	}
 
diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go
index a91bfbacb..0a9bf2115 100644
--- a/test/e2e/steps/steps.go
+++ b/test/e2e/steps/steps.go
@@ -50,13 +50,15 @@ import (
 )
 
 const (
-	olmDeploymentName = "operator-controller-controller-manager"
-	timeout           = 5 * time.Minute
-	tick              = 1 * time.Second
+	olmDeploymentName      = "operator-controller-controller-manager"
+	catalogdDeploymentName = "catalogd-controller-manager"
+	timeout                = 5 * time.Minute
+	tick                   = 1 * time.Second
 )
 
 var (
 	olmNamespace        = "olmv1-system"
+	componentNamespaces = map[string]string{} // keyed by component label (e.g. "catalogd"); falls back to olmNamespace
"catalogd"); falls back to olmNamespace kubeconfigPath string k8sCli string deployImageRegistry = sync.OnceValue(func() error { @@ -214,6 +216,15 @@ func init() { } } +// namespaceForComponent returns the namespace for the named OLM component. +// Falls back to olmNamespace when the component has no dedicated entry (e.g. upstream single-namespace deployments). +func namespaceForComponent(component string) string { + if ns, ok := componentNamespaces[component]; ok { + return ns + } + return olmNamespace +} + func k8sClient(args ...string) (string, error) { cmd := exec.Command(k8sCli, args...) logger.V(1).Info("Running", "command", strings.Join(cmd.Args, " ")) diff --git a/test/e2e/steps/upgrade_steps.go b/test/e2e/steps/upgrade_steps.go index 0c82437a2..47aab69d2 100644 --- a/test/e2e/steps/upgrade_steps.go +++ b/test/e2e/steps/upgrade_steps.go @@ -29,7 +29,7 @@ var ( if err != nil { return err } - olm, err := detectOLMDeployment() + olm, _, err := detectOLMDeployments() if err != nil { return err } @@ -82,19 +82,20 @@ func OLMIsUpgraded(ctx context.Context) error { // then checks the leader election lease and stores the leader pod name in the scenario context. func ComponentIsReadyToReconcile(ctx context.Context, component string) error { sc := scenarioCtx(ctx) + ns := namespaceForComponent(component) // Wait for deployment rollout to complete - depName, err := k8sClient("get", "deployments", "-n", olmNamespace, + depName, err := k8sClient("get", "deployments", "-n", ns, "-l", fmt.Sprintf("app.kubernetes.io/name=%s", component), "-o", "jsonpath={.items[0].metadata.name}") if err != nil { - return fmt.Errorf("failed to find deployment for component %s: %w", component, err) + return fmt.Errorf("failed to find deployment for component %s in namespace %s: %w", component, ns, err) } if depName == "" { - return fmt.Errorf("failed to find deployment for component %s: no matching deployments found", component) + return fmt.Errorf("failed to find deployment for component %s in namespace %s: no matching deployments found", component, ns) } if _, err := k8sClient("rollout", "status", fmt.Sprintf("deployment/%s", depName), - "-n", olmNamespace, fmt.Sprintf("--timeout=%s", timeout)); err != nil { + "-n", ns, fmt.Sprintf("--timeout=%s", timeout)); err != nil { return fmt.Errorf("deployment rollout failed for %s: %w", component, err) } @@ -106,7 +107,7 @@ func ComponentIsReadyToReconcile(ctx context.Context, component string) error { // Leader election can take up to LeaseDuration (137s) + RetryPeriod (26s) ≈ 163s in the worst case waitFor(ctx, func() bool { - output, err := k8sClient("get", "lease", leaseName, "-n", olmNamespace, + output, err := k8sClient("get", "lease", leaseName, "-n", ns, "-o", "jsonpath={.spec.holderIdentity}") if err != nil || output == "" { return false @@ -129,9 +130,9 @@ var resourceTypeToComponent = map[string]string{ // reconcileEndingCheck returns a function that checks whether the leader pod's logs // contain a "reconcile ending" entry for the given resource name. 
-func reconcileEndingCheck(leaderPod, resourceName string) func() bool {
+func reconcileEndingCheck(namespace, leaderPod, resourceName string) func() bool {
 	return func() bool {
-		logs, err := k8sClient("logs", leaderPod, "-n", olmNamespace, "--all-containers=true", "--tail=1000")
+		logs, err := k8sClient("logs", leaderPod, "-n", namespace, "--all-containers=true", "--tail=1000")
 		if err != nil {
 			return false
 		}
@@ -160,13 +161,13 @@ func ClusterExtensionIsReconciled(ctx context.Context) error {
 	}
 	leaderPod := sc.leaderPods[component]
 
-	waitFor(ctx, reconcileEndingCheck(leaderPod, resourceName))
+	waitFor(ctx, reconcileEndingCheck(namespaceForComponent(component), leaderPod, resourceName))
 	return nil
 }
 
 // clusterCatalogUnpackedAfterPodCreation returns a check function that verifies the
 // ClusterCatalog is serving and its lastUnpacked timestamp is after the leader pod's creation.
-func clusterCatalogUnpackedAfterPodCreation(resourceName, leaderPod string) func() bool {
+func clusterCatalogUnpackedAfterPodCreation(namespace, resourceName, leaderPod string) func() bool {
 	return func() bool {
 		catalogJSON, err := k8sClient("get", "clustercatalog", resourceName, "-o", "json")
 		if err != nil {
@@ -190,7 +191,7 @@ func clusterCatalogUnpackedAfterPodCreation(resourceName, leaderPod string) func
 		return false
 	}
 
-	podJSON, err := k8sClient("get", "pod", leaderPod, "-n", olmNamespace, "-o", "json")
+	podJSON, err := k8sClient("get", "pod", leaderPod, "-n", namespace, "-o", "json")
 	if err != nil {
 		return false
 	}
@@ -229,8 +230,9 @@ func allResourcesAreReconciled(ctx context.Context, resourceType string) error {
 		return fmt.Errorf("no %s resources found", resourceType)
 	}
 
+	ns := namespaceForComponent(component)
 	for _, name := range resourceNames {
-		waitFor(ctx, reconcileEndingCheck(leaderPod, name))
+		waitFor(ctx, reconcileEndingCheck(ns, leaderPod, name))
 	}
 
 	return nil
@@ -254,11 +256,12 @@ func ScenarioCatalogIsReconciled(ctx context.Context, catalogUserName string) er
 		return fmt.Errorf("leader pod not found for component %s", component)
 	}
 
-	waitFor(ctx, reconcileEndingCheck(leaderPod, catalogName))
+	ns := namespaceForComponent(component)
+	waitFor(ctx, reconcileEndingCheck(ns, leaderPod, catalogName))
 
 	// Also verify that lastUnpacked is after the leader pod's creation.
 	// This mitigates flakiness caused by https://github.com/operator-framework/operator-controller/issues/1626
-	waitFor(ctx, clusterCatalogUnpackedAfterPodCreation(catalogName, leaderPod))
+	waitFor(ctx, clusterCatalogUnpackedAfterPodCreation(ns, catalogName, leaderPod))
 
 	return nil
 }