From 940c4c6f99e977a004e6441832b5ebfecf0ff076 Mon Sep 17 00:00:00 2001
From: Andrew Kenworthy
Date: Thu, 7 May 2026 17:21:45 +0200
Subject: [PATCH] fix: Add orderly shutdown steps to all kuttl tests to
 prevent namespace deletion timeouts

During namespace deletion, ZooKeeper and Kafka are terminated
simultaneously. Kafka's controlled-shutdown retries ZK connections
indefinitely, keeping the process alive for the full grace period and
blocking namespace deletion past kuttl's 300s timeout.

For ZK-mode tests: scale brokers to 0 via the CRD so the operator
performs an orderly shutdown before ZK is removed.

For KRaft-mode tests: scale brokers to 0, delete the KafkaCluster CR
to stop reconciliation, then force-delete controller pods. Controllers
cannot be scaled via the CRD due to a "no Kraft controllers found to
build ConfigMap" error.

All tests also set gracefulShutdownTimeout: 60s to bound the worst-case
wait.

Validated with a full nightly suite run (26/26 PASS).

Co-Authored-By: Claude Opus 4.6
---
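Reviewer note (below the fold, not part of the commit message): the
sketch below replays the KRaft-mode teardown by hand, for anyone who
wants to reproduce the sequence outside kuttl. It is only an
illustration and assumes the same names the tests use (a KafkaCluster
named "test-kafka", the test namespace in $NAMESPACE); the commands
mirror the 90-shutdown-kafka.yaml steps added in this patch.

    # 1. Scale brokers to 0 through the CRD so the operator drains them.
    kubectl patch kafkacluster test-kafka -n "$NAMESPACE" --type merge \
      -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'

    # 2. Wait for the broker pods to go away; force-delete only as a fallback.
    kubectl wait --for=delete pod \
      -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker \
      -n "$NAMESPACE" --timeout=120s || \
    kubectl delete pods \
      -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker \
      -n "$NAMESPACE" --grace-period=0 --force

    # 3. Delete the CR so the operator stops re-creating controller pods,
    #    then force-delete the controllers (they cannot be scaled via the CRD).
    kubectl delete kafkacluster test-kafka -n "$NAMESPACE" --wait=false
    kubectl delete pods \
      -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller \
      -n "$NAMESPACE" --grace-period=0 --force
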
 .../20-install-kafka.yaml.j2                  |  1 +
 .../cluster-operation/90-shutdown-kafka.yaml  | 18 +++++++++++++
 .../configuration/10-install-kafka.yaml.j2    |  2 ++
 .../configuration/90-shutdown-kafka.yaml      | 25 +++++++++++++++++++
 .../delete-rolegroup/90-shutdown-kafka.yaml   | 18 +++++++++++++
 .../kuttl/kerberos/90-shutdown-kafka.yaml     | 18 +++++++++++++
 .../kuttl/logging/04-install-kafka.yaml.j2    |  2 ++
 .../kuttl/logging/90-shutdown-kafka.yaml      | 18 +++++++++++++
 .../kuttl/opa/30-install-kafka.yaml.j2        |  1 +
 .../kuttl/opa/90-shutdown-kafka.yaml          | 18 +++++++++++++
 .../operations-kraft/20-install-kafka.yaml.j2 |  2 ++
 .../operations-kraft/90-shutdown-kafka.yaml   | 25 +++++++++++++++++++
 .../smoke-kraft/30-install-kafka.yaml.j2      |  2 ++
 .../kuttl/smoke-kraft/90-shutdown-kafka.yaml  | 25 +++++++++++++++++++
 .../kuttl/smoke/30-install-kafka.yaml.j2      |  2 ++
 .../kuttl/smoke/90-shutdown-kafka.yaml        | 18 +++++++++++++
 .../kuttl/tls/40-install-kafka.yaml.j2        |  1 +
 .../kuttl/tls/90-shutdown-kafka.yaml          | 18 +++++++++++++
 .../kuttl/upgrade/90-shutdown-kafka.yaml      | 25 +++++++++++++++++++
 19 files changed, 239 insertions(+)
 create mode 100644 tests/templates/kuttl/cluster-operation/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/configuration/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/delete-rolegroup/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/kerberos/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/logging/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/opa/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/operations-kraft/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/smoke-kraft/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/smoke/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/tls/90-shutdown-kafka.yaml
 create mode 100644 tests/templates/kuttl/upgrade/90-shutdown-kafka.yaml

diff --git a/tests/templates/kuttl/cluster-operation/20-install-kafka.yaml.j2 b/tests/templates/kuttl/cluster-operation/20-install-kafka.yaml.j2
index 6d391b65..2de65be8 100644
--- a/tests/templates/kuttl/cluster-operation/20-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/cluster-operation/20-install-kafka.yaml.j2
@@ -23,6 +23,7 @@ spec:
   zookeeperConfigMapName: test-zk
   brokers:
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
     roleGroups:
diff --git a/tests/templates/kuttl/cluster-operation/90-shutdown-kafka.yaml b/tests/templates/kuttl/cluster-operation/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..279f8923
--- /dev/null
+++ b/tests/templates/kuttl/cluster-operation/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/configuration/10-install-kafka.yaml.j2 b/tests/templates/kuttl/configuration/10-install-kafka.yaml.j2
index af9f97e8..17ecec86 100644
--- a/tests/templates/kuttl/configuration/10-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/configuration/10-install-kafka.yaml.j2
@@ -23,6 +23,7 @@ spec:
 {% endif %}
   controllers:
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
       resources:
@@ -57,6 +58,7 @@ spec:
         replicas: 1
   brokers:
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
       resources:
diff --git a/tests/templates/kuttl/configuration/90-shutdown-kafka.yaml b/tests/templates/kuttl/configuration/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..79c4bec2
--- /dev/null
+++ b/tests/templates/kuttl/configuration/90-shutdown-kafka.yaml
@@ -0,0 +1,25 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Brokers are scaled via the CRD so the operator performs an orderly shutdown.
+# Once brokers are gone, we delete the KafkaCluster CR to stop the operator
+# reconciling, then force-delete any remaining controller pods. We cannot scale
+# controllers via the CRD because the operator errors with "no Kraft controllers
+# found to build ConfigMap", and scaling the StatefulSet directly is immediately
+# reversed by the operator's reconciliation loop.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=300s
+  - script: |
+      kubectl delete kafkacluster test-kafka -n $NAMESPACE --wait=false 2>/dev/null || true
+  - script: |
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --timeout=120s 2>/dev/null || true
diff --git a/tests/templates/kuttl/delete-rolegroup/90-shutdown-kafka.yaml b/tests/templates/kuttl/delete-rolegroup/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..97fa25a5
--- /dev/null
+++ b/tests/templates/kuttl/delete-rolegroup/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0},"secondary":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/kerberos/90-shutdown-kafka.yaml b/tests/templates/kuttl/kerberos/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..279f8923
--- /dev/null
+++ b/tests/templates/kuttl/kerberos/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/logging/04-install-kafka.yaml.j2 b/tests/templates/kuttl/logging/04-install-kafka.yaml.j2
index 42588924..94f89b0c 100644
--- a/tests/templates/kuttl/logging/04-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/logging/04-install-kafka.yaml.j2
@@ -52,6 +52,8 @@ spec:
   vectorAggregatorConfigMapName: kafka-vector-aggregator-discovery
   zookeeperConfigMapName: test-kafka-znode
   brokers:
+    config:
+      gracefulShutdownTimeout: 60s
     roleGroups:
       automatic-log-config:
         replicas: 1
diff --git a/tests/templates/kuttl/logging/90-shutdown-kafka.yaml b/tests/templates/kuttl/logging/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..92a6c477
--- /dev/null
+++ b/tests/templates/kuttl/logging/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"automatic-log-config":{"replicas":0},"custom-log-config":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/opa/30-install-kafka.yaml.j2 b/tests/templates/kuttl/opa/30-install-kafka.yaml.j2
index ad31671d..b1614ee3 100644
--- a/tests/templates/kuttl/opa/30-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/opa/30-install-kafka.yaml.j2
@@ -42,6 +42,7 @@ commands:
           config:
             logging:
               enableVectorAgent: true
+            gracefulShutdownTimeout: 60s
           roleGroups:
             default:
               replicas: 3
diff --git a/tests/templates/kuttl/opa/90-shutdown-kafka.yaml b/tests/templates/kuttl/opa/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..279f8923
--- /dev/null
+++ b/tests/templates/kuttl/opa/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/operations-kraft/20-install-kafka.yaml.j2 b/tests/templates/kuttl/operations-kraft/20-install-kafka.yaml.j2
index 704cacaa..1959cd10 100644
--- a/tests/templates/kuttl/operations-kraft/20-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/operations-kraft/20-install-kafka.yaml.j2
@@ -23,6 +23,7 @@ spec:
 {% endif %}
   controllers:
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
     roleGroups:
@@ -30,6 +31,7 @@ spec:
         replicas: 3
   brokers:
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
     roleGroups:
diff --git a/tests/templates/kuttl/operations-kraft/90-shutdown-kafka.yaml b/tests/templates/kuttl/operations-kraft/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..79c4bec2
--- /dev/null
+++ b/tests/templates/kuttl/operations-kraft/90-shutdown-kafka.yaml
@@ -0,0 +1,25 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Brokers are scaled via the CRD so the operator performs an orderly shutdown.
+# Once brokers are gone, we delete the KafkaCluster CR to stop the operator
+# reconciling, then force-delete any remaining controller pods. We cannot scale
+# controllers via the CRD because the operator errors with "no Kraft controllers
+# found to build ConfigMap", and scaling the StatefulSet directly is immediately
+# reversed by the operator's reconciliation loop.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=300s
+  - script: |
+      kubectl delete kafkacluster test-kafka -n $NAMESPACE --wait=false 2>/dev/null || true
+  - script: |
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --timeout=120s 2>/dev/null || true
diff --git a/tests/templates/kuttl/smoke-kraft/30-install-kafka.yaml.j2 b/tests/templates/kuttl/smoke-kraft/30-install-kafka.yaml.j2
index 95d85da6..a996b062 100644
--- a/tests/templates/kuttl/smoke-kraft/30-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/smoke-kraft/30-install-kafka.yaml.j2
@@ -80,6 +80,7 @@ spec:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: true
       requestedSecretLifetime: 7d
@@ -135,6 +136,7 @@ spec:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: true
       requestedSecretLifetime: 7d
diff --git a/tests/templates/kuttl/smoke-kraft/90-shutdown-kafka.yaml b/tests/templates/kuttl/smoke-kraft/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..fe0a4a4d
--- /dev/null
+++ b/tests/templates/kuttl/smoke-kraft/90-shutdown-kafka.yaml
@@ -0,0 +1,25 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Brokers are scaled via the CRD so the operator performs an orderly shutdown.
+# Once brokers are gone, we delete the KafkaCluster CR to stop the operator
+# reconciling, then force-delete any remaining controller pods. We cannot scale
+# controllers via the CRD because the operator errors with "no Kraft controllers
+# found to build ConfigMap", and scaling the StatefulSet directly is immediately
+# reversed by the operator's reconciliation loop.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0},"automatic-log-config":{"replicas":0},"custom-log-config":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=300s
+  - script: |
+      kubectl delete kafkacluster test-kafka -n $NAMESPACE --wait=false 2>/dev/null || true
+  - script: |
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --timeout=120s 2>/dev/null || true
diff --git a/tests/templates/kuttl/smoke/30-install-kafka.yaml.j2 b/tests/templates/kuttl/smoke/30-install-kafka.yaml.j2
index 4f3b95a0..8055a426 100644
--- a/tests/templates/kuttl/smoke/30-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-install-kafka.yaml.j2
@@ -28,6 +28,8 @@ spec:
 {% endif %}
   zookeeperConfigMapName: test-zk
   brokers:
+    config:
+      gracefulShutdownTimeout: 60s
     configOverrides:
       broker.properties:
         compression.type: uncompressed # overridden by role group below
diff --git a/tests/templates/kuttl/smoke/90-shutdown-kafka.yaml b/tests/templates/kuttl/smoke/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..279f8923
--- /dev/null
+++ b/tests/templates/kuttl/smoke/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/tls/40-install-kafka.yaml.j2 b/tests/templates/kuttl/tls/40-install-kafka.yaml.j2
index ba0278a4..f5449fb0 100644
--- a/tests/templates/kuttl/tls/40-install-kafka.yaml.j2
+++ b/tests/templates/kuttl/tls/40-install-kafka.yaml.j2
@@ -61,6 +61,7 @@ spec:
   zookeeperConfigMapName: test-kafka-znode
   brokers:
     config:
+      gracefulShutdownTimeout: 60s
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
     roleGroups:
diff --git a/tests/templates/kuttl/tls/90-shutdown-kafka.yaml b/tests/templates/kuttl/tls/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..279f8923
--- /dev/null
+++ b/tests/templates/kuttl/tls/90-shutdown-kafka.yaml
@@ -0,0 +1,18 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Without this, ZooKeeper and Kafka are terminated simultaneously during
+# namespace deletion. Kafka's controlled-shutdown retries ZK connections
+# indefinitely, keeping the process alive for the full grace period
+# and blocking namespace deletion well past kuttl's 300s timeout.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka -n $NAMESPACE --timeout=300s
diff --git a/tests/templates/kuttl/upgrade/90-shutdown-kafka.yaml b/tests/templates/kuttl/upgrade/90-shutdown-kafka.yaml
new file mode 100644
index 00000000..79c4bec2
--- /dev/null
+++ b/tests/templates/kuttl/upgrade/90-shutdown-kafka.yaml
@@ -0,0 +1,25 @@
+---
+# Scale Kafka down before kuttl deletes the namespace.
+# Brokers are scaled via the CRD so the operator performs an orderly shutdown.
+# Once brokers are gone, we delete the KafkaCluster CR to stop the operator
+# reconciling, then force-delete any remaining controller pods. We cannot scale
+# controllers via the CRD because the operator errors with "no Kraft controllers
+# found to build ConfigMap", and scaling the StatefulSet directly is immediately
+# reversed by the operator's reconciliation loop.
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 600
+commands:
+  - script: |
+      kubectl patch kafkacluster test-kafka -n $NAMESPACE --type merge -p '{"spec":{"brokers":{"roleGroups":{"default":{"replicas":0}}}}}'
+  - script: |
+      if kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=120s 2>/dev/null; then
+        exit 0
+      fi
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=broker -n $NAMESPACE --timeout=300s
+  - script: |
+      kubectl delete kafkacluster test-kafka -n $NAMESPACE --wait=false 2>/dev/null || true
+  - script: |
+      kubectl delete pods -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true
+      kubectl wait --for=delete pod -l app.kubernetes.io/instance=test-kafka,app.kubernetes.io/component=controller -n $NAMESPACE --timeout=120s 2>/dev/null || true
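
Verification sketch (after the diff, for reviewers): with a 90-shutdown
step in place, namespace teardown should complete well inside kuttl's
300s limit instead of hanging on terminating Kafka pods. This assumes a
disposable test namespace in $NAMESPACE; the commands are plain kubectl
and only illustrate how to check, they are not part of the tests.

    # Time the namespace teardown; before this patch it hung for the
    # full termination grace period, afterwards it should return quickly.
    time kubectl delete namespace "$NAMESPACE" --wait=true --timeout=300s

    # If it stalls, list whatever is still terminating:
    kubectl get pods -n "$NAMESPACE" --field-selector=status.phase!=Succeeded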