diff --git a/helm/sim/.claude/skills/sim-helm/references/values-model.md b/helm/sim/.claude/skills/sim-helm/references/values-model.md index 8e75492909..801dfd77b7 100644 --- a/helm/sim/.claude/skills/sim-helm/references/values-model.md +++ b/helm/sim/.claude/skills/sim-helm/references/values-model.md @@ -44,11 +44,9 @@ The Sim chart splits configuration across **four** layers. Understanding which l ## Why this layering exists -**Single source of truth per concern.** Secrets live in a Secret. Operational defaults live where users can override them. Chart-computed values live where the chart can authoritatively compute them. +**ESO compatibility.** When `externalSecrets.enabled=true`, the chart-managed Secret is **not rendered** — ESO renders one instead. Anything in Layer 1 must be mapped via `remoteRefs.app.<KEY>` or it's silently missing. Layers 2–4 are unaffected by ESO. -**ESO compatibility.** When `externalSecrets.enabled=true`, the chart-managed Secret is **not rendered** — ESO renders one instead. Anything in Layer 1 must be mapped via `remoteRefs.app.<KEY>` or it's silently missing. Layers 2–4 are unaffected by ESO. Putting operational tunables in `envDefaults` instead of `env` means ESO users don't have to map dozens of tunables — just the real secrets. - -**Backwards compatibility.** Layer 2 was added in chart 1.0.0 (formerly all defaults lived in `app.env`). The override-skip logic in the Deployment template means existing users who set values in `app.env` continue to work — those values win over `envDefaults`. +**Override precedence.** Values set in `app.env` (Layer 1 overrides) win over `envDefaults` (Layer 2) — so users who already had operational tunables in `app.env` continue to work. 
## Where keys live — the canonical list diff --git a/helm/sim/README.md b/helm/sim/README.md index 80fc5b81f1..6bb45abc8d 100644 --- a/helm/sim/README.md +++ b/helm/sim/README.md @@ -217,7 +217,7 @@ Before installing in production, confirm each of the following: * **Pinned images** — override `image.tag` (or `image.digest`) with an explicit version. Do not rely on the chart's default tag in production. * **Secrets management** — provide secrets via External Secrets Operator (ESO) or pre-created Kubernetes Secrets. Never commit secrets to `values.yaml`. * **TLS / Ingress** — set the `cert-manager.io/cluster-issuer` annotation on the ingress and tune `proxy-body-size` / `proxy-read-timeout` for your workload. See commented examples in `values.yaml`. -* **Network policy egress** — review `networkPolicy.egress.exceptCidrs`. Defaults block cloud metadata endpoints (`169.254.169.254/32`, `169.254.170.2/32`); add your cluster's API server CIDR for stronger isolation. +* **Network policy egress** — review `networkPolicy.egressExceptCidrs`. Defaults block cloud metadata endpoints (`169.254.169.254/32`, `169.254.170.2/32`); add your cluster's API server CIDR for stronger isolation. Custom egress rules go in `networkPolicy.egress` (a list). * **Namespace hardening** — label the install namespace with Pod Security Standards `restricted` enforcement (`pod-security.kubernetes.io/enforce=restricted`). * **Env validation** — keys under `app.env`, `realtime.env`, and `copilot.env` are passed through to the application and validated at startup. The JSON Schema intentionally does not enforce `additionalProperties: false` (would break custom user envs), so typos like `OPENA_API_KEY` (instead of `OPENAI_API_KEY`) surface as missing-key errors at runtime, not at `helm install` time. Review your env block carefully. * **Set public URLs** — `app.env.NEXT_PUBLIC_APP_URL` and `app.env.BETTER_AUTH_URL` must match your public origin (e.g. `https://sim.example.com`). 
Leaving them as `localhost` breaks sign-in. diff --git a/helm/sim/templates/NOTES.txt b/helm/sim/templates/NOTES.txt index e563117500..27be4b0b2a 100644 --- a/helm/sim/templates/NOTES.txt +++ b/helm/sim/templates/NOTES.txt @@ -81,7 +81,16 @@ Your release is named {{ .Release.Name }} in namespace {{ .Release.Namespace }}. # Upgrade after changing values helm upgrade {{ .Release.Name }} ./helm/sim --namespace {{ .Release.Namespace }} -f your-values.yaml -5. Where to go next: +5. Upgrade notes (read before upgrading from a chart version released before this one): + + * externalSecrets.apiVersion default is "v1beta1" (was "v1"). v1beta1 is + supported by every ESO release from v0.7+ through current. If you're on + ESO v0.17+ and want the graduated v1 API, set externalSecrets.apiVersion: "v1". + * networkPolicy.egress remains a list of custom egress rules (unchanged). + Cloud-metadata CIDR blocking is now configured via networkPolicy.egressExceptCidrs + (defaults to AWS/GCP/Azure IMDS + ECS task metadata). + +6. Where to go next: * Production checklist: helm/sim/README.md (search "Production checklist") * Troubleshooting: helm/sim/README.md (search "Troubleshooting") diff --git a/helm/sim/templates/networkpolicy.yaml b/helm/sim/templates/networkpolicy.yaml index 4a19ae8937..a6db889d74 100644 --- a/helm/sim/templates/networkpolicy.yaml +++ b/helm/sim/templates/networkpolicy.yaml @@ -107,14 +107,14 @@ spec: - ipBlock: cidr: 0.0.0.0/0 except: - {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egress.exceptCidrs) }} + {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egressExceptCidrs) }} - {{ . | quote }} {{- end }} ports: - protocol: TCP port: 443 # Allow custom egress rules - {{- with .Values.networkPolicy.egress.extraRules }} + {{- with .Values.networkPolicy.egress }} {{- toYaml . 
| nindent 2 }} {{- end }} @@ -189,14 +189,14 @@ spec: - ipBlock: cidr: 0.0.0.0/0 except: - {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egress.exceptCidrs) }} + {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egressExceptCidrs) }} - {{ . | quote }} {{- end }} ports: - protocol: TCP port: 443 # Allow custom egress rules - {{- with .Values.networkPolicy.egress.extraRules }} + {{- with .Values.networkPolicy.egress }} {{- toYaml . | nindent 2 }} {{- end }} {{- end }} @@ -296,11 +296,96 @@ spec: - ipBlock: cidr: 0.0.0.0/0 except: - {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egress.exceptCidrs) }} + {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egressExceptCidrs) }} - {{ . | quote }} {{- end }} ports: - protocol: TCP port: 443 {{- end }} + +{{- if .Values.telemetry.enabled }} +--- +# Network Policy for OpenTelemetry Collector +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "sim.fullname" . }}-otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: telemetry +spec: + podSelector: + matchLabels: + {{- include "sim.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: telemetry + policyTypes: + - Ingress + - Egress + ingress: + # OTLP from app + - from: + - podSelector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: 4317 + - protocol: TCP + port: 4318 + # OTLP from realtime + {{- if .Values.realtime.enabled }} + - from: + - podSelector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . 
| nindent 10 }} + ports: + - protocol: TCP + port: 4317 + - protocol: TCP + port: 4318 + {{- end }} + # OTLP from copilot + {{- if .Values.copilot.enabled }} + - from: + - podSelector: + matchLabels: + {{- include "sim.selectorLabels" . | nindent 10 }} + app.kubernetes.io/component: copilot + ports: + - protocol: TCP + port: 4317 + - protocol: TCP + port: 4318 + {{- end }} + egress: + # DNS + - to: [] + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + # HTTPS for forwarding to external observability backends (Datadog, Honeycomb, etc.) + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + {{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egressExceptCidrs) }} + - {{ . | quote }} + {{- end }} + ports: + - protocol: TCP + port: 443 +{{- end }} + +{{- /* + Copilot + copilot-postgresql intentionally do NOT ship dedicated NetworkPolicies. + Copilot requires REDIS_URL (external Redis on a non-443 port), and the chart + cannot know the user's Redis host/port at render time — a default egress rule + would silently block Redis on most installs. Users running networkPolicy.enabled=true + with copilot enabled should add their own NPs (or extend networkPolicy.egress + with the appropriate egress rules). +*/}} {{- end }} \ No newline at end of file diff --git a/helm/sim/templates/statefulset-copilot-postgres.yaml b/helm/sim/templates/statefulset-copilot-postgres.yaml index 68275fb3e9..91dd5bad19 100644 --- a/helm/sim/templates/statefulset-copilot-postgres.yaml +++ b/helm/sim/templates/statefulset-copilot-postgres.yaml @@ -66,7 +66,11 @@ metadata: {{- include "sim.labels" . | nindent 4 }} app.kubernetes.io/component: copilot-postgresql spec: - serviceName: {{ include "sim.fullname" . }}-copilot-postgresql-headless + # Must remain {{ include "sim.fullname" . }}-copilot-postgresql (not the + # -headless name) — spec.serviceName is immutable on a StatefulSet, and + # the prior chart shipped with this value. 
Same rationale as the main + # postgresql STS; see statefulset-postgresql.yaml for details. + serviceName: {{ include "sim.fullname" . }}-copilot-postgresql replicas: 1 podManagementPolicy: OrderedReady updateStrategy: @@ -111,6 +115,10 @@ spec: envFrom: - secretRef: name: {{ include "sim.fullname" . }}-copilot-postgresql-secret + {{- if .Values.copilot.postgresql.startupProbe }} + startupProbe: + {{- toYaml .Values.copilot.postgresql.startupProbe | nindent 12 }} + {{- end }} {{- if .Values.copilot.postgresql.livenessProbe }} livenessProbe: {{- toYaml .Values.copilot.postgresql.livenessProbe | nindent 12 }} diff --git a/helm/sim/templates/statefulset-postgresql.yaml b/helm/sim/templates/statefulset-postgresql.yaml index e2a9bf402a..50c78b8a03 100644 --- a/helm/sim/templates/statefulset-postgresql.yaml +++ b/helm/sim/templates/statefulset-postgresql.yaml @@ -90,7 +90,12 @@ metadata: labels: {{- include "sim.postgresql.labels" . | nindent 4 }} spec: - serviceName: {{ include "sim.fullname" . }}-postgresql-headless + # Must remain {{ include "sim.fullname" . }}-postgresql (not the -headless + # name) — spec.serviceName is immutable on a StatefulSet, and the prior + # chart shipped with this value. Changing it would break `helm upgrade` for + # every existing install with `Forbidden: updates to statefulset spec ...`. + # The headless Service in services.yaml is added alongside, not as a swap. + serviceName: {{ include "sim.fullname" . }}-postgresql replicas: 1 minReadySeconds: 10 podManagementPolicy: OrderedReady @@ -135,6 +140,10 @@ spec: name: {{ include "sim.fullname" . }}-postgresql-env - secretRef: name: {{ include "sim.postgresqlSecretName" . 
}} + {{- if .Values.postgresql.startupProbe }} + startupProbe: + {{- toYaml .Values.postgresql.startupProbe | nindent 12 }} + {{- end }} {{- if .Values.postgresql.livenessProbe }} livenessProbe: {{- toYaml .Values.postgresql.livenessProbe | nindent 12 }} diff --git a/helm/sim/tests/networkpolicy_test.yaml b/helm/sim/tests/networkpolicy_test.yaml index da52d46f06..d6e0b260ec 100644 --- a/helm/sim/tests/networkpolicy_test.yaml +++ b/helm/sim/tests/networkpolicy_test.yaml @@ -128,10 +128,23 @@ tests: - protocol: TCP port: 3000 - - it: egress.extraRules are appended to both app and realtime NetworkPolicies + - it: telemetry collector NetworkPolicy renders when telemetry.enabled=true set: <<: *defaults - networkPolicy.egress.extraRules: + telemetry.enabled: true + documentIndex: 3 + asserts: + - equal: + path: kind + value: NetworkPolicy + - equal: + path: metadata.name + value: t-sim-otel-collector + + - it: networkPolicy.egress (custom rules) are appended to the app NetworkPolicy + set: + <<: *defaults + networkPolicy.egress: - to: [] ports: - protocol: TCP @@ -145,3 +158,21 @@ tests: ports: - protocol: TCP port: 5432 + + - it: networkPolicy.egress (custom rules) are appended to the realtime NetworkPolicy + set: + <<: *defaults + networkPolicy.egress: + - to: [] + ports: + - protocol: TCP + port: 5432 + documentIndex: 1 + asserts: + - contains: + path: spec.egress + content: + to: [] + ports: + - protocol: TCP + port: 5432 diff --git a/helm/sim/values.yaml b/helm/sim/values.yaml index a913c4b836..e253f1a0af 100644 --- a/helm/sim/values.yaml +++ b/helm/sim/values.yaml @@ -622,12 +622,22 @@ postgresql: targetPort: 5432 # Health checks + # startupProbe shields liveness from slow first-boot scenarios (pgvector + # extension init, WAL replay after a crash on a large data dir). Gives + # postgres up to 150s (30 * 5s) to become ready before liveness takes over. 
+ startupProbe: + exec: + command: ["pg_isready", "-U", "postgres", "-d", "sim"] + periodSeconds: 5 + failureThreshold: 30 + timeoutSeconds: 5 + livenessProbe: exec: command: ["pg_isready", "-U", "postgres", "-d", "sim"] initialDelaySeconds: 10 periodSeconds: 5 - + readinessProbe: exec: command: ["pg_isready", "-U", "postgres", "-d", "sim"] @@ -954,7 +964,7 @@ monitoring: # to each other and to required external services (DNS, HTTPS) while blocking # everything else. The egress block additionally blacklists cloud metadata # endpoints (169.254.169.254/32, 169.254.170.2/32) by default — extend -# egress.exceptCidrs with your cluster's API server CIDR for tighter isolation. +# egressExceptCidrs with your cluster's API server CIDR for tighter isolation. # Your CNI must support NetworkPolicy (Calico, Cilium, GKE Dataplane V2, etc.). networkPolicy: enabled: false @@ -973,16 +983,18 @@ networkPolicy: # Custom ingress rules appended to the policy ingress: [] - # Egress configuration - egress: - # CIDRs excluded from broad HTTPS (443) egress. - # Defaults block AWS/GCP/Azure IMDS (169.254.169.254/32) and ECS task metadata - # (169.254.170.2/32). Add your cluster's API server CIDR for stronger isolation. - exceptCidrs: - - "169.254.169.254/32" - - "169.254.170.2/32" - # Custom egress rules appended to the policy - extraRules: [] + # Custom egress rules appended to the policy. + # Kept as a top-level list (not a map) for backward compatibility with the + # pre-1.0 chart that shipped `networkPolicy.egress: []`. Existing values + # files continue to work without changes. + egress: [] + + # CIDRs excluded from broad HTTPS (443) egress. + # Defaults block AWS/GCP/Azure IMDS (169.254.169.254/32) and ECS task metadata + # (169.254.170.2/32). Add your cluster's API server CIDR for stronger isolation. 
+ egressExceptCidrs: + - "169.254.169.254/32" + - "169.254.170.2/32" # Shared storage for enterprise workflows requiring data sharing between pods sharedStorage: @@ -1438,6 +1450,16 @@ copilot: targetPort: 5432 # Health checks + # startupProbe shields liveness from slow first-boot scenarios (pgvector + # extension init, WAL replay after a crash). Gives postgres up to 150s + # (30 * 5s) to become ready before liveness takes over. + startupProbe: + exec: + command: ["pg_isready", "-U", "copilot", "-d", "copilot"] + periodSeconds: 5 + failureThreshold: 30 + timeoutSeconds: 5 + livenessProbe: exec: command: ["pg_isready", "-U", "copilot", "-d", "copilot"] @@ -1445,7 +1467,7 @@ copilot: periodSeconds: 5 timeoutSeconds: 5 failureThreshold: 10 - + readinessProbe: exec: command: ["pg_isready", "-U", "copilot", "-d", "copilot"]