From 628fa2e6da12430d0c77820501b91da23b7fed48 Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Mon, 29 Jun 2026 22:24:22 -0700 Subject: [PATCH 01/12] feat: add pre-upgrade database migration job support Introduces an opt-in Kubernetes Job that runs Liquibase schema migrations before Gateway pods roll out during an upgrade. Gateway pods are blocked until the job succeeds, preventing pods from starting against an unmigrated schema. Key behaviors: - Works with Gateway 11.2.2 and later which provides four schema update modes in entrypoint.sh: default, skip, liquibase-only, liquibase-only-with-unlock - Migration job enabled via spec.app.management.database.migrationJob.enabled - clearLocks field releases stale Liquibase locks before migrating - Gateway pods automatically use skip mode when migration job is enabled - Job is auto-replaced when spec changes (image, jdbcUrl, clearLocks) - Failed jobs block the deployment and require manual deletion to retry - jdbcUrl override only applies in diskless mode; node.properties wins in non-diskless mode, consistent with Helm behavior --- Makefile | 2 +- api/v1/gateway_types.go | 20 ++ api/v1/zz_generated.deepcopy.go | 24 ++- .../bases/security.brcmlabs.com_gateways.yaml | 20 ++ internal/controller/gateway/controller.go | 5 +- pkg/gateway/configmap.go | 20 ++ pkg/gateway/migration_job.go | 171 ++++++++++++++++++ pkg/gateway/reconcile/migration_job.go | 126 +++++++++++++ 8 files changed, 385 insertions(+), 3 deletions(-) create mode 100644 pkg/gateway/migration_job.go create mode 100644 pkg/gateway/reconcile/migration_job.go diff --git a/Makefile b/Makefile index dd0246f4..7a325314 100644 --- a/Makefile +++ b/Makefile @@ -218,7 +218,7 @@ build: manifests generate fmt vet ## Build manager binary. .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/main.go --zap-log-level=5 + go run ./cmd/main.go --zap-log-level=5 --leader-elect=false .PHONY: docker-build docker-build: dockerfile #test ## Build docker image with the manager. diff --git a/api/v1/gateway_types.go b/api/v1/gateway_types.go index 348cb26b..ac19ea24 100644 --- a/api/v1/gateway_types.go +++ b/api/v1/gateway_types.go @@ -1,3 +1,4 @@ +// Copyright (c) 2026 Broadcom Inc. and its subsidiaries. All Rights Reserved. /* Copyright 2021. @@ -688,6 +689,25 @@ type Database struct { Password string `json:"password,omitempty"` // LiquibaseLogLevel LiquibaseLogLevel LiquibaseLogLevel `json:"liquibaseLogLevel,omitempty"` + // MigrationJob for pre-upgrade schema updates + MigrationJob MigrationJob `json:"migrationJob,omitempty"` +} + +// MigrationJob configures the pre-upgrade database migration job +type MigrationJob struct { + // Enabled or disabled + Enabled bool `json:"enabled,omitempty"` + // JDBCUrl overrides the main database.jdbcUrl for the migration job (e.g. to bypass a proxy). + // Only applies in diskless mode (disklessConfig.disabled: false, the default). + // In non-diskless mode the entrypoint reads the JDBC URL from the mounted node.properties + // file and this field has no effect — node.properties always wins, consistent with Helm behavior. + JDBCUrl string `json:"jdbcUrl,omitempty"` + // ClearLocks forces release of any stuck Liquibase locks before applying + // schema updates. Use with caution: forcefully releasing a lock while + // another process is actively updating the schema can corrupt the database. + ClearLocks bool `json:"clearLocks,omitempty"` + // ActiveDeadlineSeconds is the max duration the job is allowed to run + ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"` } // Restman is a Gateway Management interface that can be automatically provisioned. diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index c079c733..23844085 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -1,3 +1,4 @@ +// Copyright (c) 2026 Broadcom Inc. and its subsidiaries. All Rights Reserved. //go:build !ignore_autogenerated /* @@ -433,6 +434,7 @@ func (in *CustomListenPort) DeepCopy() *CustomListenPort { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Database) DeepCopyInto(out *Database) { *out = *in + in.MigrationJob.DeepCopyInto(&out.MigrationJob) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Database. @@ -987,7 +989,7 @@ func (in *Management) DeepCopyInto(out *Management) { *out = *in in.DisklessConfig.DeepCopyInto(&out.DisklessConfig) out.Cluster = in.Cluster - out.Database = in.Database + in.Database.DeepCopyInto(&out.Database) out.Restman = in.Restman in.Graphman.DeepCopyInto(&out.Graphman) in.Service.DeepCopyInto(&out.Service) @@ -1003,6 +1005,26 @@ func (in *Management) DeepCopy() *Management { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MigrationJob) DeepCopyInto(out *MigrationJob) { + *out = *in + if in.ActiveDeadlineSeconds != nil { + in, out := &in.ActiveDeadlineSeconds, &out.ActiveDeadlineSeconds + *out = new(int64) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationJob. +func (in *MigrationJob) DeepCopy() *MigrationJob { + if in == nil { + return nil + } + out := new(MigrationJob) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Notification) DeepCopyInto(out *Notification) { *out = *in diff --git a/config/crd/bases/security.brcmlabs.com_gateways.yaml b/config/crd/bases/security.brcmlabs.com_gateways.yaml index ae0fba5f..a3dad749 100644 --- a/config/crd/bases/security.brcmlabs.com_gateways.yaml +++ b/config/crd/bases/security.brcmlabs.com_gateways.yaml @@ -3534,6 +3534,26 @@ spec: liquibaseLogLevel: description: LiquibaseLogLevel type: string + migrationJob: + description: MigrationJob for pre-upgrade schema updates + properties: + activeDeadlineSeconds: + description: ActiveDeadlineSeconds is the max duration + the job is allowed to run + format: int64 + type: integer + clearLocks: + description: ClearLocks forces release of any stuck + Liquibase locks before applying... + type: boolean + enabled: + description: Enabled or disabled + type: boolean + jdbcUrl: + description: JDBCUrl for the migration job (useful + for bypassing proxies) + type: string + type: object password: description: Password MySQL - can be set in management.secretName type: string diff --git a/internal/controller/gateway/controller.go b/internal/controller/gateway/controller.go index 0f3a0053..a4748aac 100644 --- a/internal/controller/gateway/controller.go +++ b/internal/controller/gateway/controller.go @@ -46,6 +46,7 @@ import ( "go.opentelemetry.io/otel/metric" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" policyv1 "k8s.io/api/policy/v1" @@ -100,6 +101,7 @@ func (r *GatewayReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct {reconcile.PodDisruptionBudget, "podDisruptionBudget"}, {reconcile.GatewayStatus, "gatewayStatus"}, {reconcile.ConfigMaps, "configMaps"}, + {reconcile.GatewayMigrationJob, "migration job"}, {reconcile.Deployment, "deployment"}, {reconcile.ManagementPod, "management pod"}, {reconcile.HandleEphemeralRestarts, "ephemeral restart detection"}, @@ -163,7 +165,8 @@ func (r *GatewayReconciler) SetupWithManager(mgr ctrl.Manager) error { Owns(&networkingv1.Ingress{}). Owns(&appsv1.Deployment{}). Owns(&policyv1.PodDisruptionBudget{}). - Owns(&autoscalingv2.HorizontalPodAutoscaler{}) + Owns(&autoscalingv2.HorizontalPodAutoscaler{}). + Owns(&batchv1.Job{}) repo := &metav1.PartialObjectMetadata{} repo.SetGroupVersionKind(schema.GroupVersionKind{ diff --git a/pkg/gateway/configmap.go b/pkg/gateway/configmap.go index a29145f1..f5e46701 100644 --- a/pkg/gateway/configmap.go +++ b/pkg/gateway/configmap.go @@ -82,6 +82,26 @@ func NewConfigMap(gw *securityv1.Gateway, name string, opts ...ConfigMapOpts) *c opt = opts[0] } javaArgs := strings.Join(gw.Spec.App.Java.ExtraArgs, " ") + + // When the database migration job is enabled, automatically inject skip mode for + // Gateway pods so they bypass Liquibase entirely — the job handles schema updates. + // Only injected if the user has not explicitly set gateway.db.schema-update.mode. + if gw.Spec.App.Management.Database.MigrationJob.Enabled { + hasSchemaMode := false + for _, arg := range gw.Spec.App.Java.ExtraArgs { + if strings.Contains(arg, "gateway.db.schema-update.mode") { + hasSchemaMode = true + break + } + } + if !hasSchemaMode { + if javaArgs != "" { + javaArgs += " " + } + javaArgs += "-Dgateway.db.schema-update.mode=skip" + } + } + data := make(map[string]string) jvmHeap := setJVMHeapSize(gw, "", gw.Spec.App.Java.JVMHeap.Percentage) dataCheckSum := "" diff --git a/pkg/gateway/migration_job.go b/pkg/gateway/migration_job.go new file mode 100644 index 00000000..fba65549 --- /dev/null +++ b/pkg/gateway/migration_job.go @@ -0,0 +1,171 @@ +/* +* Copyright (c) 2026 Broadcom. All rights reserved. +* The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. +* All trademarks, trade names, service marks, and logos referenced +* herein belong to their respective companies. +* +* This software and all information contained therein is confidential +* and proprietary and shall not be duplicated, used, disclosed or +* disseminated in any way except as authorized by the applicable +* license agreement, without the express written permission of Broadcom. +* All authorized reproductions must be marked with this language. +* +* EXCEPT AS SET FORTH IN THE APPLICABLE LICENSE AGREEMENT, TO THE +* EXTENT PERMITTED BY APPLICABLE LAW OR AS AGREED BY BROADCOM IN ITS +* APPLICABLE LICENSE AGREEMENT, BROADCOM PROVIDES THIS DOCUMENTATION +* "AS IS" WITHOUT WARRANTY OF ANY KIND, INCLUDING WITHOUT LIMITATION, +* ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +* PURPOSE, OR. NONINFRINGEMENT. IN NO EVENT WILL BROADCOM BE LIABLE TO +* THE END USER OR ANY THIRD PARTY FOR ANY LOSS OR DAMAGE, DIRECT OR +* INDIRECT, FROM THE USE OF THIS DOCUMENTATION, INCLUDING WITHOUT LIMITATION, +* LOST PROFITS, LOST INVESTMENT, BUSINESS INTERRUPTION, GOODWILL, OR +* LOST DATA, EVEN IF BROADCOM IS EXPRESSLY ADVISED IN ADVANCE OF THE +* POSSIBILITY OF SUCH LOSS OR DAMAGE. +* +* AI assistance has been used to generate some or all contents of this file. That includes, but is not limited to, new code, modifying existing code, stylistic edits. + */ +package gateway + +import ( + "strings" + + securityv1 "github.com/caapim/layer7-operator/api/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { + gatewaySecretName := gw.Name + if gw.Spec.App.Management.DisklessConfig.Disabled { + gatewaySecretName = gw.Name + "-node-properties" + } + if gw.Spec.App.Management.SecretName != "" { + gatewaySecretName = gw.Spec.App.Management.SecretName + } + + schemaUpdateMode := "liquibase-only" + if gw.Spec.App.Management.Database.MigrationJob.ClearLocks { + schemaUpdateMode = "liquibase-only-with-unlock" + } + + // EXTRA_JAVA_ARGS is always set explicitly to control the schema update mode. + // SSG_DATABASE_JDBC_URL and Secret mounting depend on the diskless mode — see below. + envVars := []corev1.EnvVar{ + { + Name: "EXTRA_JAVA_ARGS", + Value: "-Dgateway.db.schema-update.mode=" + schemaUpdateMode, + }, + } + + envFrom := []corev1.EnvFromSource{ + { + ConfigMapRef: &corev1.ConfigMapEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: gw.Name}, + }, + }, + } + + var volumes []corev1.Volume + var volumeMounts []corev1.VolumeMount + + // Secret mounting strategy mirrors the main Gateway deployment and matches Helm behavior: + // + // Diskless mode (disklessConfig.disabled: false, the default): + // The Secret is exposed as environment variables via envFrom. The entrypoint reads + // SSG_DATABASE_JDBC_URL directly from env, so migrationJob.jdbcUrl overrides the + // main database.jdbcUrl via Kubernetes env precedence (explicit Env beats envFrom). + // + // Non-diskless mode (disklessConfig.disabled: true): + // The Secret is mounted as a node.properties file at the well-known path the + // entrypoint expects. With DISKLESS_CONFIG=false, the entrypoint calls + // loadFromNodeProperties() and reads l7.mysql.connection.url from the file. + // migrationJob.jdbcUrl is NOT used in this mode — node.properties always wins, + // consistent with Helm behavior when disklessConfig is disabled. + if !gw.Spec.App.Management.DisklessConfig.Disabled { + envFrom = append(envFrom, corev1.EnvFromSource{ + SecretRef: &corev1.SecretEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: gatewaySecretName}, + }, + }) + if gw.Spec.App.Management.Database.MigrationJob.JDBCUrl != "" { + // Append createDatabaseIfNotExist=false so that a misconfigured or mistyped + // jdbcUrl fails fast rather than silently creating an unintended database. + // entrypoint.sh only appends createDatabaseIfNotExist=true when the parameter + // is absent, so an explicit =false here takes precedence. + // Use '?' to start the query string if none exists yet, '&' otherwise. + jdbcUrl := gw.Spec.App.Management.Database.MigrationJob.JDBCUrl + if !strings.Contains(jdbcUrl, "createDatabaseIfNotExist") { + if strings.Contains(jdbcUrl, "?") { + jdbcUrl += "&createDatabaseIfNotExist=false" + } else { + jdbcUrl += "?createDatabaseIfNotExist=false" + } + } + envVars = append(envVars, corev1.EnvVar{ + Name: "SSG_DATABASE_JDBC_URL", + Value: jdbcUrl, + }) + } + } else { + optional := true + defaultMode := int32(0444) + volumes = append(volumes, corev1.Volume{ + Name: gw.Name + "-node-properties", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: gatewaySecretName, + DefaultMode: &defaultMode, + Optional: &optional, + }, + }, + }) + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: gw.Name + "-node-properties", + MountPath: "/opt/SecureSpan/Gateway/node/default/etc/conf/node.properties", + SubPath: "node.properties", + }) + } + + container := corev1.Container{ + Name: "db-migration", + Image: gw.Spec.App.Image, + ImagePullPolicy: gw.Spec.App.ImagePullPolicy, + EnvFrom: envFrom, + Env: envVars, + VolumeMounts: volumeMounts, + } + + var backoffLimit int32 = 1 + var activeDeadlineSeconds int64 = 300 + + if gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds != nil { + activeDeadlineSeconds = *gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds + } + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: gw.Name + "-db-migration", + Namespace: gw.Namespace, + Labels: map[string]string{"app": gw.Name + "-db-migration"}, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: &backoffLimit, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": gw.Name + "-db-migration"}, + }, + Spec: corev1.PodSpec{ + ActiveDeadlineSeconds: &activeDeadlineSeconds, + ServiceAccountName: gw.Spec.App.ServiceAccount.Name, + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{container}, + ImagePullSecrets: gw.Spec.App.ImagePullSecrets, + Volumes: volumes, + }, + }, + }, + } + + return job +} diff --git a/pkg/gateway/reconcile/migration_job.go b/pkg/gateway/reconcile/migration_job.go new file mode 100644 index 00000000..8e615830 --- /dev/null +++ b/pkg/gateway/reconcile/migration_job.go @@ -0,0 +1,126 @@ +/* +* Copyright (c) 2026 Broadcom. All rights reserved. +* The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. +* All trademarks, trade names, service marks, and logos referenced +* herein belong to their respective companies. +* +* This software and all information contained therein is confidential +* and proprietary and shall not be duplicated, used, disclosed or +* disseminated in any way except as authorized by the applicable +* license agreement, without the express written permission of Broadcom. +* All authorized reproductions must be marked with this language. +* +* EXCEPT AS SET FORTH IN THE APPLICABLE LICENSE AGREEMENT, TO THE +* EXTENT PERMITTED BY APPLICABLE LAW OR AS AGREED BY BROADCOM IN ITS +* APPLICABLE LICENSE AGREEMENT, BROADCOM PROVIDES THIS DOCUMENTATION +* "AS IS" WITHOUT WARRANTY OF ANY KIND, INCLUDING WITHOUT LIMITATION, +* ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +* PURPOSE, OR. NONINFRINGEMENT. IN NO EVENT WILL BROADCOM BE LIABLE TO +* THE END USER OR ANY THIRD PARTY FOR ANY LOSS OR DAMAGE, DIRECT OR +* INDIRECT, FROM THE USE OF THIS DOCUMENTATION, INCLUDING WITHOUT LIMITATION, +* LOST PROFITS, LOST INVESTMENT, BUSINESS INTERRUPTION, GOODWILL, OR +* LOST DATA, EVEN IF BROADCOM IS EXPRESSLY ADVISED IN ADVANCE OF THE +* POSSIBILITY OF SUCH LOSS OR DAMAGE. +* +* AI assistance has been used to generate some or all contents of this file. That includes, but is not limited to, new code, modifying existing code, stylistic edits. + */ +package reconcile + +import ( + "context" + "fmt" + + "github.com/caapim/layer7-operator/pkg/gateway" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func GatewayMigrationJob(ctx context.Context, params Params) error { + if !params.Instance.Spec.App.Management.Database.MigrationJob.Enabled { + return nil + } + + desiredJob := gateway.NewMigrationJob(params.Instance) + if err := controllerutil.SetControllerReference(params.Instance, desiredJob, params.Scheme); err != nil { + return fmt.Errorf("failed to set controller reference on migration job: %w", err) + } + + currentJob := &batchv1.Job{} + err := params.Client.Get(ctx, types.NamespacedName{Name: desiredJob.Name, Namespace: desiredJob.Namespace}, currentJob) + + if err != nil && k8serrors.IsNotFound(err) { + if err = params.Client.Create(ctx, desiredJob); err != nil { + return fmt.Errorf("failed creating migration job: %w", err) + } + params.Log.Info("created migration job", "name", desiredJob.Name, "namespace", desiredJob.Namespace) + return fmt.Errorf("migration job started, waiting for completion") + } else if err != nil { + return err + } + + // Replace the job if any migration-relevant spec field changed: image, jdbcUrl, + // clearLocks, or any other env var. This ensures that applying an updated CR + // (e.g. fixing a wrong jdbcUrl or upgrading the image) always triggers a fresh run. + if migrationJobSpecChanged(currentJob, desiredJob) { + params.Log.Info("migration job spec changed, replacing", "name", currentJob.Name) + propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) + if err := params.Client.Delete(ctx, currentJob, propagationPolicy); err != nil { + return fmt.Errorf("failed deleting stale migration job: %w", err) + } + return fmt.Errorf("deleted stale migration job, requeueing to create new one") + } + + if currentJob.Status.Succeeded > 0 { + params.Log.V(2).Info("migration job completed successfully") + return nil + } + + if currentJob.Status.Failed > 0 { + params.Log.Info("migration job failed — Gateway deployment is blocked. Investigate logs then delete the job to retry", + "name", currentJob.Name, + "namespace", currentJob.Namespace, + "fix", "kubectl delete job "+currentJob.Name+" -n "+currentJob.Namespace) + return fmt.Errorf("migration job %s/%s failed — Gateway deployment blocked pending manual intervention", currentJob.Namespace, currentJob.Name) + } + + params.Log.Info("migration job is active, waiting...") + return fmt.Errorf("migration job is active, waiting...") +} + +// migrationJobSpecChanged returns true if any migration-relevant container fields +// differ between the current (in-cluster) job and what the operator would create now. +// Comparing env vars by name→value catches jdbcUrl, clearLocks, and mode changes +// in addition to image upgrades. +func migrationJobSpecChanged(current, desired *batchv1.Job) bool { + if len(current.Spec.Template.Spec.Containers) == 0 || len(desired.Spec.Template.Spec.Containers) == 0 { + return false + } + cc := current.Spec.Template.Spec.Containers[0] + dc := desired.Spec.Template.Spec.Containers[0] + if cc.Image != dc.Image { + return true + } + return envVarsChanged(cc.Env, dc.Env) +} + +// envVarsChanged compares two env var slices by name→value, ignoring order. +func envVarsChanged(current, desired []corev1.EnvVar) bool { + cm := make(map[string]string, len(current)) + for _, e := range current { + cm[e.Name] = e.Value + } + if len(cm) != len(desired) { + return true + } + for _, e := range desired { + if cv, ok := cm[e.Name]; !ok || cv != e.Value { + return true + } + } + return false +} From 97963787cbf9c4fe84ccf8e3e1a538da444477bd Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Mon, 29 Jun 2026 22:33:10 -0700 Subject: [PATCH 02/12] feat: add pre-upgrade database migration job support Introduces an opt-in Kubernetes Job that runs Liquibase schema migrations before Gateway pods roll out during an upgrade. Gateway pods are blocked until the job succeeds, preventing pods from starting against an unmigrated schema. Key behaviors: - Works with Gateway 11.2.2 and later which provides four schema update modes in entrypoint.sh: default, skip, liquibase-only, liquibase-only-with-unlock - Migration job enabled via spec.app.management.database.migrationJob.enabled - clearLocks field releases stale Liquibase locks before migrating - Gateway pods automatically use skip mode when migration job is enabled - Job is auto-replaced when spec changes (image, jdbcUrl, clearLocks) - Failed jobs block the deployment and require manual deletion to retry - jdbcUrl override only applies in diskless mode; node.properties wins in non-diskless mode, consistent with Helm behavior --- config/crd/bases/security.brcmlabs.com_gateways.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/crd/bases/security.brcmlabs.com_gateways.yaml b/config/crd/bases/security.brcmlabs.com_gateways.yaml index a3dad749..ef735797 100644 --- a/config/crd/bases/security.brcmlabs.com_gateways.yaml +++ b/config/crd/bases/security.brcmlabs.com_gateways.yaml @@ -3550,8 +3550,8 @@ spec: description: Enabled or disabled type: boolean jdbcUrl: - description: JDBCUrl for the migration job (useful - for bypassing proxies) + description: JDBCUrl overrides the main database.jdbcUrl + for the migration job (e.g. type: string type: object password: From a3ad63619857871859e874f07da9b7d99e8a5a1c Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Mon, 29 Jun 2026 23:37:18 -0700 Subject: [PATCH 03/12] fix bugs. Improve error handling. --- api/v1/gateway_types.go | 4 +- api/v1/gateway_webhook.go | 6 ++ .../bases/security.brcmlabs.com_gateways.yaml | 2 +- internal/controller/gateway/controller.go | 8 ++ pkg/gateway/migration_job.go | 21 ++++- pkg/gateway/reconcile/migration_job.go | 94 ++++++++++++++++--- 6 files changed, 117 insertions(+), 18 deletions(-) diff --git a/api/v1/gateway_types.go b/api/v1/gateway_types.go index ac19ea24..60194922 100644 --- a/api/v1/gateway_types.go +++ b/api/v1/gateway_types.go @@ -706,7 +706,9 @@ type MigrationJob struct { // schema updates. Use with caution: forcefully releasing a lock while // another process is actively updating the schema can corrupt the database. ClearLocks bool `json:"clearLocks,omitempty"` - // ActiveDeadlineSeconds is the max duration the job is allowed to run + // ActiveDeadlineSeconds is the max duration each pod attempt is allowed to run. + // The job may retry once (backoffLimit=1), so total elapsed time can be up to + // 2× this value. Defaults to 300 seconds (5 minutes) per attempt. ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"` } diff --git a/api/v1/gateway_webhook.go b/api/v1/gateway_webhook.go index 705ba7c1..e1d05434 100644 --- a/api/v1/gateway_webhook.go +++ b/api/v1/gateway_webhook.go @@ -1,3 +1,4 @@ +// Copyright (c) 2026 Broadcom Inc. and its subsidiaries. All Rights Reserved. /* Copyright 2021. @@ -193,6 +194,11 @@ func validateGateway(r *Gateway) (admission.Warnings, error) { } } + if r.Spec.App.Management.Database.MigrationJob.Enabled && + !r.Spec.App.Management.Database.Enabled { + return warnings, fmt.Errorf("migrationJob.enabled requires database.enabled: true") + } + if r.Spec.App.Management.Service.Enabled { if r.Spec.App.Management.Service.Type != v1.ServiceTypeClusterIP && r.Spec.App.Management.Service.Type != v1.ServiceTypeLoadBalancer && r.Spec.App.Management.Service.Type != v1.ServiceTypeNodePort { return warnings, fmt.Errorf("please specify a valid management service type, valid types are LoadBalancer, ClusterIP and NodePort") diff --git a/config/crd/bases/security.brcmlabs.com_gateways.yaml b/config/crd/bases/security.brcmlabs.com_gateways.yaml index ef735797..50c6c02a 100644 --- a/config/crd/bases/security.brcmlabs.com_gateways.yaml +++ b/config/crd/bases/security.brcmlabs.com_gateways.yaml @@ -3539,7 +3539,7 @@ spec: properties: activeDeadlineSeconds: description: ActiveDeadlineSeconds is the max duration - the job is allowed to run + each pod attempt is allowed to... format: int64 type: integer clearLocks: diff --git a/internal/controller/gateway/controller.go b/internal/controller/gateway/controller.go index a4748aac..60544014 100644 --- a/internal/controller/gateway/controller.go +++ b/internal/controller/gateway/controller.go @@ -30,6 +30,7 @@ package gateway import ( "context" "encoding/json" + "errors" "fmt" "reflect" "strings" @@ -134,6 +135,13 @@ func (r *GatewayReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct start := time.Now() for _, op := range ops { if err = r.runOp(ctx, params, op); err != nil { + // ErrMigrationPending is a normal "not ready yet" state, not a failure. + // Requeue at a fixed short interval so the operator polls for job + // completion without applying exponential backoff or incrementing + // failure metrics. + if errors.Is(err, reconcile.ErrMigrationPending) { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } _ = captureMetrics(ctx, params, start, true, op.Name) return ctrl.Result{}, err } diff --git a/pkg/gateway/migration_job.go b/pkg/gateway/migration_job.go index fba65549..ee73c8df 100644 --- a/pkg/gateway/migration_job.go +++ b/pkg/gateway/migration_job.go @@ -108,7 +108,11 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { }) } } else { - optional := true + // Fix #5: optional must be false to match the main Gateway deployment. + // With optional: true, Kubernetes schedules the pod even when the Secret + // is missing, causing a silent empty mount and a confusing DB connection + // error instead of a clear "secret not found" Kubernetes event. + optional := false defaultMode := int32(0444) volumes = append(volumes, corev1.Volume{ Name: gw.Name + "-node-properties", @@ -143,6 +147,19 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { activeDeadlineSeconds = *gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds } + // Fix #9: apply the same ServiceAccount fallback logic as the main deployment + // so the migration pod runs under the same identity as Gateway pods. + // Without this, pods default to the namespace "default" service account when + // spec.app.serviceAccount.name is not explicitly set, which can fail in clusters + // with restrictive default SA policies. + serviceAccountName := gw.Spec.App.ServiceAccount.Name + if gw.Spec.App.ServiceAccount.Name == "" { + serviceAccountName = gw.Name + } + if gw.Spec.App.ServiceAccount == (securityv1.ServiceAccount{}) { + serviceAccountName = "default" + } + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: gw.Name + "-db-migration", @@ -157,7 +174,7 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { }, Spec: corev1.PodSpec{ ActiveDeadlineSeconds: &activeDeadlineSeconds, - ServiceAccountName: gw.Spec.App.ServiceAccount.Name, + ServiceAccountName: serviceAccountName, RestartPolicy: corev1.RestartPolicyNever, Containers: []corev1.Container{container}, ImagePullSecrets: gw.Spec.App.ImagePullSecrets, diff --git a/pkg/gateway/reconcile/migration_job.go b/pkg/gateway/reconcile/migration_job.go index 8e615830..a45ecac4 100644 --- a/pkg/gateway/reconcile/migration_job.go +++ b/pkg/gateway/reconcile/migration_job.go @@ -28,7 +28,9 @@ package reconcile import ( "context" + "errors" "fmt" + "reflect" "github.com/caapim/layer7-operator/pkg/gateway" batchv1 "k8s.io/api/batch/v1" @@ -40,8 +42,30 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) +// ErrMigrationPending is returned by GatewayMigrationJob when the migration job +// exists and has not yet completed (created, replaced, or still active). The +// controller treats this as a normal "not ready yet" state: it requeues with a +// fixed short interval rather than applying exponential backoff or incrementing +// failure metrics. +var ErrMigrationPending = errors.New("migration pending") + func GatewayMigrationJob(ctx context.Context, params Params) error { - if !params.Instance.Spec.App.Management.Database.MigrationJob.Enabled { + migrationEnabled := params.Instance.Spec.App.Management.Database.MigrationJob.Enabled + databaseEnabled := params.Instance.Spec.App.Management.Database.Enabled + + // Fix #2/#8: when the migration job is disabled (or database is disabled), + // clean up any leftover job so that toggling enabled→false doesn't orphan + // a completed or failed Job in the namespace. + if !migrationEnabled || !databaseEnabled { + existingJob := &batchv1.Job{} + jobName := params.Instance.Name + "-db-migration" + if err := params.Client.Get(ctx, types.NamespacedName{ + Name: jobName, + Namespace: params.Instance.Namespace, + }, existingJob); err == nil { + propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) + _ = params.Client.Delete(ctx, existingJob, propagationPolicy) + } return nil } @@ -58,21 +82,21 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return fmt.Errorf("failed creating migration job: %w", err) } params.Log.Info("created migration job", "name", desiredJob.Name, "namespace", desiredJob.Namespace) - return fmt.Errorf("migration job started, waiting for completion") + return fmt.Errorf("%w: migration job started, waiting for completion", ErrMigrationPending) } else if err != nil { return err } - // Replace the job if any migration-relevant spec field changed: image, jdbcUrl, - // clearLocks, or any other env var. This ensures that applying an updated CR - // (e.g. fixing a wrong jdbcUrl or upgrading the image) always triggers a fresh run. + // Replace the job if any migration-relevant spec field changed. + // This covers image, env vars (jdbcUrl, clearLocks mode), EnvFrom (secretName + // rotation), Volumes (non-diskless secret path), and activeDeadlineSeconds. if migrationJobSpecChanged(currentJob, desiredJob) { params.Log.Info("migration job spec changed, replacing", "name", currentJob.Name) propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) if err := params.Client.Delete(ctx, currentJob, propagationPolicy); err != nil { return fmt.Errorf("failed deleting stale migration job: %w", err) } - return fmt.Errorf("deleted stale migration job, requeueing to create new one") + return fmt.Errorf("%w: deleted stale migration job, requeueing to create new one", ErrMigrationPending) } if currentJob.Status.Succeeded > 0 { @@ -80,7 +104,11 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return nil } - if currentJob.Status.Failed > 0 { + // Only declare terminal failure when no pod attempt is still active. + // With backoffLimit=1, Kubernetes briefly shows Active=1, Failed=1 while + // pod-2 (the retry) is running. Declaring failure at that point would kill + // a healthy retry and require unnecessary manual intervention. + if currentJob.Status.Active == 0 && currentJob.Status.Failed > 0 { params.Log.Info("migration job failed — Gateway deployment is blocked. Investigate logs then delete the job to retry", "name", currentJob.Name, "namespace", currentJob.Namespace, @@ -88,16 +116,24 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return fmt.Errorf("migration job %s/%s failed — Gateway deployment blocked pending manual intervention", currentJob.Namespace, currentJob.Name) } + // Job is still active (or briefly between pod attempts). Return ErrMigrationPending + // so the controller requeues at a fixed interval without incrementing failure metrics. params.Log.Info("migration job is active, waiting...") - return fmt.Errorf("migration job is active, waiting...") + return fmt.Errorf("%w: migration job is active", ErrMigrationPending) } -// migrationJobSpecChanged returns true if any migration-relevant container fields -// differ between the current (in-cluster) job and what the operator would create now. -// Comparing env vars by name→value catches jdbcUrl, clearLocks, and mode changes -// in addition to image upgrades. +// migrationJobSpecChanged returns true if any migration-relevant field differs +// between the current (in-cluster) job and what the operator would create now. +// Compares image, env vars, EnvFrom (covers secretName changes), Volumes (covers +// non-diskless secret path changes), and activeDeadlineSeconds. func migrationJobSpecChanged(current, desired *batchv1.Job) bool { - if len(current.Spec.Template.Spec.Containers) == 0 || len(desired.Spec.Template.Spec.Containers) == 0 { + // A zero-container current job is malformed — force replacement so + // it doesn't permanently block the Deployment step. + if len(current.Spec.Template.Spec.Containers) == 0 { + return true + } + // desired is always built by NewMigrationJob and always has one container. + if len(desired.Spec.Template.Spec.Containers) == 0 { return false } cc := current.Spec.Template.Spec.Containers[0] @@ -105,7 +141,26 @@ func migrationJobSpecChanged(current, desired *batchv1.Job) bool { if cc.Image != dc.Image { return true } - return envVarsChanged(cc.Env, dc.Env) + if envVarsChanged(cc.Env, dc.Env) { + return true + } + // Compare EnvFrom so a secretName rotation triggers job replacement. + if !reflect.DeepEqual(cc.EnvFrom, dc.EnvFrom) { + return true + } + // Compare Volumes so non-diskless secret path changes are detected. + if !reflect.DeepEqual(current.Spec.Template.Spec.Volumes, desired.Spec.Template.Spec.Volumes) { + return true + } + // Detect activeDeadlineSeconds changes so users can tune slow migrations + // without having to manually delete the failed job. + if !int64PtrEqual( + current.Spec.Template.Spec.ActiveDeadlineSeconds, + desired.Spec.Template.Spec.ActiveDeadlineSeconds, + ) { + return true + } + return false } // envVarsChanged compares two env var slices by name→value, ignoring order. @@ -124,3 +179,14 @@ func envVarsChanged(current, desired []corev1.EnvVar) bool { } return false } + +// int64PtrEqual returns true if both pointers are nil or both point to equal values. +func int64PtrEqual(a, b *int64) bool { + if a == nil && b == nil { + return true + } + if a == nil || b == nil { + return false + } + return *a == *b +} From b15b0f31f156cebe91a4af15abec8f8fd81d555a Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Mon, 29 Jun 2026 23:41:05 -0700 Subject: [PATCH 04/12] fix to modify / characters in branch names to be replaced with hyphen. --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7a325314..f64c43f6 100644 --- a/Makefile +++ b/Makefile @@ -59,7 +59,10 @@ endif OPERATOR_SDK_VERSION ?= v1.33.0 # Image URL to use all building/pushing image targets #IMG ?= docker.io/layer7api/layer7-operator:v$(VERSION) -IMG ?= $(IMAGE_TAG_BASE):$(VERSION) +# Sanitize VERSION for use as a Docker image tag: slashes (from branch names like +# feature/foo) are not valid in tags and must be replaced with hyphens. +DOCKER_VERSION := $(subst /,-,$(VERSION)) +IMG ?= $(IMAGE_TAG_BASE):$(DOCKER_VERSION) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.30.0 From f595a6f034c7db27aafa531b157162274867a60c Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Tue, 30 Jun 2026 16:13:23 -0700 Subject: [PATCH 05/12] =?UTF-8?q?Replace=20job-existence-based=20migration?= =?UTF-8?q?=20tracking=20with=20persistent=20status=20in=20Gateway.Status.?= =?UTF-8?q?MigrationStatus.=20This=20addresses=20issue=20with=20previous?= =?UTF-8?q?=20implementation=20would=20re-run=20migrations=20on=20every=20?= =?UTF-8?q?12-hour=20reconcile=20if=20the=20completed=20Job=20had=20been?= =?UTF-8?q?=20deleted.=20Changes:=20-=20Add=20MigrationStatus=20struct=20(?= =?UTF-8?q?SpecHash,=20Complete)=20to=20GatewayStatus=20in=20=20=20gateway?= =?UTF-8?q?=5Ftypes.go.=20Once=20Complete=20is=20true,=20GatewayMigrationJ?= =?UTF-8?q?ob=20returns=20=20=20immediately=20on=20all=20future=20reconcil?= =?UTF-8?q?es=20regardless=20of=20Job=20existence.=20-=20Add=20DeepCopyInt?= =?UTF-8?q?o/DeepCopy=20for=20MigrationStatus=20in=20zz=5Fgenerated.deepco?= =?UTF-8?q?py.go.=20-=20Update=20CRD=20schema=20(security.brcmlabs.com=5Fg?= =?UTF-8?q?ateways.yaml)=20to=20include=20the=20=20=20new=20migrationStatu?= =?UTF-8?q?s=20status=20fields.=20-=20Rewrite=20GatewayMigrationJob=20in?= =?UTF-8?q?=20reconcile/migration=5Fjob.go:=20=20=20-=20Compute=20a=2016-c?= =?UTF-8?q?har=20spec=20hash=20from=20image,=20effective=20jdbcUrl,=20clea?= =?UTF-8?q?rLocks,=20=20=20=20=20and=20activeDeadlineSeconds.=20A=20hash?= =?UTF-8?q?=20change=20(e.g.=20image=20upgrade)=20resets=20=20=20=20=20sta?= =?UTF-8?q?tus=20and=20triggers=20a=20fresh=20migration=20automatically.?= =?UTF-8?q?=20=20=20-=20On=20first=20enable:=20write=20hash=20to=20status,?= =?UTF-8?q?=20create=20Job,=20wait.=20=20=20-=20On=20Job=20success:=20writ?= =?UTF-8?q?e=20Complete=3Dtrue=20to=20status,=20unblock=20Deployment.=20?= =?UTF-8?q?=20=20-=20On=20Job=20failure=20(both=20pod=20attempts=20exhaust?= =?UTF-8?q?ed):=20log=20error=20with=20exact=20=20=20=20=20kubectl=20delet?= =?UTF-8?q?e=20command,=20block=20Deployment.=20User=20deletes=20Job=20to?= =?UTF-8?q?=20retry.=20=20=20-=20On=20disabled:=20clean=20up=20any=20orpha?= =?UTF-8?q?ned=20Job.=20=20=20-=20Remove=20migrationJobSpecChanged=20?= =?UTF-8?q?=E2=80=94=20spec=20change=20detection=20is=20now=20fully=20=20?= =?UTF-8?q?=20=20=20handled=20by=20the=20hash=20comparison.=20Recovery=20a?= =?UTF-8?q?fter=20failure:=20fix=20the=20root=20cause=20(run=20kubectl=20p?= =?UTF-8?q?atch=20with=20clearLocks:true=20or=20restore=20DB=20to=20pre-up?= =?UTF-8?q?grade=20state=20if=20partially=20migrated)=20,=20then:=20=20=20?= =?UTF-8?q?kubectl=20delete=20job=20-db-migration=20-n=20?= =?UTF-8?q?=20The=20operator=20automatically=20creates=20a=20new=20Job.=20?= =?UTF-8?q?No=20kubectl=20apply=20needed.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/v1/gateway_types.go | 16 ++ api/v1/zz_generated.deepcopy.go | 16 ++ .../bases/security.brcmlabs.com_gateways.yaml | 13 ++ pkg/gateway/reconcile/migration_job.go | 182 ++++++++++-------- 4 files changed, 144 insertions(+), 83 deletions(-) diff --git a/api/v1/gateway_types.go b/api/v1/gateway_types.go index 60194922..ffcdee84 100644 --- a/api/v1/gateway_types.go +++ b/api/v1/gateway_types.go @@ -65,6 +65,20 @@ type GatewayList struct { Items []Gateway `json:"items"` } +// MigrationStatus tracks the state of the pre-upgrade database migration job. +// It is persisted in Gateway.Status so that completed migrations survive job +// deletion and are not re-run on daily reconciles or operator restarts. +type MigrationStatus struct { + // SpecHash is a short hash of the migration-relevant spec fields (image, + // effective jdbcUrl, clearLocks). When any of these change the hash changes + // and a fresh migration job is triggered. + SpecHash string `json:"specHash,omitempty"` + // Complete indicates that the migration job succeeded for the current SpecHash. + // Once true, GatewayMigrationJob skips job management entirely and the + // Deployment step is unblocked — regardless of whether the Job still exists. + Complete bool `json:"complete,omitempty"` +} + // GatewayStatus defines the observed state of Gateways type GatewayStatus struct { // Host is the Gateway Cluster Hostname @@ -120,6 +134,8 @@ type GatewayStatus struct { LastAppliedExternalCerts map[string][]string `json:"lastAppliedExternalCerts,omitempty"` // LastAppliedOtkFipsCerts tracks which OTK FIPS user certificates have been applied LastAppliedOtkFipsCerts map[string][]string `json:"lastAppliedOtkFipsCerts,omitempty"` + // MigrationStatus tracks the state of the pre-upgrade database migration job. + MigrationStatus MigrationStatus `json:"migrationStatus,omitempty"` } // GatewayState tracks the status of Gateway Resources diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 23844085..f4746dc9 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -740,6 +740,7 @@ func (in *GatewayStatus) DeepCopyInto(out *GatewayStatus) { (*out)[key] = outVal } } + out.MigrationStatus = in.MigrationStatus } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GatewayStatus. @@ -1025,6 +1026,21 @@ func (in *MigrationJob) DeepCopy() *MigrationJob { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MigrationStatus) DeepCopyInto(out *MigrationStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationStatus. +func (in *MigrationStatus) DeepCopy() *MigrationStatus { + if in == nil { + return nil + } + out := new(MigrationStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Notification) DeepCopyInto(out *Notification) { *out = *in diff --git a/config/crd/bases/security.brcmlabs.com_gateways.yaml b/config/crd/bases/security.brcmlabs.com_gateways.yaml index 50c6c02a..a767d624 100644 --- a/config/crd/bases/security.brcmlabs.com_gateways.yaml +++ b/config/crd/bases/security.brcmlabs.com_gateways.yaml @@ -6785,6 +6785,19 @@ spec: description: Management Pod is a Gateway with a special annotation is used as a... type: string + migrationStatus: + description: MigrationStatus tracks the state of the pre-upgrade database + migration job. + properties: + complete: + description: Complete indicates that the migration job succeeded + for the current... + type: boolean + specHash: + description: SpecHash is a short hash of the migration-relevant + spec fields (image,... + type: string + type: object phase: description: PodPhase is a label for the condition of a pod at the current time. diff --git a/pkg/gateway/reconcile/migration_job.go b/pkg/gateway/reconcile/migration_job.go index a45ecac4..2f1e3a1b 100644 --- a/pkg/gateway/reconcile/migration_job.go +++ b/pkg/gateway/reconcile/migration_job.go @@ -28,13 +28,14 @@ package reconcile import ( "context" + "crypto/sha256" "errors" "fmt" - "reflect" + "strconv" + securityv1 "github.com/caapim/layer7-operator/api/v1" "github.com/caapim/layer7-operator/pkg/gateway" batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -49,13 +50,22 @@ import ( // failure metrics. var ErrMigrationPending = errors.New("migration pending") +// GatewayMigrationJob manages the lifecycle of the pre-upgrade database migration +// job. It uses Gateway.Status.MigrationStatus as the authoritative source of truth +// for whether the migration has completed, so that: +// +// - Completed migrations are not re-run on daily reconciles or operator restarts. +// - Completed migrations survive Job deletion/GC without triggering a re-run. +// - A spec change (image upgrade, jdbcUrl correction, clearLocks toggle) resets +// the status and triggers a fresh migration run automatically. +// +// The Deployment step is only unblocked when Status.MigrationStatus.Complete is true. func GatewayMigrationJob(ctx context.Context, params Params) error { migrationEnabled := params.Instance.Spec.App.Management.Database.MigrationJob.Enabled databaseEnabled := params.Instance.Spec.App.Management.Database.Enabled - // Fix #2/#8: when the migration job is disabled (or database is disabled), - // clean up any leftover job so that toggling enabled→false doesn't orphan - // a completed or failed Job in the namespace. + // When migration is disabled (or database is disabled), clean up any leftover + // job so that toggling enabled→false doesn't orphan a job in the namespace. if !migrationEnabled || !databaseEnabled { existingJob := &batchv1.Job{} jobName := params.Instance.Name + "-db-migration" @@ -69,6 +79,46 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return nil } + desiredHash := migrationSpecHash(params.Instance) + currentStatus := params.Instance.Status.MigrationStatus + + // Fast path: migration already completed for the current spec. + // This allows daily reconciles and operator restarts to skip migration + // management entirely — the status persists regardless of Job existence. + if currentStatus.SpecHash == desiredHash && currentStatus.Complete { + params.Log.V(2).Info("migration already complete for current spec, skipping", + "hash", desiredHash) + return nil + } + + // Spec changed (image upgrade, jdbcUrl correction, clearLocks toggle, etc.): + // reset the status and delete the old job so a fresh migration runs. + if currentStatus.SpecHash != "" && currentStatus.SpecHash != desiredHash { + params.Log.Info("migration spec changed, resetting status and replacing job", + "oldHash", currentStatus.SpecHash, + "newHash", desiredHash) + if err := setMigrationStatus(ctx, params, securityv1.MigrationStatus{SpecHash: desiredHash}); err != nil { + return err + } + oldJob := &batchv1.Job{} + jobName := params.Instance.Name + "-db-migration" + if getErr := params.Client.Get(ctx, types.NamespacedName{Name: jobName, Namespace: params.Instance.Namespace}, oldJob); getErr == nil { + propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) + if delErr := params.Client.Delete(ctx, oldJob, propagationPolicy); delErr != nil { + return fmt.Errorf("failed deleting stale migration job: %w", delErr) + } + } + return fmt.Errorf("%w: migration spec changed, requeueing to create new job", ErrMigrationPending) + } + + // Record the desired hash in status if this is the first time we see it. + if currentStatus.SpecHash == "" { + if err := setMigrationStatus(ctx, params, securityv1.MigrationStatus{SpecHash: desiredHash}); err != nil { + return err + } + } + + // Build the desired Job and look for the current in-cluster Job. desiredJob := gateway.NewMigrationJob(params.Instance) if err := controllerutil.SetControllerReference(params.Instance, desiredJob, params.Scheme); err != nil { return fmt.Errorf("failed to set controller reference on migration job: %w", err) @@ -78,6 +128,8 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { err := params.Client.Get(ctx, types.NamespacedName{Name: desiredJob.Name, Namespace: desiredJob.Namespace}, currentJob) if err != nil && k8serrors.IsNotFound(err) { + // Job does not exist — create it. This covers both the initial run and + // the user-deletes-failed-job-to-retry flow. if err = params.Client.Create(ctx, desiredJob); err != nil { return fmt.Errorf("failed creating migration job: %w", err) } @@ -87,27 +139,25 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return err } - // Replace the job if any migration-relevant spec field changed. - // This covers image, env vars (jdbcUrl, clearLocks mode), EnvFrom (secretName - // rotation), Volumes (non-diskless secret path), and activeDeadlineSeconds. - if migrationJobSpecChanged(currentJob, desiredJob) { - params.Log.Info("migration job spec changed, replacing", "name", currentJob.Name) - propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) - if err := params.Client.Delete(ctx, currentJob, propagationPolicy); err != nil { - return fmt.Errorf("failed deleting stale migration job: %w", err) - } - return fmt.Errorf("%w: deleted stale migration job, requeueing to create new one", ErrMigrationPending) - } - + // Job succeeded: persist completion in status so future reconciles skip migration. + // Because the controller owns the Job (Owns(&batchv1.Job{})), Job completion + // triggers a reconcile event, which is how we reach this branch promptly. if currentJob.Status.Succeeded > 0 { - params.Log.V(2).Info("migration job completed successfully") + params.Log.Info("migration job completed successfully, updating status", + "name", currentJob.Name, "namespace", currentJob.Namespace) + if err := setMigrationStatus(ctx, params, securityv1.MigrationStatus{ + SpecHash: desiredHash, + Complete: true, + }); err != nil { + return err + } return nil } // Only declare terminal failure when no pod attempt is still active. // With backoffLimit=1, Kubernetes briefly shows Active=1, Failed=1 while - // pod-2 (the retry) is running. Declaring failure at that point would kill - // a healthy retry and require unnecessary manual intervention. + // pod-2 (the retry) is running. Declaring failure then would kill a healthy + // retry and require unnecessary manual intervention. if currentJob.Status.Active == 0 && currentJob.Status.Failed > 0 { params.Log.Info("migration job failed — Gateway deployment is blocked. Investigate logs then delete the job to retry", "name", currentJob.Name, @@ -116,77 +166,43 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return fmt.Errorf("migration job %s/%s failed — Gateway deployment blocked pending manual intervention", currentJob.Namespace, currentJob.Name) } - // Job is still active (or briefly between pod attempts). Return ErrMigrationPending + // Job is still running (or briefly between pod attempts). Return ErrMigrationPending // so the controller requeues at a fixed interval without incrementing failure metrics. params.Log.Info("migration job is active, waiting...") return fmt.Errorf("%w: migration job is active", ErrMigrationPending) } -// migrationJobSpecChanged returns true if any migration-relevant field differs -// between the current (in-cluster) job and what the operator would create now. -// Compares image, env vars, EnvFrom (covers secretName changes), Volumes (covers -// non-diskless secret path changes), and activeDeadlineSeconds. -func migrationJobSpecChanged(current, desired *batchv1.Job) bool { - // A zero-container current job is malformed — force replacement so - // it doesn't permanently block the Deployment step. - if len(current.Spec.Template.Spec.Containers) == 0 { - return true - } - // desired is always built by NewMigrationJob and always has one container. - if len(desired.Spec.Template.Spec.Containers) == 0 { - return false - } - cc := current.Spec.Template.Spec.Containers[0] - dc := desired.Spec.Template.Spec.Containers[0] - if cc.Image != dc.Image { - return true - } - if envVarsChanged(cc.Env, dc.Env) { - return true - } - // Compare EnvFrom so a secretName rotation triggers job replacement. - if !reflect.DeepEqual(cc.EnvFrom, dc.EnvFrom) { - return true - } - // Compare Volumes so non-diskless secret path changes are detected. - if !reflect.DeepEqual(current.Spec.Template.Spec.Volumes, desired.Spec.Template.Spec.Volumes) { - return true - } - // Detect activeDeadlineSeconds changes so users can tune slow migrations - // without having to manually delete the failed job. - if !int64PtrEqual( - current.Spec.Template.Spec.ActiveDeadlineSeconds, - desired.Spec.Template.Spec.ActiveDeadlineSeconds, - ) { - return true - } - return false -} +// migrationSpecHash returns a 16-character hex hash of the migration-relevant spec +// fields: image, effective jdbcUrl, clearLocks flag, and activeDeadlineSeconds. +// When any of these change the hash changes and a fresh migration job is triggered. +func migrationSpecHash(gw *securityv1.Gateway) string { + h := sha256.New() + h.Write([]byte(gw.Spec.App.Image)) -// envVarsChanged compares two env var slices by name→value, ignoring order. -func envVarsChanged(current, desired []corev1.EnvVar) bool { - cm := make(map[string]string, len(current)) - for _, e := range current { - cm[e.Name] = e.Value - } - if len(cm) != len(desired) { - return true + // Use the migration-specific jdbcUrl when set; otherwise fall back to the + // main database URL. This mirrors what NewMigrationJob passes to the container. + jdbcUrl := gw.Spec.App.Management.Database.JDBCUrl + if gw.Spec.App.Management.Database.MigrationJob.JDBCUrl != "" { + jdbcUrl = gw.Spec.App.Management.Database.MigrationJob.JDBCUrl } - for _, e := range desired { - if cv, ok := cm[e.Name]; !ok || cv != e.Value { - return true - } + h.Write([]byte(jdbcUrl)) + + h.Write([]byte(strconv.FormatBool(gw.Spec.App.Management.Database.MigrationJob.ClearLocks))) + + if gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds != nil { + h.Write([]byte(strconv.FormatInt(*gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds, 10))) } - return false + + return fmt.Sprintf("%x", h.Sum(nil))[:16] } -// int64PtrEqual returns true if both pointers are nil or both point to equal values. -func int64PtrEqual(a, b *int64) bool { - if a == nil && b == nil { - return true - } - if a == nil || b == nil { - return false - } - return *a == *b +// setMigrationStatus persists the given MigrationStatus into Gateway.Status. +// It mutates params.Instance.Status in-place so that the running reconcile sees +// the updated value without a re-fetch. +func setMigrationStatus(ctx context.Context, params Params, status securityv1.MigrationStatus) error { + params.Instance.Status.MigrationStatus = status + if err := params.Client.Status().Update(ctx, params.Instance); err != nil { + return fmt.Errorf("failed to update migration status: %w", err) + } + return nil } From 994e2f413e29eb1dec957de0a180e46b912010d2 Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Tue, 30 Jun 2026 17:19:36 -0700 Subject: [PATCH 06/12] improve re-use of the MigrationJobName function. --- pkg/gateway/migration_job.go | 14 ++- pkg/gateway/reconcile/migration_job.go | 127 ++++++++++++++++++------- 2 files changed, 104 insertions(+), 37 deletions(-) diff --git a/pkg/gateway/migration_job.go b/pkg/gateway/migration_job.go index ee73c8df..211a132c 100644 --- a/pkg/gateway/migration_job.go +++ b/pkg/gateway/migration_job.go @@ -35,6 +35,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// MigrationJobName returns the canonical name of the migration Job for a Gateway. +// Exported so pkg/gateway/reconcile can derive the same name without duplicating +// the "-db-migration" suffix convention. +func MigrationJobName(gw *securityv1.Gateway) string { + return gw.Name + "-db-migration" +} + func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { gatewaySecretName := gw.Name if gw.Spec.App.Management.DisklessConfig.Disabled { @@ -160,17 +167,18 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { serviceAccountName = "default" } + jobName := MigrationJobName(gw) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ - Name: gw.Name + "-db-migration", + Name: jobName, Namespace: gw.Namespace, - Labels: map[string]string{"app": gw.Name + "-db-migration"}, + Labels: map[string]string{"app": jobName}, }, Spec: batchv1.JobSpec{ BackoffLimit: &backoffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"app": gw.Name + "-db-migration"}, + Labels: map[string]string{"app": jobName}, }, Spec: corev1.PodSpec{ ActiveDeadlineSeconds: &activeDeadlineSeconds, diff --git a/pkg/gateway/reconcile/migration_job.go b/pkg/gateway/reconcile/migration_job.go index 2f1e3a1b..26f2956f 100644 --- a/pkg/gateway/reconcile/migration_job.go +++ b/pkg/gateway/reconcile/migration_job.go @@ -54,10 +54,10 @@ var ErrMigrationPending = errors.New("migration pending") // job. It uses Gateway.Status.MigrationStatus as the authoritative source of truth // for whether the migration has completed, so that: // -// - Completed migrations are not re-run on daily reconciles or operator restarts. +// - Completed migrations are not re-run on 12-hour reconciles or operator restarts. // - Completed migrations survive Job deletion/GC without triggering a re-run. -// - A spec change (image upgrade, jdbcUrl correction, clearLocks toggle) resets -// the status and triggers a fresh migration run automatically. +// - A spec change (image upgrade, jdbcUrl correction, clearLocks toggle, etc.) +// resets the status and triggers a fresh migration run automatically. // // The Deployment step is only unblocked when Status.MigrationStatus.Complete is true. func GatewayMigrationJob(ctx context.Context, params Params) error { @@ -67,15 +67,12 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { // When migration is disabled (or database is disabled), clean up any leftover // job so that toggling enabled→false doesn't orphan a job in the namespace. if !migrationEnabled || !databaseEnabled { - existingJob := &batchv1.Job{} - jobName := params.Instance.Name + "-db-migration" - if err := params.Client.Get(ctx, types.NamespacedName{ - Name: jobName, + staleJob := &batchv1.Job{ObjectMeta: metav1.ObjectMeta{ + Name: gateway.MigrationJobName(params.Instance), Namespace: params.Instance.Namespace, - }, existingJob); err == nil { - propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) - _ = params.Client.Delete(ctx, existingJob, propagationPolicy) - } + }} + propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationBackground) + _ = client.IgnoreNotFound(params.Client.Delete(ctx, staleJob, propagationPolicy)) return nil } @@ -83,7 +80,7 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { currentStatus := params.Instance.Status.MigrationStatus // Fast path: migration already completed for the current spec. - // This allows daily reconciles and operator restarts to skip migration + // This allows 12-hour reconciles and operator restarts to skip migration // management entirely — the status persists regardless of Job existence. if currentStatus.SpecHash == desiredHash && currentStatus.Complete { params.Log.V(2).Info("migration already complete for current spec, skipping", @@ -91,39 +88,53 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return nil } - // Spec changed (image upgrade, jdbcUrl correction, clearLocks toggle, etc.): - // reset the status and delete the old job so a fresh migration runs. + // Spec changed (image upgrade, jdbcUrl correction, clearLocks/secretName/ + // disklessConfig toggle, etc.): delete the old Job first, then update status. + // + // Ordering matters: writing status first would update SpecHash to + // desiredHash before deletion, making this branch unreachable on the next + // reconcile if the deletion failed — turning a retryable error into a silent + // permanent skip. Instead we delete first and only write status after the + // delete succeeds or the job is already gone. + // + // No preceding Get (Fix 6): Delete with IgnoreNotFound is one fewer API + // round-trip and removes a swallowed-error opportunity. + // + // Background propagation (Fix 2): removes the Job object from etcd immediately + // so the next reconcile's Get cannot return the old (stale) object while it + // is mid-teardown, closing the foreground-deletion timing race. if currentStatus.SpecHash != "" && currentStatus.SpecHash != desiredHash { - params.Log.Info("migration spec changed, resetting status and replacing job", + params.Log.Info("migration spec changed, replacing job", "oldHash", currentStatus.SpecHash, "newHash", desiredHash) + staleJob := &batchv1.Job{ObjectMeta: metav1.ObjectMeta{ + Name: gateway.MigrationJobName(params.Instance), + Namespace: params.Instance.Namespace, + }} + propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationBackground) + if delErr := params.Client.Delete(ctx, staleJob, propagationPolicy); client.IgnoreNotFound(delErr) != nil { + return fmt.Errorf("failed deleting stale migration job: %w", delErr) + } if err := setMigrationStatus(ctx, params, securityv1.MigrationStatus{SpecHash: desiredHash}); err != nil { return err } - oldJob := &batchv1.Job{} - jobName := params.Instance.Name + "-db-migration" - if getErr := params.Client.Get(ctx, types.NamespacedName{Name: jobName, Namespace: params.Instance.Namespace}, oldJob); getErr == nil { - propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationForeground) - if delErr := params.Client.Delete(ctx, oldJob, propagationPolicy); delErr != nil { - return fmt.Errorf("failed deleting stale migration job: %w", delErr) - } - } return fmt.Errorf("%w: migration spec changed, requeueing to create new job", ErrMigrationPending) } - // Record the desired hash in status if this is the first time we see it. + // Build the desired Job now — needed both for creation and for the live + // sanity check (Fix 4+5) further below. + desiredJob := gateway.NewMigrationJob(params.Instance) + if err := controllerutil.SetControllerReference(params.Instance, desiredJob, params.Scheme); err != nil { + return fmt.Errorf("failed to set controller reference on migration job: %w", err) + } + + // Record the desired hash in status on first enable (SpecHash is empty). if currentStatus.SpecHash == "" { if err := setMigrationStatus(ctx, params, securityv1.MigrationStatus{SpecHash: desiredHash}); err != nil { return err } } - // Build the desired Job and look for the current in-cluster Job. - desiredJob := gateway.NewMigrationJob(params.Instance) - if err := controllerutil.SetControllerReference(params.Instance, desiredJob, params.Scheme); err != nil { - return fmt.Errorf("failed to set controller reference on migration job: %w", err) - } - currentJob := &batchv1.Job{} err := params.Client.Get(ctx, types.NamespacedName{Name: desiredJob.Name, Namespace: desiredJob.Namespace}, currentJob) @@ -139,6 +150,34 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return err } + // Don't trust Status on a Job that has already been requested for + // deletion — its Succeeded/Failed counts still reflect the previous run. + // Background deletion removes the object immediately, but a webhook delay or + // brief informer lag can still deliver a stale cached object for a short window. + if currentJob.DeletionTimestamp != nil { + return fmt.Errorf("%w: migration job is being deleted, waiting", ErrMigrationPending) + } + + // Sanity-check the live Job against the desired spec before reading + // its Status. Re-adds the zero-container guard that existed in the previous + // implementation, and extends it with an image check. + // + // If the container count is zero the Job is malformed (manual edit, webhook + // interference) and must be replaced. If the image differs the Job carries a + // stale spec that somehow escaped hash-based detection (e.g., a field not yet + // covered by migrationSpecHash); replacing it is the safe fallback. + // + // In both cases: delete with Background propagation and return ErrMigrationPending + // so the next reconcile creates a fresh Job. + if len(currentJob.Spec.Template.Spec.Containers) == 0 || + currentJob.Spec.Template.Spec.Containers[0].Image != desiredJob.Spec.Template.Spec.Containers[0].Image { + params.Log.Info("replacing malformed or stale migration job", + "name", currentJob.Name, "namespace", currentJob.Namespace) + propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationBackground) + _ = client.IgnoreNotFound(params.Client.Delete(ctx, currentJob, propagationPolicy)) + return fmt.Errorf("%w: replacing malformed or stale migration job", ErrMigrationPending) + } + // Job succeeded: persist completion in status so future reconciles skip migration. // Because the controller owns the Job (Owns(&batchv1.Job{})), Job completion // triggers a reconcile event, which is how we reach this branch promptly. @@ -156,7 +195,7 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { // Only declare terminal failure when no pod attempt is still active. // With backoffLimit=1, Kubernetes briefly shows Active=1, Failed=1 while - // pod-2 (the retry) is running. Declaring failure then would kill a healthy + // pod-2 (the retry) is running. Declaring failure then would block a healthy // retry and require unnecessary manual intervention. if currentJob.Status.Active == 0 && currentJob.Status.Failed > 0 { params.Log.Info("migration job failed — Gateway deployment is blocked. Investigate logs then delete the job to retry", @@ -172,9 +211,17 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return fmt.Errorf("%w: migration job is active", ErrMigrationPending) } -// migrationSpecHash returns a 16-character hex hash of the migration-relevant spec -// fields: image, effective jdbcUrl, clearLocks flag, and activeDeadlineSeconds. -// When any of these change the hash changes and a fresh migration job is triggered. +// migrationSpecHash returns a 16-character hex hash of all Gateway spec fields +// that affect the Job built by NewMigrationJob. When any of these change, the +// hash changes, status is reset, and a fresh migration job is triggered. +// +// Fields hashed: +// - Image: new version = new Liquibase changesets to apply +// - effective jdbcUrl: different target DB requires its own migration run +// - clearLocks: changes the schema-update mode flag passed to the container +// - activeDeadlineSeconds: changing this is typically to fix a too-short timeout +// - DisklessConfig.Disabled: determines Secret-mounting strategy (envFrom vs Volume) +// - effective secretName: controls which Secret the Job container references func migrationSpecHash(gw *securityv1.Gateway) string { h := sha256.New() h.Write([]byte(gw.Spec.App.Image)) @@ -193,6 +240,18 @@ func migrationSpecHash(gw *securityv1.Gateway) string { h.Write([]byte(strconv.FormatInt(*gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds, 10))) } + // Include the two fields NewMigrationJob also reads but that were + // previously absent from the hash. Omitting them meant that rotating the + // Gateway Secret or flipping DisklessConfig after a completed migration would + // go undetected and the operator would keep trusting the old Complete: true status. + h.Write([]byte(strconv.FormatBool(gw.Spec.App.Management.DisklessConfig.Disabled))) + + secretName := gw.Name + if gw.Spec.App.Management.SecretName != "" { + secretName = gw.Spec.App.Management.SecretName + } + h.Write([]byte(secretName)) + return fmt.Sprintf("%x", h.Sum(nil))[:16] } From d895dc4de900e4238fe296e6a8ffce3377922bea Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Tue, 30 Jun 2026 22:56:35 -0700 Subject: [PATCH 07/12] Gate on migration status instead of the GatewayMigrationJob --- internal/controller/gateway/controller.go | 18 +- pkg/gateway/reconcile/deployment.go | 9 + pkg/gateway/reconcile/migration_job.go | 13 + pkg/gateway/reconcile/migration_job_test.go | 735 ++++++++++++++++++++ 4 files changed, 771 insertions(+), 4 deletions(-) create mode 100644 pkg/gateway/reconcile/migration_job_test.go diff --git a/internal/controller/gateway/controller.go b/internal/controller/gateway/controller.go index 60544014..928d0a59 100644 --- a/internal/controller/gateway/controller.go +++ b/internal/controller/gateway/controller.go @@ -133,14 +133,16 @@ func (r *GatewayReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } start := time.Now() + migrationPending := false for _, op := range ops { if err = r.runOp(ctx, params, op); err != nil { // ErrMigrationPending is a normal "not ready yet" state, not a failure. - // Requeue at a fixed short interval so the operator polls for job - // completion without applying exponential backoff or incrementing - // failure metrics. + // The Deployment op gates itself on Status.MigrationStatus, so the rest + // of the ops (ConfigMaps, Secrets, status, etc.) keep reconciling + // normally instead of being blocked behind the migration job. if errors.Is(err, reconcile.ErrMigrationPending) { - return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + migrationPending = true + continue } _ = captureMetrics(ctx, params, start, true, op.Name) return ctrl.Result{}, err @@ -149,6 +151,14 @@ func (r *GatewayReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct _ = captureMetrics(ctx, params, start, false, "") + // Poll at a fixed short interval while migration is pending, rather than + // applying exponential backoff or waiting for the normal 12-hour cadence. + // This is a safety net alongside the Owns(&batchv1.Job{}) watch, which + // already triggers a reconcile as soon as the Job's status changes. + if migrationPending { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + return ctrl.Result{RequeueAfter: 12 * time.Hour}, nil } diff --git a/pkg/gateway/reconcile/deployment.go b/pkg/gateway/reconcile/deployment.go index a0385e5c..4b416bbe 100644 --- a/pkg/gateway/reconcile/deployment.go +++ b/pkg/gateway/reconcile/deployment.go @@ -165,6 +165,15 @@ func applyOpenShiftSecurityDefaults(ctx context.Context, params Params) { } func Deployment(ctx context.Context, params Params) error { + // Gate on migration status rather than relying on GatewayMigrationJob to + // short-circuit the reconcile loop. This lets unrelated ops (ConfigMaps, + // Secrets, status, etc.) keep reconciling while a migration is in progress, + // and only the Deployment step itself waits for Status.MigrationStatus. + if !migrationComplete(params.Instance) { + params.Log.V(2).Info("migration job has not completed for the current spec, skipping deployment reconciliation") + return nil + } + // Auto-detect and set OpenShift UID/GID if on OpenShift platform // and user hasn't explicitly set RunAsUser if params.Platform == "openshift" { diff --git a/pkg/gateway/reconcile/migration_job.go b/pkg/gateway/reconcile/migration_job.go index 26f2956f..bbceca30 100644 --- a/pkg/gateway/reconcile/migration_job.go +++ b/pkg/gateway/reconcile/migration_job.go @@ -211,6 +211,19 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return fmt.Errorf("%w: migration job is active", ErrMigrationPending) } +// migrationComplete reports whether the pre-upgrade migration job (if enabled) +// has finished for the Gateway's current spec. The Deployment step calls this +// directly so that it — not the reconcile loop's control flow — is what decides +// whether the Deployment is created/updated. When +// migration is disabled there is nothing to wait for, so it reports complete. +func migrationComplete(gw *securityv1.Gateway) bool { + if !gw.Spec.App.Management.Database.MigrationJob.Enabled || !gw.Spec.App.Management.Database.Enabled { + return true + } + status := gw.Status.MigrationStatus + return status.Complete && status.SpecHash == migrationSpecHash(gw) +} + // migrationSpecHash returns a 16-character hex hash of all Gateway spec fields // that affect the Job built by NewMigrationJob. When any of these change, the // hash changes, status is reset, and a fresh migration job is triggered. diff --git a/pkg/gateway/reconcile/migration_job_test.go b/pkg/gateway/reconcile/migration_job_test.go new file mode 100644 index 00000000..d2a85290 --- /dev/null +++ b/pkg/gateway/reconcile/migration_job_test.go @@ -0,0 +1,735 @@ +/* +* Copyright (c) 2026 Broadcom. All rights reserved. +* The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. +* All trademarks, trade names, service marks, and logos referenced +* herein belong to their respective companies. +* +* This software and all information contained therein is confidential +* and proprietary and shall not be duplicated, used, disclosed or +* disseminated in any way except as authorized by the applicable +* license agreement, without the express written permission of Broadcom. +* All authorized reproductions must be marked with this language. +* +* EXCEPT AS SET FORTH IN THE APPLICABLE LICENSE AGREEMENT, TO THE +* EXTENT PERMITTED BY APPLICABLE LAW OR AS AGREED BY BROADCOM IN ITS +* APPLICABLE LICENSE AGREEMENT, BROADCOM PROVIDES THIS DOCUMENTATION +* "AS IS" WITHOUT WARRANTY OF ANY KIND, INCLUDING WITHOUT LIMITATION, +* ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +* PURPOSE, OR. NONINFRINGEMENT. IN NO EVENT WILL BROADCOM BE LIABLE TO +* THE END USER OR ANY THIRD PARTY FOR ANY LOSS OR DAMAGE, DIRECT OR +* INDIRECT, FROM THE USE OF THIS DOCUMENTATION, INCLUDING WITHOUT LIMITATION, +* LOST PROFITS, LOST INVESTMENT, BUSINESS INTERRUPTION, GOODWILL, OR +* LOST DATA, EVEN IF BROADCOM IS EXPRESSLY ADVISED IN ADVANCE OF THE +* POSSIBILITY OF SUCH LOSS OR DAMAGE. +* +* AI assistance has been used to generate some or all contents of this file. That includes, but is not limited to, new code, modifying existing code, stylistic edits. + */ +package reconcile + +import ( + "context" + "errors" + "testing" + + securityv1 "github.com/caapim/layer7-operator/api/v1" + "github.com/caapim/layer7-operator/pkg/gateway" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// ───────────────────────────────────────────────────────────────────────────── +// Test helpers +// ───────────────────────────────────────────────────────────────────────────── + +// migrationParams builds a Params value with database and migrationJob both +// enabled. Each test passes a unique name so objects do not collide in the +// shared in-process API server. +func migrationParams(name string) Params { + p := newParams() + p.Instance.Name = name + p.Instance.Spec.App.Management.Database.Enabled = true + p.Instance.Spec.App.Management.Database.MigrationJob.Enabled = true + return p +} + +// createGateway persists the Gateway CR in the in-process Kubernetes API server. +// controller-runtime's Create mutates params.Instance in-place with the +// server-assigned ResourceVersion, which is required by later Status().Update +// calls. A cleanup function deletes the CR when the test finishes. +func createGateway(t *testing.T, params Params) { + t.Helper() + if err := k8sClient.Create(ctx, params.Instance); err != nil { + t.Fatalf("createGateway: failed to create %s: %v", params.Instance.Name, err) + } + t.Cleanup(func() { + // ignore not-found errors — some tests delete the CR themselves + _ = k8sClient.Delete(context.Background(), params.Instance) + }) +} + +// seedMigrationStatus writes a pre-configured MigrationStatus into a Gateway CR +// that already exists in the cluster. Used to set up test preconditions (e.g. +// a previously-completed migration) without calling GatewayMigrationJob. +func seedMigrationStatus(t *testing.T, params Params, ms securityv1.MigrationStatus) { + t.Helper() + params.Instance.Status.MigrationStatus = ms + if err := k8sClient.Status().Update(ctx, params.Instance); err != nil { + t.Fatalf("seedMigrationStatus: %v", err) + } +} + +// buildOrphanJob constructs a minimal valid Job that can be registered in the +// in-process API server. It is used by tests that need an in-cluster Job +// independent of gateway.NewMigrationJob (e.g. for the disabled-cleanup and +// stale-image scenarios). +func buildOrphanJob(name, namespace, image string) *batchv1.Job { + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, + Spec: batchv1.JobSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{{Name: "db-migration", Image: image}}, + }, + }, + }, + } +} + +// createJob persists a Job in the in-process API server. controller-runtime's +// Create mutates job in-place with the server-assigned ResourceVersion so that +// subsequent Status().Update calls succeed. A cleanup function deletes the Job. +func createJob(t *testing.T, job *batchv1.Job) { + t.Helper() + if err := k8sClient.Create(ctx, job); err != nil { + t.Fatalf("createJob %s: %v", job.Name, err) + } + t.Cleanup(func() { + _ = k8sClient.Delete(context.Background(), job) + }) +} + +// patchJobStatus updates a Job's status via the status subresource so tests can +// simulate Succeeded/Failed/Active counts without running a real container. +// The job argument must carry a valid ResourceVersion (i.e. returned from +// Create or Get before calling this function). +func patchJobStatus(t *testing.T, job *batchv1.Job, status batchv1.JobStatus) { + t.Helper() + job.Status = status + if err := k8sClient.Status().Update(ctx, job); err != nil { + t.Fatalf("patchJobStatus %s: %v", job.Name, err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// migrationSpecHash — pure function tests (no Kubernetes API required) +// ───────────────────────────────────────────────────────────────────────────── + +// TestMigrationSpecHash verifies that migrationSpecHash produces the correct +// output for each input. These tests cover all six fields that the function +// hashes, confirming that a change to any one of them produces a different hash +// and therefore a new migration job. +func TestMigrationSpecHash(t *testing.T) { + + t.Run("same spec always produces the same hash (determinism)", func(t *testing.T) { + // Input: identical Gateway spec, called twice. + // Output: both calls return the exact same 16-character hex string. + // Why: a non-deterministic hash would cause spurious job replacements. + p := newParams() + h1 := migrationSpecHash(p.Instance) + h2 := migrationSpecHash(p.Instance) + if h1 != h2 { + t.Errorf("hash is not deterministic: %q != %q", h1, h2) + } + if len(h1) != 16 { + t.Errorf("expected 16-char hash, got len=%d: %q", len(h1), h1) + } + }) + + t.Run("image change produces a different hash", func(t *testing.T) { + // Input: two Gateways identical except spec.app.image. + // Output: different hashes. + // Why: upgrading to a new Gateway image carries new Liquibase changesets + // that must be migrated before the Deployment proceeds. + p1 := newParams() + p1.Instance.Spec.App.Image = "gateway:11.1.1" + p2 := newParams() + p2.Instance.Spec.App.Image = "gateway:11.3.0" + if migrationSpecHash(p1.Instance) == migrationSpecHash(p2.Instance) { + t.Error("expected different hashes for different gateway images") + } + }) + + t.Run("migrationJob.jdbcUrl overrides database.jdbcUrl in the hash", func(t *testing.T) { + // Input A: database.jdbcUrl set; migrationJob.jdbcUrl not set. + // Input B: same database.jdbcUrl plus a different migrationJob.jdbcUrl. + // Output: different hashes. + // Why: migrationJob.jdbcUrl points the migration at a different database; + // the hash must capture the URL that the Job actually uses. + p1 := newParams() + p1.Instance.Spec.App.Management.Database.JDBCUrl = "jdbc:mysql://host-a/ssg" + + p2 := newParams() + p2.Instance.Spec.App.Management.Database.JDBCUrl = "jdbc:mysql://host-a/ssg" + p2.Instance.Spec.App.Management.Database.MigrationJob.JDBCUrl = "jdbc:mysql://host-b/ssg" + + if migrationSpecHash(p1.Instance) == migrationSpecHash(p2.Instance) { + t.Error("expected different hashes when migrationJob.jdbcUrl overrides database.jdbcUrl") + } + }) + + t.Run("clearLocks toggle produces a different hash", func(t *testing.T) { + // Input: two Gateways identical except migrationJob.clearLocks. + // Output: different hashes. + // Why: clearLocks changes the schema-update mode flag passed to the container + // (liquibase-only vs liquibase-only-with-unlock); the migration must be + // re-run with the new flag. + p1 := newParams() + p1.Instance.Spec.App.Management.Database.MigrationJob.ClearLocks = false + p2 := newParams() + p2.Instance.Spec.App.Management.Database.MigrationJob.ClearLocks = true + if migrationSpecHash(p1.Instance) == migrationSpecHash(p2.Instance) { + t.Error("expected different hashes for clearLocks=false vs clearLocks=true") + } + }) + + t.Run("activeDeadlineSeconds change produces a different hash", func(t *testing.T) { + // Input: one Gateway with activeDeadlineSeconds unset, one with 300. + // Output: different hashes. + // Why: changing the deadline (typically to fix a too-short timeout that caused + // a failure) should trigger a replacement job with the new deadline. + p1 := newParams() + p2 := newParams() + deadline := int64(300) + p2.Instance.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds = &deadline + if migrationSpecHash(p1.Instance) == migrationSpecHash(p2.Instance) { + t.Error("expected different hashes when activeDeadlineSeconds changes from nil to 300") + } + }) + + t.Run("DisklessConfig.Disabled toggle produces a different hash", func(t *testing.T) { + // Input: disklessConfig.disabled=false vs disklessConfig.disabled=true. + // Output: different hashes. + // Why: DisklessConfig.Disabled controls whether the Secret is mounted as + // env vars (diskless) or as a node.properties file (non-diskless). + // Flipping this after a completed migration changes the Job wiring + // entirely and must invalidate Status.Complete. + p1 := newParams() + p1.Instance.Spec.App.Management.DisklessConfig.Disabled = false + p2 := newParams() + p2.Instance.Spec.App.Management.DisklessConfig.Disabled = true + if migrationSpecHash(p1.Instance) == migrationSpecHash(p2.Instance) { + t.Error("expected different hashes for DisklessConfig.Disabled=false vs true") + } + }) + + t.Run("secretName change produces a different hash", func(t *testing.T) { + // Input: one Gateway using the default secret name (== gateway name), one + // with an explicit spec.app.management.secretName. + // Output: different hashes. + // Why: secretName determines which Kubernetes Secret the Job reads for + // database credentials. Rotating to a new Secret must invalidate the + // cached Complete=true status so the operator re-runs the migration + // with the new credentials. + p1 := newParams() + p1.Instance.Spec.App.Management.SecretName = "" + p2 := newParams() + p2.Instance.Spec.App.Management.SecretName = "my-custom-db-secret" + if migrationSpecHash(p1.Instance) == migrationSpecHash(p2.Instance) { + t.Error("expected different hashes when secretName changes") + } + }) +} + +// ───────────────────────────────────────────────────────────────────────────── +// MigrationJobName — pure function test (no Kubernetes API required) +// ───────────────────────────────────────────────────────────────────────────── + +// TestMigrationJobName confirms the canonical naming convention for migration +// Jobs. Centralising this in a helper (gateway.MigrationJobName) ensures the +// cleanup branch, the spec-change branch, and NewMigrationJob all agree on the +// Job name. +func TestMigrationJobName(t *testing.T) { + // Input: Gateway with name "my-gateway". + // Output: "my-gateway-db-migration". + p := newParams() + p.Instance.Name = "my-gateway" + got := gateway.MigrationJobName(p.Instance) + want := "my-gateway-db-migration" + if got != want { + t.Errorf("MigrationJobName: got %q, want %q", got, want) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// GatewayMigrationJob — integration tests using the in-process Kubernetes API +// +// The in-process API server (envtest) provides a real Kubernetes control plane +// without needing an external cluster. Jobs and Gateway CRs are persisted in +// etcd and retrieved via the same client the production code uses, giving +// high confidence that the reconcile logic behaves correctly in a real cluster. +// +// Because no Job controller is running, Job pod counts (Active, Succeeded, +// Failed) are set manually via the status subresource to simulate each +// lifecycle stage. +// ───────────────────────────────────────────────────────────────────────────── + +func TestGatewayMigrationJob(t *testing.T) { + + // ── disabled: migration feature off ────────────────────────────────────── + + t.Run("migration disabled: returns nil without creating a Job", func(t *testing.T) { + // Input: Gateway with migrationJob.enabled=false and database.enabled=true. + // Output: GatewayMigrationJob returns nil (no blocking, no work done). + // No Job object is created in the cluster. + p := newParams() + p.Instance.Name = "mj-disabled" + p.Instance.Spec.App.Management.Database.Enabled = true + p.Instance.Spec.App.Management.Database.MigrationJob.Enabled = false + + err := GatewayMigrationJob(ctx, p) + + if err != nil { + t.Fatalf("expected nil for disabled migration, got: %v", err) + } + job := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: gateway.MigrationJobName(p.Instance), Namespace: p.Instance.Namespace, + }, job) + if !k8serrors.IsNotFound(getErr) { + t.Errorf("expected no Job in cluster when migration disabled; get returned: %v", getErr) + } + }) + + t.Run("database disabled: returns nil without creating a Job", func(t *testing.T) { + // Input: migrationJob.enabled=true but database.enabled=false. + // (The webhook prevents this in practice; the reconciler guards defensively.) + // Output: GatewayMigrationJob returns nil and creates no Job. + p := newParams() + p.Instance.Name = "mj-db-disabled" + p.Instance.Spec.App.Management.Database.Enabled = false + p.Instance.Spec.App.Management.Database.MigrationJob.Enabled = true + + err := GatewayMigrationJob(ctx, p) + + if err != nil { + t.Fatalf("expected nil when database disabled, got: %v", err) + } + job := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: gateway.MigrationJobName(p.Instance), Namespace: p.Instance.Namespace, + }, job) + if !k8serrors.IsNotFound(getErr) { + t.Errorf("expected no Job in cluster when database disabled; get returned: %v", getErr) + } + }) + + // ── first enable ───────────────────────────────────────────────────────── + + t.Run("first enable: Job created, SpecHash written, ErrMigrationPending returned", func(t *testing.T) { + // Input: Gateway with migration freshly enabled; empty Status.MigrationStatus; + // no existing Job in the cluster. + // Output: + // - GatewayMigrationJob returns an error wrapping ErrMigrationPending. + // (ErrMigrationPending is a sentinel: "not done yet, check back in 10s". + // It is not a real failure — the controller requeues without backoff.) + // - A Job named "-db-migration" exists in the cluster. + // - Status.MigrationStatus.SpecHash is non-empty (hash persisted to etcd). + // - Status.MigrationStatus.Complete is false (migration not yet finished). + p := migrationParams("mj-first-enable") + createGateway(t, p) + + err := GatewayMigrationJob(ctx, p) + + if !errors.Is(err, ErrMigrationPending) { + t.Fatalf("expected ErrMigrationPending on first enable, got: %v", err) + } + + // Verify the Job was created. + job := &batchv1.Job{} + if getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: gateway.MigrationJobName(p.Instance), Namespace: p.Instance.Namespace, + }, job); getErr != nil { + t.Fatalf("expected Job to exist after first enable, got: %v", getErr) + } + + // Verify Status.MigrationStatus was written. + if p.Instance.Status.MigrationStatus.SpecHash == "" { + t.Error("expected SpecHash to be set in status after first enable") + } + if p.Instance.Status.MigrationStatus.Complete { + t.Error("expected Complete=false — migration job was just created, not yet finished") + } + }) + + // ── fast path ──────────────────────────────────────────────────────────── + + t.Run("fast path: Complete=true for current spec returns nil without touching the cluster", func(t *testing.T) { + // Input: Gateway where Status.MigrationStatus already records Complete=true + // for the exact current spec hash. + // Output: GatewayMigrationJob returns nil immediately. + // No Job is created or modified (the 12-hour reconcile no-op path). + // Why this matters: without this fast path, the 12-hour requeue would create + // a new Job every time the completed Job was GC'd by Kubernetes. + p := migrationParams("mj-fast-path") + createGateway(t, p) + + // Pre-seed status to simulate a previously completed migration. + desiredHash := migrationSpecHash(p.Instance) + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: desiredHash, + Complete: true, + }) + + err := GatewayMigrationJob(ctx, p) + + if err != nil { + t.Fatalf("expected nil from fast path (Complete=true), got: %v", err) + } + + // Verify no Job was created. + job := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: gateway.MigrationJobName(p.Instance), Namespace: p.Instance.Namespace, + }, job) + if !k8serrors.IsNotFound(getErr) { + t.Errorf("fast path must not create a Job; found one in cluster") + } + }) + + // ── job lifecycle: running ──────────────────────────────────────────────── + + t.Run("active job: returns ErrMigrationPending without modifying the Job", func(t *testing.T) { + // Input: Gateway with an existing Job that reports Active=1 (pod is running). + // Status.MigrationStatus.SpecHash matches the current desired hash. + // Output: GatewayMigrationJob returns ErrMigrationPending. + // The existing Job is neither deleted nor recreated. + p := migrationParams("mj-active-job") + createGateway(t, p) + + // Create the Job directly so we control its spec and status independently. + job := gateway.NewMigrationJob(p.Instance) + createJob(t, job) + // Simulate a running pod: patch Job status to Active=1. + patchJobStatus(t, job, batchv1.JobStatus{Active: 1}) + + // Pre-seed status with the current hash (as the first-enable path would have). + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: migrationSpecHash(p.Instance), + }) + + err := GatewayMigrationJob(ctx, p) + + if !errors.Is(err, ErrMigrationPending) { + t.Fatalf("expected ErrMigrationPending while job is active, got: %v", err) + } + + // Job must still exist and be unmodified. + got := &batchv1.Job{} + if getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: job.Name, Namespace: job.Namespace, + }, got); getErr != nil { + t.Fatalf("Job must still exist while active: %v", getErr) + } + }) + + // ── job lifecycle: succeeded ────────────────────────────────────────────── + + t.Run("succeeded job: writes Complete=true to status and returns nil", func(t *testing.T) { + // Input: Gateway whose existing Job reports Succeeded=1 (all changesets applied). + // Output: + // - GatewayMigrationJob returns nil (Deployment step is now unblocked). + // - Status.MigrationStatus.Complete is true (persisted to etcd). + // - Status.MigrationStatus.SpecHash still matches the current spec hash. + // Why: the Complete=true flag is what allows all future 12-hour reconciles + // to skip migration management entirely via the fast path. + p := migrationParams("mj-succeeded") + createGateway(t, p) + + // First call: creates the Job and writes SpecHash to status. + if firstErr := GatewayMigrationJob(ctx, p); !errors.Is(firstErr, ErrMigrationPending) { + t.Fatalf("first call should return ErrMigrationPending, got: %v", firstErr) + } + + // Retrieve the created Job and simulate success by patching its status. + job := &batchv1.Job{} + if getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: gateway.MigrationJobName(p.Instance), Namespace: p.Instance.Namespace, + }, job); getErr != nil { + t.Fatalf("Job must exist after first call: %v", getErr) + } + patchJobStatus(t, job, batchv1.JobStatus{Succeeded: 1}) + + // Second call: Job is succeeded — should write Complete=true and return nil. + err := GatewayMigrationJob(ctx, p) + + if err != nil { + t.Fatalf("expected nil after Job succeeded, got: %v", err) + } + if !p.Instance.Status.MigrationStatus.Complete { + t.Error("expected Status.MigrationStatus.Complete=true after Job succeeded") + } + wantHash := migrationSpecHash(p.Instance) + if p.Instance.Status.MigrationStatus.SpecHash != wantHash { + t.Errorf("expected SpecHash=%q, got %q", wantHash, p.Instance.Status.MigrationStatus.SpecHash) + } + }) + + // ── job lifecycle: failed ───────────────────────────────────────────────── + + t.Run("failed job: returns a blocking error (not ErrMigrationPending)", func(t *testing.T) { + // Input: Gateway whose existing Job reports Active=0, Failed=1 + // (both pod attempts exhausted; backoffLimit=1 reached). + // Output: GatewayMigrationJob returns a non-nil error that does NOT wrap + // ErrMigrationPending. The controller applies backoff and metrics. + // The Job is NOT deleted — the user must fix the root cause + // (restore DB to pre-upgrade state if partially migrated, fix + // credentials, etc.) and then delete the Job manually to retry. + p := migrationParams("mj-failed") + createGateway(t, p) + + job := gateway.NewMigrationJob(p.Instance) + createJob(t, job) + patchJobStatus(t, job, batchv1.JobStatus{Active: 0, Failed: 1}) + + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: migrationSpecHash(p.Instance), + }) + + err := GatewayMigrationJob(ctx, p) + + // Must be a real error (not the "pending" sentinel). + if err == nil { + t.Fatal("expected a blocking error for failed job, got nil") + } + if errors.Is(err, ErrMigrationPending) { + t.Fatalf("expected a real blocking error, not ErrMigrationPending — got: %v", err) + } + + // Job must still exist (user must delete it manually to retry). + got := &batchv1.Job{} + if getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: job.Name, Namespace: job.Namespace, + }, got); getErr != nil { + t.Fatalf("failed Job must not be auto-deleted (manual intervention required): %v", getErr) + } + }) + + // ── spec change ─────────────────────────────────────────────────────────── + + t.Run("spec changed: old Job deleted, status reset to new hash, ErrMigrationPending returned", func(t *testing.T) { + // Input: Gateway previously migrated with image v1 (Complete=true, SpecHash=H1). + // User changes spec.app.image to v2, making desiredHash=H2 != H1. + // Output: + // - The old Job is deleted from the cluster. + // - Status.MigrationStatus is reset to {SpecHash: H2, Complete: false}. + // - GatewayMigrationJob returns ErrMigrationPending so the controller + // requeues in 10s, at which point it will create the new Job. + // Why the delete-before-status-write ordering matters: if the status were + // written first and the deletion then failed, the new hash would already be + // persisted, making this branch unreachable on the next reconcile — a silent, + // permanent skip of the required migration. + p := migrationParams("mj-spec-change") + p.Instance.Spec.App.Image = "gateway:11.1.1" + createGateway(t, p) + + hashV1 := migrationSpecHash(p.Instance) + + // Seed status as if migration with image v1 already completed. + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: hashV1, + Complete: true, + }) + + // Create an old Job representing the completed v1 migration run. + oldJob := gateway.NewMigrationJob(p.Instance) + createJob(t, oldJob) + + // Simulate an image upgrade — this changes the desired spec hash. + p.Instance.Spec.App.Image = "gateway:11.3.0" + + err := GatewayMigrationJob(ctx, p) + + if !errors.Is(err, ErrMigrationPending) { + t.Fatalf("expected ErrMigrationPending after spec change, got: %v", err) + } + + // Old Job must be gone (Background deletion removes it from etcd immediately). + oldJobCheck := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: oldJob.Name, Namespace: oldJob.Namespace, + }, oldJobCheck) + if !k8serrors.IsNotFound(getErr) { + t.Errorf("expected old Job to be deleted after spec change; still exists") + } + + // Status must be reset to the new hash with Complete=false. + hashV2 := migrationSpecHash(p.Instance) + if p.Instance.Status.MigrationStatus.SpecHash != hashV2 { + t.Errorf("SpecHash: got %q, want %q", p.Instance.Status.MigrationStatus.SpecHash, hashV2) + } + if p.Instance.Status.MigrationStatus.Complete { + t.Error("Complete must be false after spec change — migration not yet run for new spec") + } + }) + + // ── disabled cleanup ────────────────────────────────────────────────────── + + t.Run("disabled cleanup: orphaned Job is deleted when migration toggled off", func(t *testing.T) { + // Input: A Job already exists in the cluster (left from a previous run). + // User sets migrationJob.enabled=false. + // Output: GatewayMigrationJob returns nil and the orphaned Job is deleted. + // Why: toggling migration off should not leave stale Job objects in the + // namespace that could confuse operators or trigger alert rules. + p := newParams() + p.Instance.Name = "mj-disabled-cleanup" + p.Instance.Spec.App.Management.Database.Enabled = true + + // Create the orphaned Job directly — simulates a job left behind from a + // previous enabled run using the same Gateway name. + orphanJob := buildOrphanJob( + gateway.MigrationJobName(p.Instance), + p.Instance.Namespace, + "gateway:11.1.1", + ) + createJob(t, orphanJob) + + // Now disable migration. + p.Instance.Spec.App.Management.Database.MigrationJob.Enabled = false + + err := GatewayMigrationJob(ctx, p) + + if err != nil { + t.Fatalf("expected nil when migration disabled, got: %v", err) + } + got := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: orphanJob.Name, Namespace: orphanJob.Namespace, + }, got) + if !k8serrors.IsNotFound(getErr) { + t.Errorf("expected orphaned Job to be deleted when migration disabled; still exists") + } + }) + + // ── stale / malformed job guard ─────────────────────────────────────────── + + t.Run("stale image job: Job with wrong image is replaced, ErrMigrationPending returned", func(t *testing.T) { + // Input: An existing Job was built with image "old-image:1.0" but the + // current Gateway spec still carries the same spec hash (the image + // change somehow escaped hash detection). The live Job's image does + // not match the image in the freshly-built desired Job. + // Output: The stale Job is deleted and ErrMigrationPending is returned so + // the next reconcile creates a valid replacement. + // This guard also covers the zero-container case (a Job whose containers + // were cleared by a manual edit or a mutating webhook after creation — + // something the API server admission webhook would not catch post-creation). + p := migrationParams("mj-stale-image") + createGateway(t, p) + + // Create a Job with a deliberately wrong image. + staleJob := buildOrphanJob( + gateway.MigrationJobName(p.Instance), + p.Instance.Namespace, + "old-image:1.0", // does not match p.Instance.Spec.App.Image ("test") + ) + createJob(t, staleJob) + + // Pre-seed status with the current hash so we reach the live-object check. + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: migrationSpecHash(p.Instance), + }) + + err := GatewayMigrationJob(ctx, p) + + if !errors.Is(err, ErrMigrationPending) { + t.Fatalf("expected ErrMigrationPending when replacing stale-image job, got: %v", err) + } + got := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: staleJob.Name, Namespace: staleJob.Namespace, + }, got) + if !k8serrors.IsNotFound(getErr) { + t.Errorf("expected stale Job to be deleted; still exists") + } + }) + + // ── DeletionTimestamp guard ─────────────────────────────────────────────── + + t.Run("DeletionTimestamp guard: does not read stale status from a mid-deletion Job", func(t *testing.T) { + // Input: An existing Job has DeletionTimestamp set (Kubernetes accepted the + // delete request but the object is still visible because a finalizer + // is preventing immediate removal). The Job's status still shows + // Succeeded=1 from the previous run — stale data. + // Output: GatewayMigrationJob returns ErrMigrationPending and does NOT write + // Complete=true to Status.MigrationStatus. + // Why: Background deletion removes the Job from etcd immediately, but a + // brief informer-cache lag or webhook delay can deliver a stale cached + // object. Without the DeletionTimestamp guard in GatewayMigrationJob, + // the operator would trust the old Succeeded count and falsely mark + // the new spec as migrated. + p := migrationParams("mj-deletion-ts") + createGateway(t, p) + + // ── Setup: engineer a Job that has DeletionTimestamp set but Succeeded=1 ── + // + // Add a finalizer so that Delete() sets DeletionTimestamp without immediately + // removing the object — simulating the mid-teardown window. + job := gateway.NewMigrationJob(p.Instance) + job.Finalizers = []string{"test/keep-alive"} + createJob(t, job) + + // Patch Succeeded=1 so the Job looks like it completed a previous run. + // This is the stale status the guard must ignore. + patchJobStatus(t, job, batchv1.JobStatus{Succeeded: 1}) + + // Delete the Job — Kubernetes sets DeletionTimestamp but the finalizer + // keeps the object alive, so GatewayMigrationJob's Get still returns it. + if delErr := k8sClient.Delete(ctx, job); delErr != nil { + t.Fatalf("failed to delete Job to set DeletionTimestamp: %v", delErr) + } + + // Pre-seed the spec hash so the reconcile reaches the live-job inspection + // (not short-circuited by the fast path or spec-change branches). + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: migrationSpecHash(p.Instance), + }) + + // ── Exercise ────────────────────────────────────────────────────────────── + // + // This simulates reconcile #2 after a spec change. The operator deleted the + // old Job in reconcile #1 (Background propagation), but due to an + // informer-cache lag the Get here still returns the old object — now with + // DeletionTimestamp set and a stale Succeeded=1 from the previous run. + // GatewayMigrationJob must detect this and not trust the stale status. + err := GatewayMigrationJob(ctx, p) + + // ── Assert: the DeletionTimestamp guard fires ───────────────────────────── + // + // The guard is the check inside GatewayMigrationJob that inspects + // currentJob.DeletionTimestamp after the Get. It must return + // ErrMigrationPending instead of falling through to the Succeeded check. + if !errors.Is(err, ErrMigrationPending) { + t.Fatalf("DeletionTimestamp guard must return ErrMigrationPending, got: %v", err) + } + + // The guard must also prevent setMigrationStatus from being called with + // Complete=true — the stale Succeeded=1 count must not be trusted. + if p.Instance.Status.MigrationStatus.Complete { + t.Error("DeletionTimestamp guard must not write Complete=true from a mid-deletion Job") + } + + // Remove the finalizer so the Job is actually deleted when the test ends. + t.Cleanup(func() { + j := &batchv1.Job{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{ + Name: job.Name, Namespace: job.Namespace, + }, j); err == nil { + j.Finalizers = nil + _ = k8sClient.Update(context.Background(), j) + } + }) + }) +} From 3842db23c093d9d62f91f7a30c5f03e1eca7fd97 Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Tue, 30 Jun 2026 23:03:20 -0700 Subject: [PATCH 08/12] Remove extra comments. --- pkg/gateway/migration_job.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/gateway/migration_job.go b/pkg/gateway/migration_job.go index 211a132c..80d42b51 100644 --- a/pkg/gateway/migration_job.go +++ b/pkg/gateway/migration_job.go @@ -115,7 +115,7 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { }) } } else { - // Fix #5: optional must be false to match the main Gateway deployment. + // optional must be false to match the main Gateway deployment. // With optional: true, Kubernetes schedules the pod even when the Secret // is missing, causing a silent empty mount and a confusing DB connection // error instead of a clear "secret not found" Kubernetes event. @@ -154,7 +154,7 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { activeDeadlineSeconds = *gw.Spec.App.Management.Database.MigrationJob.ActiveDeadlineSeconds } - // Fix #9: apply the same ServiceAccount fallback logic as the main deployment + // apply the same ServiceAccount fallback logic as the main deployment // so the migration pod runs under the same identity as Gateway pods. // Without this, pods default to the namespace "default" service account when // spec.app.serviceAccount.name is not explicitly set, which can fail in clusters From 323a6e61a53fbea54d902ee92d1e118b1d9469c4 Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Tue, 30 Jun 2026 23:23:06 -0700 Subject: [PATCH 09/12] Fix bug. --- pkg/gateway/reconcile/migration_job_test.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/gateway/reconcile/migration_job_test.go b/pkg/gateway/reconcile/migration_job_test.go index d2a85290..8f0e4fde 100644 --- a/pkg/gateway/reconcile/migration_job_test.go +++ b/pkg/gateway/reconcile/migration_job_test.go @@ -548,8 +548,17 @@ func TestGatewayMigrationJob(t *testing.T) { oldJob := gateway.NewMigrationJob(p.Instance) createJob(t, oldJob) - // Simulate an image upgrade — this changes the desired spec hash. + // Simulate an image upgrade and persist it, mirroring a real "kubectl + // apply": the reconciler only ever sees a spec change once it's already + // stored server-side. Without this Update, setMigrationStatus's + // Status().Update() call round-trips through the API server and + // overwrites our in-memory (never-persisted) Image field with the + // original "11.1.1", making the post-call hash recompute below use the + // wrong spec. p.Instance.Spec.App.Image = "gateway:11.3.0" + if err := k8sClient.Update(ctx, p.Instance); err != nil { + t.Fatalf("failed to persist image upgrade: %v", err) + } err := GatewayMigrationJob(ctx, p) From 929890a415fcfbef0fff6eec2f9ae82fe462ae7a Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Thu, 2 Jul 2026 15:07:02 -0700 Subject: [PATCH 10/12] Added control-plane related End2End tests for the database migration. --- .../00-assert.yaml | 35 ++++++++++++ .../00-install.yaml | 45 +++++++++++++++ .../gateway-migration-disabled/00-assert.yaml | 34 ++++++++++++ .../00-install.yaml | 55 +++++++++++++++++++ .../gateway-migration-disabled/01-assert.yaml | 12 ++++ .../01-install.yaml | 39 +++++++++++++ .../gateway-migration-disabled/02-assert.yaml | 30 ++++++++++ .../02-install.yaml | 39 +++++++++++++ .../gateway-migration-jdbcurl/00-assert.yaml | 40 ++++++++++++++ .../gateway-migration-jdbcurl/00-install.yaml | 48 ++++++++++++++++ .../e2e/gateway-migration-job/00-assert.yaml | 50 +++++++++++++++++ .../e2e/gateway-migration-job/00-install.yaml | 45 +++++++++++++++ .../e2e/gateway-migration-job/01-assert.yaml | 45 +++++++++++++++ 13 files changed, 517 insertions(+) create mode 100644 tests/e2e/gateway-migration-clearlocks/00-assert.yaml create mode 100644 tests/e2e/gateway-migration-clearlocks/00-install.yaml create mode 100644 tests/e2e/gateway-migration-disabled/00-assert.yaml create mode 100644 tests/e2e/gateway-migration-disabled/00-install.yaml create mode 100644 tests/e2e/gateway-migration-disabled/01-assert.yaml create mode 100644 tests/e2e/gateway-migration-disabled/01-install.yaml create mode 100644 tests/e2e/gateway-migration-disabled/02-assert.yaml create mode 100644 tests/e2e/gateway-migration-disabled/02-install.yaml create mode 100644 tests/e2e/gateway-migration-jdbcurl/00-assert.yaml create mode 100644 tests/e2e/gateway-migration-jdbcurl/00-install.yaml create mode 100644 tests/e2e/gateway-migration-job/00-assert.yaml create mode 100644 tests/e2e/gateway-migration-job/00-install.yaml create mode 100644 tests/e2e/gateway-migration-job/01-assert.yaml diff --git a/tests/e2e/gateway-migration-clearlocks/00-assert.yaml b/tests/e2e/gateway-migration-clearlocks/00-assert.yaml new file mode 100644 index 00000000..c9fb3081 --- /dev/null +++ b/tests/e2e/gateway-migration-clearlocks/00-assert.yaml @@ -0,0 +1,35 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 60 +--- +# Assert: when clearLocks=true, EXTRA_JAVA_ARGS uses "liquibase-only-with-unlock" +# instead of the default "liquibase-only". +# +# The distinction matters: "liquibase-only-with-unlock" calls +# LiquibaseUtil.forceReleaseLocks() +# before applying changesets. Without this flag a stuck +# DATABASECHANGELOGLOCK causes the migration Job to hang until +# activeDeadlineSeconds is reached, blocking the Gateway upgrade. +apiVersion: batch/v1 +kind: Job +metadata: + name: ssg-migration-clearlocks-db-migration +spec: + backoffLimit: 1 + template: + spec: + restartPolicy: Never + containers: + - name: db-migration + image: docker.io/caapim/gateway:11.2.1 + envFrom: + - configMapRef: + name: ssg-migration-clearlocks + - secretRef: + name: ssg-migration-clearlocks + env: + - name: EXTRA_JAVA_ARGS + # liquibase-only-with-unlock — lock is forcibly released before changesets run. + # Contrast with the gateway-migration-job test which asserts "liquibase-only" + # (no lock release) when clearLocks=false. + value: "-Dgateway.db.schema-update.mode=liquibase-only-with-unlock" diff --git a/tests/e2e/gateway-migration-clearlocks/00-install.yaml b/tests/e2e/gateway-migration-clearlocks/00-install.yaml new file mode 100644 index 00000000..33ccb43b --- /dev/null +++ b/tests/e2e/gateway-migration-clearlocks/00-install.yaml @@ -0,0 +1,45 @@ +# Test: clearLocks=true — Job uses liquibase-only-with-unlock mode +# +# Scenario: a previous migration was interrupted and left the Liquibase +# DATABASECHANGELOGLOCK table locked. The operator re-running normal +# liquibase-only mode would stall immediately waiting for the lock. Setting +# clearLocks=true makes the migration Job forcibly release the lock before +# applying changesets, unblocking the upgrade. +# +# Expected behaviour: +# EXTRA_JAVA_ARGS is set to "-Dgateway.db.schema-update.mode=liquibase-only-with-unlock" +# instead of the default "liquibase-only". +apiVersion: security.brcmlabs.com/v1 +kind: Gateway +metadata: + name: ssg-migration-clearlocks +spec: + version: "11.2.1" + license: + accept: true + secretName: gateway-license + app: + replicas: 1 + image: docker.io/caapim/gateway:11.2.1 + imagePullPolicy: IfNotPresent + management: + cluster: + hostname: gateway.brcmlabs.com + username: admin + password: 7layer + database: + enabled: true + jdbcUrl: jdbc:mysql://no-mysql.local:3306/ssg + migrationJob: + enabled: true + clearLocks: true # ← force lock release before running changesets + service: + enabled: true + type: ClusterIP + ports: + - name: https + port: 8443 + targetPort: 8443 + - name: management + port: 9443 + targetPort: 9443 diff --git a/tests/e2e/gateway-migration-disabled/00-assert.yaml b/tests/e2e/gateway-migration-disabled/00-assert.yaml new file mode 100644 index 00000000..ad7601da --- /dev/null +++ b/tests/e2e/gateway-migration-disabled/00-assert.yaml @@ -0,0 +1,34 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +# 60 s is enough for the operator to reconcile and create the Deployment. +# We only assert the Deployment spec (not pod readiness), so we do not need +# to wait for the Gateway JVM to start against the fake database. +timeout: 60 +commands: + - script: | + # Assert: no migration Job was ever created. YAML subset matching can + # only assert what exists — it cannot express "this resource is + # absent" — so a command script is required here. Without this check, + # a regression that created a Job despite migrationJob.enabled=false + # would pass this test silently. + if kubectl get job ssg-migration-disabled-db-migration -n "$NAMESPACE" >/dev/null 2>&1; then + echo "FAIL: Job must not exist when migrationJob.enabled=false" + exit 1 + fi + + echo "PASS: no migration Job exists" +--- +# Assert: the Deployment was created with the correct image. +# The key observable here is that the Deployment EXISTS at all — if migration +# were enabled and blocking the rollout, the operator would return +# ErrMigrationPending and never reach the Deployment reconcile step. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ssg-migration-disabled +spec: + template: + spec: + containers: + - name: gateway + image: docker.io/caapim/gateway:11.2.1 diff --git a/tests/e2e/gateway-migration-disabled/00-install.yaml b/tests/e2e/gateway-migration-disabled/00-install.yaml new file mode 100644 index 00000000..ce8263e6 --- /dev/null +++ b/tests/e2e/gateway-migration-disabled/00-install.yaml @@ -0,0 +1,55 @@ +# Test: migration disabled — Deployment must be created without a Job, +# and toggling migrationJob off after it was on must clean up the Job. +# +# This is a 3-step test: +# 00: migrationJob.enabled=false from the very first apply. +# -> Deployment created, no Job ever exists. +# 01: migrationJob.enabled=true. +# -> Job created (exercises the enable path so step 02 has something +# real to clean up, rather than a no-op delete of a Job that was +# never there). +# 02: migrationJob.enabled=false again. +# -> Job is deleted (the orphan-cleanup path in +# GatewayMigrationJob's "!migrationEnabled" branch) and the +# Deployment keeps reconciling normally. +# +# Scenario: database.enabled=true so the operator has a full database config, +# but migrationJob.enabled=false. The operator must skip migration entirely +# (deleting any stale Job if one exists) and proceed to create the Deployment +# without waiting for any migration to complete. +# +# The fake jdbcUrl (no-mysql.local) means the Gateway pods will fail to start, +# but this test only asserts the Deployment/Job object specs — not pod readiness. +apiVersion: security.brcmlabs.com/v1 +kind: Gateway +metadata: + name: ssg-migration-disabled +spec: + version: "11.2.1" + license: + accept: true + secretName: gateway-license + app: + replicas: 1 + image: docker.io/caapim/gateway:11.2.1 + imagePullPolicy: IfNotPresent + management: + cluster: + hostname: gateway.brcmlabs.com + username: admin + password: 7layer + database: + enabled: true + jdbcUrl: jdbc:mysql://no-mysql.local:3306/ssg + migrationJob: + enabled: false # ← migration explicitly off + service: + enabled: true + type: ClusterIP + ports: + - name: https + port: 8443 + targetPort: 8443 + - name: management + port: 9443 + targetPort: 9443 diff --git a/tests/e2e/gateway-migration-disabled/01-assert.yaml b/tests/e2e/gateway-migration-disabled/01-assert.yaml new file mode 100644 index 00000000..a757305c --- /dev/null +++ b/tests/e2e/gateway-migration-disabled/01-assert.yaml @@ -0,0 +1,12 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 60 +--- +# Assert: enabling migrationJob creates the Job. Only the name/existence is +# checked here — the Job's spec fields are already covered in detail by +# tests/e2e/gateway-migration-job. This step only needs a real Job to exist +# so step 02 can prove it gets cleaned up. +apiVersion: batch/v1 +kind: Job +metadata: + name: ssg-migration-disabled-db-migration diff --git a/tests/e2e/gateway-migration-disabled/01-install.yaml b/tests/e2e/gateway-migration-disabled/01-install.yaml new file mode 100644 index 00000000..ca8a25b4 --- /dev/null +++ b/tests/e2e/gateway-migration-disabled/01-install.yaml @@ -0,0 +1,39 @@ +# Step 01: flip migrationJob.enabled to true on the same Gateway. +# +# This exists purely to give step 02 a real Job to clean up — testing the +# disable path against a Gateway that never had a Job (step 00) does not +# exercise GatewayMigrationJob's "!migrationEnabled" orphan-cleanup branch +# at all, since there is nothing to delete. +apiVersion: security.brcmlabs.com/v1 +kind: Gateway +metadata: + name: ssg-migration-disabled +spec: + version: "11.2.1" + license: + accept: true + secretName: gateway-license + app: + replicas: 1 + image: docker.io/caapim/gateway:11.2.1 + imagePullPolicy: IfNotPresent + management: + cluster: + hostname: gateway.brcmlabs.com + username: admin + password: 7layer + database: + enabled: true + jdbcUrl: jdbc:mysql://no-mysql.local:3306/ssg + migrationJob: + enabled: true # ← toggled on + service: + enabled: true + type: ClusterIP + ports: + - name: https + port: 8443 + targetPort: 8443 + - name: management + port: 9443 + targetPort: 9443 diff --git a/tests/e2e/gateway-migration-disabled/02-assert.yaml b/tests/e2e/gateway-migration-disabled/02-assert.yaml new file mode 100644 index 00000000..ca03f957 --- /dev/null +++ b/tests/e2e/gateway-migration-disabled/02-assert.yaml @@ -0,0 +1,30 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 60 +commands: + - script: | + # Assert: the Job created in step 01 has been deleted now that + # migrationJob.enabled=false. Without this check, a regression that + # left the Job orphaned when migration is disabled after having been + # enabled would pass silently — this is the exact scenario the + # "!migrationEnabled" cleanup branch in GatewayMigrationJob exists for. + if kubectl get job ssg-migration-disabled-db-migration -n "$NAMESPACE" >/dev/null 2>&1; then + echo "FAIL: Job should have been deleted after migrationJob.enabled was toggled back to false" + exit 1 + fi + + echo "PASS: orphaned Job was cleaned up" +--- +# Assert: the Deployment keeps reconciling normally after the toggle — +# disabling migration does not disturb the Deployment already created in +# step 00. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ssg-migration-disabled +spec: + template: + spec: + containers: + - name: gateway + image: docker.io/caapim/gateway:11.2.1 diff --git a/tests/e2e/gateway-migration-disabled/02-install.yaml b/tests/e2e/gateway-migration-disabled/02-install.yaml new file mode 100644 index 00000000..181be089 --- /dev/null +++ b/tests/e2e/gateway-migration-disabled/02-install.yaml @@ -0,0 +1,39 @@ +# Step 02: flip migrationJob.enabled back to false. +# +# This exercises GatewayMigrationJob's orphan-cleanup branch +# (pkg/gateway/reconcile/migration_job.go: "!migrationEnabled") against a +# Job that actually exists (created in step 01) — the scenario the branch's +# comment describes: "toggling enabled→false doesn't orphan a job". +apiVersion: security.brcmlabs.com/v1 +kind: Gateway +metadata: + name: ssg-migration-disabled +spec: + version: "11.2.1" + license: + accept: true + secretName: gateway-license + app: + replicas: 1 + image: docker.io/caapim/gateway:11.2.1 + imagePullPolicy: IfNotPresent + management: + cluster: + hostname: gateway.brcmlabs.com + username: admin + password: 7layer + database: + enabled: true + jdbcUrl: jdbc:mysql://no-mysql.local:3306/ssg + migrationJob: + enabled: false # ← toggled back off + service: + enabled: true + type: ClusterIP + ports: + - name: https + port: 8443 + targetPort: 8443 + - name: management + port: 9443 + targetPort: 9443 diff --git a/tests/e2e/gateway-migration-jdbcurl/00-assert.yaml b/tests/e2e/gateway-migration-jdbcurl/00-assert.yaml new file mode 100644 index 00000000..e0d38ed9 --- /dev/null +++ b/tests/e2e/gateway-migration-jdbcurl/00-assert.yaml @@ -0,0 +1,40 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 60 +--- +# Assert: when migrationJob.jdbcUrl is set, the Job container has +# SSG_DATABASE_JDBC_URL in its env, overriding the main database.jdbcUrl. +# +# Key fields verified: +# env[EXTRA_JAVA_ARGS] — liquibase-only (clearLocks=false, the default) +# env[SSG_DATABASE_JDBC_URL] — the override URL with "?createDatabaseIfNotExist=false" +# appended. The suffix ensures a typo in the URL +# fails fast instead of silently creating a new DB. +# +# The proxy URL (jdbc:mysql://no-mysql-proxy.local:3306/ssg) must NOT appear in +# the Job's SSG_DATABASE_JDBC_URL — only the direct primary writer URL should. +apiVersion: batch/v1 +kind: Job +metadata: + name: ssg-migration-jdbcurl-db-migration +spec: + backoffLimit: 1 + template: + spec: + restartPolicy: Never + containers: + - name: db-migration + image: docker.io/caapim/gateway:11.2.1 + envFrom: + - configMapRef: + name: ssg-migration-jdbcurl + - secretRef: + name: ssg-migration-jdbcurl + env: + - name: EXTRA_JAVA_ARGS + value: "-Dgateway.db.schema-update.mode=liquibase-only" + - name: SSG_DATABASE_JDBC_URL + # The override URL with createDatabaseIfNotExist=false appended. + # NewMigrationJob adds this suffix when the URL has no '?' yet, + # so "jdbc:mysql://no-mysql-primary.local:3306/ssg" becomes this value. + value: "jdbc:mysql://no-mysql-primary.local:3306/ssg?createDatabaseIfNotExist=false" diff --git a/tests/e2e/gateway-migration-jdbcurl/00-install.yaml b/tests/e2e/gateway-migration-jdbcurl/00-install.yaml new file mode 100644 index 00000000..1a2bfc06 --- /dev/null +++ b/tests/e2e/gateway-migration-jdbcurl/00-install.yaml @@ -0,0 +1,48 @@ +# Test: migrationJob.jdbcUrl override — Job gets SSG_DATABASE_JDBC_URL env var +# +# Scenario: database.jdbcUrl points at a load-balancing proxy (common with AWS RDS +# Proxy or ProxySQL). Liquibase requires a direct connection to the primary writer +# to avoid multiplexing issues, so migrationJob.jdbcUrl overrides the URL for the +# migration Job only while normal Gateway pods continue to use the proxy URL. +# +# Expected behaviour: +# - The Job container has SSG_DATABASE_JDBC_URL set to the override URL. +# - "?createDatabaseIfNotExist=false" is appended to the override URL so a +# misconfigured URL fails fast rather than silently creating a new database. +# - The main database.jdbcUrl is NOT used by the Job container. +apiVersion: security.brcmlabs.com/v1 +kind: Gateway +metadata: + name: ssg-migration-jdbcurl +spec: + version: "11.2.1" + license: + accept: true + secretName: gateway-license + app: + replicas: 1 + image: docker.io/caapim/gateway:11.2.1 + imagePullPolicy: IfNotPresent + management: + cluster: + hostname: gateway.brcmlabs.com + username: admin + password: 7layer + database: + enabled: true + # Proxy URL — used by live Gateway pods, NOT by the migration Job. + jdbcUrl: jdbc:mysql://no-mysql-proxy.local:3306/ssg + migrationJob: + enabled: true + # Direct primary writer URL — used exclusively by the migration Job. + jdbcUrl: jdbc:mysql://no-mysql-primary.local:3306/ssg + service: + enabled: true + type: ClusterIP + ports: + - name: https + port: 8443 + targetPort: 8443 + - name: management + port: 9443 + targetPort: 9443 diff --git a/tests/e2e/gateway-migration-job/00-assert.yaml b/tests/e2e/gateway-migration-job/00-assert.yaml new file mode 100644 index 00000000..42dedfd3 --- /dev/null +++ b/tests/e2e/gateway-migration-job/00-assert.yaml @@ -0,0 +1,50 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +# 60 s is enough for the operator to reconcile and create the Job. +# The Job pod itself will fail (no real MySQL), but the Job object is +# created synchronously by the operator — it does not depend on pod success. +timeout: 60 +--- +# Assert: a Job named "-db-migration" is created with the correct spec. +# +# Key fields verified: +# backoffLimit: 1 — operator allows exactly one pod retry before +# declaring terminal failure (requiring manual intervention) +# container.name: db-migration — canonical container name from NewMigrationJob +# container.image — same image as spec.app.image (Liquibase is bundled +# in the Gateway image itself) +# container.envFrom — ConfigMap and Secret both mounted via envFrom so the +# container inherits all Gateway env vars including +# SSG_DATABASE_USER and SSG_DATABASE_PASSWORD +# container.env[EXTRA_JAVA_ARGS] — liquibase-only mode (clearLocks=false) +# runs schema updates without releasing the lock first +# activeDeadlineSeconds: 300 — default deadline; Job pod is killed if it runs +# longer than this to prevent hung migrations +apiVersion: batch/v1 +kind: Job +metadata: + name: ssg-migration-job-db-migration + labels: + app: ssg-migration-job-db-migration +spec: + backoffLimit: 1 + template: + spec: + restartPolicy: Never + serviceAccountName: default + activeDeadlineSeconds: 300 + containers: + - name: db-migration + image: docker.io/caapim/gateway:11.2.1 + envFrom: + - configMapRef: + name: ssg-migration-job + - secretRef: + name: ssg-migration-job + env: + - name: EXTRA_JAVA_ARGS + value: "-Dgateway.db.schema-update.mode=liquibase-only" +# Note: Status.MigrationStatus.SpecHash (non-empty) is asserted in 01-assert.yaml +# via a command script because KUTTL YAML assertions cannot express "field is not +# empty". complete: false is the Go zero value and would match even if +# setMigrationStatus was never called, so it is not a useful assertion here. diff --git a/tests/e2e/gateway-migration-job/00-install.yaml b/tests/e2e/gateway-migration-job/00-install.yaml new file mode 100644 index 00000000..af4e2bdf --- /dev/null +++ b/tests/e2e/gateway-migration-job/00-install.yaml @@ -0,0 +1,45 @@ +# Test: migration enabled — operator creates the migration Job with the correct spec +# +# Scenario: migrationJob.enabled=true with no jdbcUrl override and clearLocks=false +# (both are the common defaults). The operator must create a Job whose container: +# - uses the Gateway image +# - has EXTRA_JAVA_ARGS set to liquibase-only mode +# - reads database credentials from the operator-managed Secret via envFrom +# +# The jdbcUrl points to a non-existent host intentionally. This test only asserts +# the Job object spec; it does not require a real MySQL instance. The Job pod will +# fail to connect but the operator still creates the Job object correctly. +apiVersion: security.brcmlabs.com/v1 +kind: Gateway +metadata: + name: ssg-migration-job +spec: + version: "11.2.1" + license: + accept: true + secretName: gateway-license + app: + replicas: 1 + image: docker.io/caapim/gateway:11.2.1 + imagePullPolicy: IfNotPresent + management: + cluster: + hostname: gateway.brcmlabs.com + username: admin + password: 7layer + database: + enabled: true + jdbcUrl: jdbc:mysql://no-mysql.local:3306/ssg + migrationJob: + enabled: true # ← migration on + clearLocks: false # default: liquibase-only mode + service: + enabled: true + type: ClusterIP + ports: + - name: https + port: 8443 + targetPort: 8443 + - name: management + port: 9443 + targetPort: 9443 diff --git a/tests/e2e/gateway-migration-job/01-assert.yaml b/tests/e2e/gateway-migration-job/01-assert.yaml new file mode 100644 index 00000000..edf1c34e --- /dev/null +++ b/tests/e2e/gateway-migration-job/01-assert.yaml @@ -0,0 +1,45 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +# Step 01 runs after step 00 confirmed the Job exists with the correct spec. +# At that point the operator has already executed setMigrationStatus in the +# same reconcile that created the Job, so the status write should already be +# visible. The 60s timeout matches step 00 as a safety margin for slow clusters. +timeout: 60 +commands: + - script: | + # Assert: Status.MigrationStatus.SpecHash is non-empty. + # + # YAML subset matching in 00-assert.yaml can only check that complete=false. + # It cannot express "specHash is not empty" because KUTTL has no "not empty" + # operator in YAML assertions. This command fills that gap. + # + # A non-empty specHash proves that setMigrationStatus was called on first + # enable, which is what allows all future 12-hour reconciles to use the fast + # path (skipping migration management when the spec has not changed). + HASH=$(kubectl get gateway ssg-migration-job \ + -n "$NAMESPACE" \ + -o jsonpath='{.status.migrationStatus.specHash}') + + if [ -z "$HASH" ]; then + echo "FAIL: Status.MigrationStatus.SpecHash is empty — setMigrationStatus was not called" + exit 1 + fi + + echo "PASS: Status.MigrationStatus.SpecHash is set to: $HASH" + - script: | + # Assert: the Deployment must NOT exist while the migration Job has not + # succeeded. This is the core guarantee of the whole migration-job + # feature — pkg/gateway/reconcile/deployment.go's Deployment() step + # gates on migrationComplete(), which is false here because the Job + # (pointed at the fake DB host) can only be pending or failed, never + # Succeeded. Neither state is exercised by the Job-spec assertion in + # 00-assert.yaml, so it would not catch a regression in that gate. + # + # If this ever fails, the gate in deployment.go has regressed and + # Gateway pods could roll out against an unmigrated database schema. + if kubectl get deployment ssg-migration-job -n "$NAMESPACE" >/dev/null 2>&1; then + echo "FAIL: Deployment must not exist while migration is pending/failed" + exit 1 + fi + + echo "PASS: Deployment correctly does not exist while migration is unresolved" From d186edeeb8d5b749ee48bc516aca10c1862b7d6c Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Fri, 3 Jul 2026 08:32:18 -0700 Subject: [PATCH 11/12] Migration job: run-to-completion reconcile, event-driven waits, pod spec parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reconcile loop no longer short-circuits while the migration job is pending — it runs every op to completion each pass and only stops on a genuine error. The Deployment step alone gates on Gateway.Status.MigrationStatus. - GatewayMigrationJob returns nil for all "not done yet" states instead of a synthetic ErrMigrationPending; progress is driven by the Job's own watch events (create/update/delete) rather than a fixed poll interval. - Fixed an edge case where a spec change (e.g. image upgrade) with no existing Job to delete could stall migration Job recreation for up to 12 hours — it now creates the replacement Job in the same reconcile pass when there's nothing to wait on. - Failure log message no longer embeds a kubectl command. - The migration Job's pod/container spec now inherits the same settings as the main Gateway Deployment (security contexts, resources, node selector, affinity, tolerations, topology spread constraints, pod annotations/labels) instead of a bare-minimum hand-built spec, and uses the same app.kubernetes.io/* labeling convention as other operator-managed resources. - Added a regression test for the "spec changed, no existing Job" case; existing migration job tests updated for the new nil-return contract. --- internal/controller/gateway/controller.go | 19 --- pkg/gateway/deployment.go | 11 +- pkg/gateway/migration_job.go | 47 ++++++-- pkg/gateway/podspec.go | 53 ++++++++ pkg/gateway/reconcile/migration_job.go | 65 ++++++---- pkg/gateway/reconcile/migration_job_test.go | 126 ++++++++++++++------ 6 files changed, 223 insertions(+), 98 deletions(-) create mode 100644 pkg/gateway/podspec.go diff --git a/internal/controller/gateway/controller.go b/internal/controller/gateway/controller.go index 928d0a59..76d23a9d 100644 --- a/internal/controller/gateway/controller.go +++ b/internal/controller/gateway/controller.go @@ -30,7 +30,6 @@ package gateway import ( "context" "encoding/json" - "errors" "fmt" "reflect" "strings" @@ -133,32 +132,14 @@ func (r *GatewayReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } start := time.Now() - migrationPending := false for _, op := range ops { if err = r.runOp(ctx, params, op); err != nil { - // ErrMigrationPending is a normal "not ready yet" state, not a failure. - // The Deployment op gates itself on Status.MigrationStatus, so the rest - // of the ops (ConfigMaps, Secrets, status, etc.) keep reconciling - // normally instead of being blocked behind the migration job. - if errors.Is(err, reconcile.ErrMigrationPending) { - migrationPending = true - continue - } _ = captureMetrics(ctx, params, start, true, op.Name) return ctrl.Result{}, err } } _ = captureMetrics(ctx, params, start, false, "") - - // Poll at a fixed short interval while migration is pending, rather than - // applying exponential backoff or waiting for the normal 12-hour cadence. - // This is a safety net alongside the Owns(&batchv1.Job{}) watch, which - // already triggers a reconcile as soon as the Job's status changes. - if migrationPending { - return ctrl.Result{RequeueAfter: 10 * time.Second}, nil - } - return ctrl.Result{RequeueAfter: 12 * time.Hour}, nil } diff --git a/pkg/gateway/deployment.go b/pkg/gateway/deployment.go index 67a4ceda..76ea3f2a 100644 --- a/pkg/gateway/deployment.go +++ b/pkg/gateway/deployment.go @@ -938,16 +938,7 @@ func NewDeployment(gw *securityv1.Gateway, platform string) *appsv1.Deployment { imagePullPolicy = gw.Spec.App.ImagePullPolicy } - gatewayContainerSecurityContext := corev1.SecurityContext{} - podSecurityContext := corev1.PodSecurityContext{} - - if gw.Spec.App.ContainerSecurityContext != (corev1.SecurityContext{}) { - gatewayContainerSecurityContext = gw.Spec.App.ContainerSecurityContext - } - - if !reflect.DeepEqual(gw.Spec.App.PodSecurityContext, corev1.PodSecurityContext{}) { - podSecurityContext = gw.Spec.App.PodSecurityContext - } + gatewayContainerSecurityContext, podSecurityContext := gatewaySecurityContexts(gw) gatewaySecretName := gw.Name if gw.Spec.App.Management.DisklessConfig.Disabled { gatewaySecretName = gw.Name + "-node-properties" diff --git a/pkg/gateway/migration_job.go b/pkg/gateway/migration_job.go index 80d42b51..a78b396b 100644 --- a/pkg/gateway/migration_job.go +++ b/pkg/gateway/migration_job.go @@ -30,6 +30,7 @@ import ( "strings" securityv1 "github.com/caapim/layer7-operator/api/v1" + "github.com/caapim/layer7-operator/pkg/util" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -138,6 +139,11 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { }) } + // Reuse the same SecurityContext derivation as the main Gateway Deployment so + // the migration pod is subject to the same PodSecurityContext/ContainerSecurityContext + // (e.g. runAsNonRoot, seccomp profiles) rather than running unconfined. + containerSecurityContext, podSecurityContext := gatewaySecurityContexts(gw) + container := corev1.Container{ Name: "db-migration", Image: gw.Spec.App.Image, @@ -145,6 +151,11 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { EnvFrom: envFrom, Env: envVars, VolumeMounts: volumeMounts, + SecurityContext: &containerSecurityContext, + Resources: corev1.ResourceRequirements{ + Requests: gw.Spec.App.Resources.Requests, + Limits: gw.Spec.App.Resources.Limits, + }, } var backoffLimit int32 = 1 @@ -168,25 +179,45 @@ func NewMigrationJob(gw *securityv1.Gateway) *batchv1.Job { } jobName := MigrationJobName(gw) + + // Use the same label convention as every other resource the operator creates + // (Deployment, Services, ConfigMaps, ...) so the migration Job/Pod remain + // discoverable and consistent with any customer-authored NetworkPolicy, + // monitoring, or tooling that selects on these labels. + jobLabels := util.DefaultLabels(gw.Name, gw.Spec.App.Labels) + jobLabels["app"] = jobName + + podLabels := util.DefaultLabels(gw.Name, gw.Spec.App.Labels) + podLabels["app"] = jobName + for k, v := range gw.Spec.App.PodLabels { + podLabels[k] = v + } + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, Namespace: gw.Namespace, - Labels: map[string]string{"app": jobName}, + Labels: jobLabels, }, Spec: batchv1.JobSpec{ BackoffLimit: &backoffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"app": jobName}, + Labels: podLabels, + Annotations: gw.Spec.App.PodAnnotations, }, Spec: corev1.PodSpec{ - ActiveDeadlineSeconds: &activeDeadlineSeconds, - ServiceAccountName: serviceAccountName, - RestartPolicy: corev1.RestartPolicyNever, - Containers: []corev1.Container{container}, - ImagePullSecrets: gw.Spec.App.ImagePullSecrets, - Volumes: volumes, + ActiveDeadlineSeconds: &activeDeadlineSeconds, + ServiceAccountName: serviceAccountName, + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{container}, + ImagePullSecrets: gw.Spec.App.ImagePullSecrets, + Volumes: volumes, + SecurityContext: &podSecurityContext, + NodeSelector: gw.Spec.App.NodeSelector, + Affinity: &gw.Spec.App.Affinity, + Tolerations: gw.Spec.App.Tolerations, + TopologySpreadConstraints: gw.Spec.App.TopologySpreadConstraints, }, }, }, diff --git a/pkg/gateway/podspec.go b/pkg/gateway/podspec.go new file mode 100644 index 00000000..5d56e64d --- /dev/null +++ b/pkg/gateway/podspec.go @@ -0,0 +1,53 @@ +/* +* Copyright (c) 2026 Broadcom. All rights reserved. +* The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. +* All trademarks, trade names, service marks, and logos referenced +* herein belong to their respective companies. +* +* This software and all information contained therein is confidential +* and proprietary and shall not be duplicated, used, disclosed or +* disseminated in any way except as authorized by the applicable +* license agreement, without the express written permission of Broadcom. +* All authorized reproductions must be marked with this language. +* +* EXCEPT AS SET FORTH IN THE APPLICABLE LICENSE AGREEMENT, TO THE +* EXTENT PERMITTED BY APPLICABLE LAW OR AS AGREED BY BROADCOM IN ITS +* APPLICABLE LICENSE AGREEMENT, BROADCOM PROVIDES THIS DOCUMENTATION +* "AS IS" WITHOUT WARRANTY OF ANY KIND, INCLUDING WITHOUT LIMITATION, +* ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +* PURPOSE, OR. NONINFRINGEMENT. IN NO EVENT WILL BROADCOM BE LIABLE TO +* THE END USER OR ANY THIRD PARTY FOR ANY LOSS OR DAMAGE, DIRECT OR +* INDIRECT, FROM THE USE OF THIS DOCUMENTATION, INCLUDING WITHOUT LIMITATION, +* LOST PROFITS, LOST INVESTMENT, BUSINESS INTERRUPTION, GOODWILL, OR +* LOST DATA, EVEN IF BROADCOM IS EXPRESSLY ADVISED IN ADVANCE OF THE +* POSSIBILITY OF SUCH LOSS OR DAMAGE. +* +* AI assistance has been used to generate some or all contents of this file. That includes, but is not limited to, new code, modifying existing code, stylistic edits. + */ +package gateway + +import ( + "reflect" + + securityv1 "github.com/caapim/layer7-operator/api/v1" + corev1 "k8s.io/api/core/v1" +) + +// gatewaySecurityContexts returns the effective container and pod SecurityContext +// for Gateway-owned pods (the Deployment and the pre-upgrade migration Job), +// falling back to the zero value when the user hasn't configured one. Shared so +// both call sites stay in sync as new pod-level settings are added. +func gatewaySecurityContexts(gw *securityv1.Gateway) (corev1.SecurityContext, corev1.PodSecurityContext) { + containerSecurityContext := corev1.SecurityContext{} + podSecurityContext := corev1.PodSecurityContext{} + + if gw.Spec.App.ContainerSecurityContext != (corev1.SecurityContext{}) { + containerSecurityContext = gw.Spec.App.ContainerSecurityContext + } + + if !reflect.DeepEqual(gw.Spec.App.PodSecurityContext, corev1.PodSecurityContext{}) { + podSecurityContext = gw.Spec.App.PodSecurityContext + } + + return containerSecurityContext, podSecurityContext +} diff --git a/pkg/gateway/reconcile/migration_job.go b/pkg/gateway/reconcile/migration_job.go index bbceca30..6a38aa7f 100644 --- a/pkg/gateway/reconcile/migration_job.go +++ b/pkg/gateway/reconcile/migration_job.go @@ -29,7 +29,6 @@ package reconcile import ( "context" "crypto/sha256" - "errors" "fmt" "strconv" @@ -43,13 +42,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -// ErrMigrationPending is returned by GatewayMigrationJob when the migration job -// exists and has not yet completed (created, replaced, or still active). The -// controller treats this as a normal "not ready yet" state: it requeues with a -// fixed short interval rather than applying exponential backoff or incrementing -// failure metrics. -var ErrMigrationPending = errors.New("migration pending") - // GatewayMigrationJob manages the lifecycle of the pre-upgrade database migration // job. It uses Gateway.Status.MigrationStatus as the authoritative source of truth // for whether the migration has completed, so that: @@ -59,7 +51,11 @@ var ErrMigrationPending = errors.New("migration pending") // - A spec change (image upgrade, jdbcUrl correction, clearLocks toggle, etc.) // resets the status and triggers a fresh migration run automatically. // -// The Deployment step is only unblocked when Status.MigrationStatus.Complete is true. +// The function always returns nil unless it encounters a real error. Waiting states +// (Job created, Job active, Job being replaced) return nil and rely on the +// controller's Owns(&batchv1.Job{}) watch to re-trigger reconciliation as soon as +// the Job's status changes. The Deployment step is only unblocked when +// Status.MigrationStatus.Complete is true. func GatewayMigrationJob(ctx context.Context, params Params) error { migrationEnabled := params.Instance.Spec.App.Management.Database.MigrationJob.Enabled databaseEnabled := params.Instance.Spec.App.Management.Database.Enabled @@ -101,8 +97,8 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { // round-trip and removes a swallowed-error opportunity. // // Background propagation (Fix 2): removes the Job object from etcd immediately - // so the next reconcile's Get cannot return the old (stale) object while it - // is mid-teardown, closing the foreground-deletion timing race. + // so a subsequent Get cannot return the old (stale) object while it is + // mid-teardown, closing the foreground-deletion timing race. if currentStatus.SpecHash != "" && currentStatus.SpecHash != desiredHash { params.Log.Info("migration spec changed, replacing job", "oldHash", currentStatus.SpecHash, @@ -112,13 +108,29 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { Namespace: params.Instance.Namespace, }} propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationBackground) - if delErr := params.Client.Delete(ctx, staleJob, propagationPolicy); client.IgnoreNotFound(delErr) != nil { + delErr := params.Client.Delete(ctx, staleJob, propagationPolicy) + if delErr != nil && !k8serrors.IsNotFound(delErr) { return fmt.Errorf("failed deleting stale migration job: %w", delErr) } if err := setMigrationStatus(ctx, params, securityv1.MigrationStatus{SpecHash: desiredHash}); err != nil { return err } - return fmt.Errorf("%w: migration spec changed, requeueing to create new job", ErrMigrationPending) + // Keep the local copy in sync so the first-enable status write further + // down (which checks for an empty SpecHash) doesn't run redundantly. + currentStatus.SpecHash = desiredHash + + if delErr == nil { + // A Job existed and was just deleted. Rely on the deletion event (Owns + // watch) to retrigger reconciliation rather than reading it back here — + // an immediate Get could still return the stale cached object before + // the informer observes the delete. + return nil + } + // No Job existed to delete (e.g. it was manually removed, or garbage + // collected, after a previous migration completed). There is no deletion + // event to wait for in that case, so fall through and create the + // replacement Job in this same pass instead of stalling until the next + // spec change or the 12-hour periodic reconcile. } // Build the desired Job now — needed both for creation and for the live @@ -145,7 +157,9 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { return fmt.Errorf("failed creating migration job: %w", err) } params.Log.Info("created migration job", "name", desiredJob.Name, "namespace", desiredJob.Namespace) - return fmt.Errorf("%w: migration job started, waiting for completion", ErrMigrationPending) + // Return nil; the Job's status changes will re-trigger reconciliation via + // the Owns(&batchv1.Job{}) watch. + return nil } else if err != nil { return err } @@ -154,8 +168,9 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { // deletion — its Succeeded/Failed counts still reflect the previous run. // Background deletion removes the object immediately, but a webhook delay or // brief informer lag can still deliver a stale cached object for a short window. + // Return nil; the Job's full deletion event will re-trigger reconciliation. if currentJob.DeletionTimestamp != nil { - return fmt.Errorf("%w: migration job is being deleted, waiting", ErrMigrationPending) + return nil } // Sanity-check the live Job against the desired spec before reading @@ -167,15 +182,15 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { // stale spec that somehow escaped hash-based detection (e.g., a field not yet // covered by migrationSpecHash); replacing it is the safe fallback. // - // In both cases: delete with Background propagation and return ErrMigrationPending - // so the next reconcile creates a fresh Job. + // In both cases: delete with Background propagation and return nil so the + // Job deletion event re-triggers reconciliation to create a fresh Job. if len(currentJob.Spec.Template.Spec.Containers) == 0 || currentJob.Spec.Template.Spec.Containers[0].Image != desiredJob.Spec.Template.Spec.Containers[0].Image { params.Log.Info("replacing malformed or stale migration job", "name", currentJob.Name, "namespace", currentJob.Namespace) propagationPolicy := client.PropagationPolicy(metav1.DeletePropagationBackground) _ = client.IgnoreNotFound(params.Client.Delete(ctx, currentJob, propagationPolicy)) - return fmt.Errorf("%w: replacing malformed or stale migration job", ErrMigrationPending) + return nil } // Job succeeded: persist completion in status so future reconciles skip migration. @@ -198,17 +213,17 @@ func GatewayMigrationJob(ctx context.Context, params Params) error { // pod-2 (the retry) is running. Declaring failure then would block a healthy // retry and require unnecessary manual intervention. if currentJob.Status.Active == 0 && currentJob.Status.Failed > 0 { - params.Log.Info("migration job failed — Gateway deployment is blocked. Investigate logs then delete the job to retry", + params.Log.Info("migration job failed — delete the job to retry after resolving the root cause", "name", currentJob.Name, - "namespace", currentJob.Namespace, - "fix", "kubectl delete job "+currentJob.Name+" -n "+currentJob.Namespace) + "namespace", currentJob.Namespace) return fmt.Errorf("migration job %s/%s failed — Gateway deployment blocked pending manual intervention", currentJob.Namespace, currentJob.Name) } - // Job is still running (or briefly between pod attempts). Return ErrMigrationPending - // so the controller requeues at a fixed interval without incrementing failure metrics. - params.Log.Info("migration job is active, waiting...") - return fmt.Errorf("%w: migration job is active", ErrMigrationPending) + // Job is still running (or briefly between pod attempts). Return nil so the + // reconcile loop completes normally; the Job completion event will re-trigger. + params.Log.V(2).Info("migration job is active, waiting for completion", + "name", currentJob.Name, "namespace", currentJob.Namespace) + return nil } // migrationComplete reports whether the pre-upgrade migration job (if enabled) diff --git a/pkg/gateway/reconcile/migration_job_test.go b/pkg/gateway/reconcile/migration_job_test.go index 8f0e4fde..cbf67da0 100644 --- a/pkg/gateway/reconcile/migration_job_test.go +++ b/pkg/gateway/reconcile/migration_job_test.go @@ -28,7 +28,6 @@ package reconcile import ( "context" - "errors" "testing" securityv1 "github.com/caapim/layer7-operator/api/v1" @@ -329,13 +328,12 @@ func TestGatewayMigrationJob(t *testing.T) { // ── first enable ───────────────────────────────────────────────────────── - t.Run("first enable: Job created, SpecHash written, ErrMigrationPending returned", func(t *testing.T) { + t.Run("first enable: Job created, SpecHash written, returns nil", func(t *testing.T) { // Input: Gateway with migration freshly enabled; empty Status.MigrationStatus; // no existing Job in the cluster. // Output: - // - GatewayMigrationJob returns an error wrapping ErrMigrationPending. - // (ErrMigrationPending is a sentinel: "not done yet, check back in 10s". - // It is not a real failure — the controller requeues without backoff.) + // - GatewayMigrationJob returns nil (reconcile completes; the Owns watch + // re-triggers reconciliation when the Job's status changes). // - A Job named "-db-migration" exists in the cluster. // - Status.MigrationStatus.SpecHash is non-empty (hash persisted to etcd). // - Status.MigrationStatus.Complete is false (migration not yet finished). @@ -344,8 +342,8 @@ func TestGatewayMigrationJob(t *testing.T) { err := GatewayMigrationJob(ctx, p) - if !errors.Is(err, ErrMigrationPending) { - t.Fatalf("expected ErrMigrationPending on first enable, got: %v", err) + if err != nil { + t.Fatalf("expected nil on first enable, got: %v", err) } // Verify the Job was created. @@ -402,10 +400,11 @@ func TestGatewayMigrationJob(t *testing.T) { // ── job lifecycle: running ──────────────────────────────────────────────── - t.Run("active job: returns ErrMigrationPending without modifying the Job", func(t *testing.T) { + t.Run("active job: returns nil without modifying the Job", func(t *testing.T) { // Input: Gateway with an existing Job that reports Active=1 (pod is running). // Status.MigrationStatus.SpecHash matches the current desired hash. - // Output: GatewayMigrationJob returns ErrMigrationPending. + // Output: GatewayMigrationJob returns nil (reconcile completes normally). + // The Owns watch re-triggers when the Job finishes. // The existing Job is neither deleted nor recreated. p := migrationParams("mj-active-job") createGateway(t, p) @@ -423,8 +422,8 @@ func TestGatewayMigrationJob(t *testing.T) { err := GatewayMigrationJob(ctx, p) - if !errors.Is(err, ErrMigrationPending) { - t.Fatalf("expected ErrMigrationPending while job is active, got: %v", err) + if err != nil { + t.Fatalf("expected nil while job is active, got: %v", err) } // Job must still exist and be unmodified. @@ -449,9 +448,10 @@ func TestGatewayMigrationJob(t *testing.T) { p := migrationParams("mj-succeeded") createGateway(t, p) - // First call: creates the Job and writes SpecHash to status. - if firstErr := GatewayMigrationJob(ctx, p); !errors.Is(firstErr, ErrMigrationPending) { - t.Fatalf("first call should return ErrMigrationPending, got: %v", firstErr) + // First call: creates the Job and writes SpecHash to status. Returns nil + // so the reconcile completes; the Owns watch fires on Job status changes. + if firstErr := GatewayMigrationJob(ctx, p); firstErr != nil { + t.Fatalf("first call should return nil, got: %v", firstErr) } // Retrieve the created Job and simulate success by patching its status. @@ -480,11 +480,11 @@ func TestGatewayMigrationJob(t *testing.T) { // ── job lifecycle: failed ───────────────────────────────────────────────── - t.Run("failed job: returns a blocking error (not ErrMigrationPending)", func(t *testing.T) { + t.Run("failed job: returns a blocking error", func(t *testing.T) { // Input: Gateway whose existing Job reports Active=0, Failed=1 // (both pod attempts exhausted; backoffLimit=1 reached). - // Output: GatewayMigrationJob returns a non-nil error that does NOT wrap - // ErrMigrationPending. The controller applies backoff and metrics. + // Output: GatewayMigrationJob returns a non-nil error. The controller + // applies exponential backoff and records failure metrics. // The Job is NOT deleted — the user must fix the root cause // (restore DB to pre-upgrade state if partially migrated, fix // credentials, etc.) and then delete the Job manually to retry. @@ -501,13 +501,9 @@ func TestGatewayMigrationJob(t *testing.T) { err := GatewayMigrationJob(ctx, p) - // Must be a real error (not the "pending" sentinel). if err == nil { t.Fatal("expected a blocking error for failed job, got nil") } - if errors.Is(err, ErrMigrationPending) { - t.Fatalf("expected a real blocking error, not ErrMigrationPending — got: %v", err) - } // Job must still exist (user must delete it manually to retry). got := &batchv1.Job{} @@ -520,14 +516,14 @@ func TestGatewayMigrationJob(t *testing.T) { // ── spec change ─────────────────────────────────────────────────────────── - t.Run("spec changed: old Job deleted, status reset to new hash, ErrMigrationPending returned", func(t *testing.T) { + t.Run("spec changed: old Job deleted, status reset to new hash, returns nil", func(t *testing.T) { // Input: Gateway previously migrated with image v1 (Complete=true, SpecHash=H1). // User changes spec.app.image to v2, making desiredHash=H2 != H1. // Output: // - The old Job is deleted from the cluster. // - Status.MigrationStatus is reset to {SpecHash: H2, Complete: false}. - // - GatewayMigrationJob returns ErrMigrationPending so the controller - // requeues in 10s, at which point it will create the new Job. + // - GatewayMigrationJob returns nil; the Job deletion event (Owns watch) + // re-triggers reconciliation, which then creates the new Job. // Why the delete-before-status-write ordering matters: if the status were // written first and the deletion then failed, the new hash would already be // persisted, making this branch unreachable on the next reconcile — a silent, @@ -562,8 +558,8 @@ func TestGatewayMigrationJob(t *testing.T) { err := GatewayMigrationJob(ctx, p) - if !errors.Is(err, ErrMigrationPending) { - t.Fatalf("expected ErrMigrationPending after spec change, got: %v", err) + if err != nil { + t.Fatalf("expected nil after spec change, got: %v", err) } // Old Job must be gone (Background deletion removes it from etcd immediately). @@ -585,6 +581,64 @@ func TestGatewayMigrationJob(t *testing.T) { } }) + t.Run("spec changed with no existing Job: replacement Job is created in the same call", func(t *testing.T) { + // Input: Gateway previously migrated with image v1 (Complete=true, SpecHash=H1), + // but the v1 Job is no longer in the cluster (e.g. manually deleted or + // garbage collected after completion). User changes spec.app.image to v2. + // Output: + // - There is no old Job to delete, so no deletion event exists to retrigger + // reconciliation. GatewayMigrationJob must fall through and create the + // new Job itself in this same call instead of waiting up to 12h for the + // next reconcile. + // - Status.MigrationStatus is reset to {SpecHash: H2, Complete: false}. + p := migrationParams("mj-spec-change-no-job") + p.Instance.Spec.App.Image = "gateway:11.1.1" + createGateway(t, p) + + hashV1 := migrationSpecHash(p.Instance) + + // Seed status as if migration with image v1 already completed, but do NOT + // create a Job — it represents one that was already cleaned up. + seedMigrationStatus(t, p, securityv1.MigrationStatus{ + SpecHash: hashV1, + Complete: true, + }) + + p.Instance.Spec.App.Image = "gateway:11.3.0" + if err := k8sClient.Update(ctx, p.Instance); err != nil { + t.Fatalf("failed to persist image upgrade: %v", err) + } + + err := GatewayMigrationJob(ctx, p) + + if err != nil { + t.Fatalf("expected nil after spec change with no existing job, got: %v", err) + } + + // The replacement Job for the new spec must exist — created in this same + // call, since there was no deletion event to wait for. + newJob := &batchv1.Job{} + getErr := k8sClient.Get(ctx, types.NamespacedName{ + Name: gateway.MigrationJobName(p.Instance), + Namespace: p.Instance.Namespace, + }, newJob) + if getErr != nil { + t.Fatalf("expected replacement Job to be created, got error: %v", getErr) + } + if got := newJob.Spec.Template.Spec.Containers[0].Image; got != "gateway:11.3.0" { + t.Errorf("replacement Job image: got %q, want %q", got, "gateway:11.3.0") + } + t.Cleanup(func() { _ = k8sClient.Delete(context.Background(), newJob) }) + + hashV2 := migrationSpecHash(p.Instance) + if p.Instance.Status.MigrationStatus.SpecHash != hashV2 { + t.Errorf("SpecHash: got %q, want %q", p.Instance.Status.MigrationStatus.SpecHash, hashV2) + } + if p.Instance.Status.MigrationStatus.Complete { + t.Error("Complete must be false after spec change — migration not yet run for new spec") + } + }) + // ── disabled cleanup ────────────────────────────────────────────────────── t.Run("disabled cleanup: orphaned Job is deleted when migration toggled off", func(t *testing.T) { @@ -625,13 +679,13 @@ func TestGatewayMigrationJob(t *testing.T) { // ── stale / malformed job guard ─────────────────────────────────────────── - t.Run("stale image job: Job with wrong image is replaced, ErrMigrationPending returned", func(t *testing.T) { + t.Run("stale image job: Job with wrong image is replaced, returns nil", func(t *testing.T) { // Input: An existing Job was built with image "old-image:1.0" but the // current Gateway spec still carries the same spec hash (the image // change somehow escaped hash detection). The live Job's image does // not match the image in the freshly-built desired Job. - // Output: The stale Job is deleted and ErrMigrationPending is returned so - // the next reconcile creates a valid replacement. + // Output: The stale Job is deleted and nil is returned; the Job deletion + // event (Owns watch) re-triggers reconciliation to create the replacement. // This guard also covers the zero-container case (a Job whose containers // were cleared by a manual edit or a mutating webhook after creation — // something the API server admission webhook would not catch post-creation). @@ -653,8 +707,8 @@ func TestGatewayMigrationJob(t *testing.T) { err := GatewayMigrationJob(ctx, p) - if !errors.Is(err, ErrMigrationPending) { - t.Fatalf("expected ErrMigrationPending when replacing stale-image job, got: %v", err) + if err != nil { + t.Fatalf("expected nil when replacing stale-image job, got: %v", err) } got := &batchv1.Job{} getErr := k8sClient.Get(ctx, types.NamespacedName{ @@ -672,7 +726,7 @@ func TestGatewayMigrationJob(t *testing.T) { // delete request but the object is still visible because a finalizer // is preventing immediate removal). The Job's status still shows // Succeeded=1 from the previous run — stale data. - // Output: GatewayMigrationJob returns ErrMigrationPending and does NOT write + // Output: GatewayMigrationJob returns nil and does NOT write // Complete=true to Status.MigrationStatus. // Why: Background deletion removes the Job from etcd immediately, but a // brief informer-cache lag or webhook delay can deliver a stale cached @@ -718,10 +772,10 @@ func TestGatewayMigrationJob(t *testing.T) { // ── Assert: the DeletionTimestamp guard fires ───────────────────────────── // // The guard is the check inside GatewayMigrationJob that inspects - // currentJob.DeletionTimestamp after the Get. It must return - // ErrMigrationPending instead of falling through to the Succeeded check. - if !errors.Is(err, ErrMigrationPending) { - t.Fatalf("DeletionTimestamp guard must return ErrMigrationPending, got: %v", err) + // currentJob.DeletionTimestamp after the Get. It must return nil (not + // an error) and must not fall through to the Succeeded check. + if err != nil { + t.Fatalf("DeletionTimestamp guard must return nil, got: %v", err) } // The guard must also prevent setMigrationStatus from being called with From 6059e79857728a4f7c66e81bc24933f8bfcb4c09 Mon Sep 17 00:00:00 2001 From: jchanbcbc Date: Fri, 3 Jul 2026 18:51:50 -0700 Subject: [PATCH 12/12] Fix to accept forward slash in the branch name. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b45adc76..f5ab896d 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,7 +34,7 @@ pipeline { if [[ "${BRANCH_NAME}" = "develop" ]]; then export IMAGE_TAG="latest" else - export RELEASE_VERSION="${BRANCH_NAME}" + export RELEASE_VERSION="${BRANCH_NAME//\//-}" fi fi