From 2721bba42b669a5c938973d4fc771d15db5e0db0 Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Fri, 29 May 2026 17:19:13 +0100 Subject: [PATCH] Fix Progressing=True being immediately overwritten by Available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sync() returned nil on success, so Reconcile always called setStatusAvailable immediately after setStatusProgressing. Change sync to return (bool, error) so Reconcile can skip setStatusAvailable when resources were just updated. Also fix applyResources overwriting the updated flag each iteration — only the last resource's status was returned. --- pkg/controllers/clusteroperator_controller.go | 29 +++++---- .../clusteroperator_controller_test.go | 65 +++++++++++++++++++ 2 files changed, 83 insertions(+), 11 deletions(-) diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index a5fefa1f3..94f7f1950 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -121,11 +121,16 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to build operator config: %w", err)) } - if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { + progressing, err := r.sync(ctx, operatorConfig, conditionOverrides) + if err != nil { klog.Errorf("Unable to sync operands: %s", err) return ctrl.Result{}, fmt.Errorf("failed to sync operands: %w", err) } + if progressing { + return ctrl.Result{}, nil + } + if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err) @@ -143,33 +148,35 @@ func (r *CloudOperatorReconciler) clearFailureWindow() { r.failures.clear() } -func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { - // Deploy resources for platform +func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) (bool, error) { resources, err := cloud.GetResources(config) if err != nil { - return err + return false, err } updated, err := r.applyResources(ctx, resources) if err != nil { - return err + return false, err } if updated { - return r.setStatusProgressing(ctx, conditionOverrides) + if err := r.setStatusProgressing(ctx, conditionOverrides); err != nil { + return false, err + } + return true, nil } - return nil + return false, nil } // applyResources will apply all resources as-is to the cluster, allowing adding of custom annotations and lables func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources []client.Object) (bool, error) { - updated := false - var err error + anyUpdated := false for _, resource := range resources { - updated, err = resourceapply.ApplyResource(ctx, r.Client, r.Recorder, resource) + updated, err := resourceapply.ApplyResource(ctx, r.Client, r.Recorder, resource) if err != nil { return false, err } + anyUpdated = anyUpdated || updated if err := r.watcher.Watch(ctx, resource); err != nil { klog.Errorf("Unable to establish watch on object %s '%s': %+v", resource.GetObjectKind().GroupVersionKind(), resource.GetName(), err) @@ -182,7 +189,7 @@ func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources klog.V(2).Info("Resources applied successfully.") } - return updated, nil + return anyUpdated, nil } // SetupWithManager sets up the controller with the Manager. diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index ae93f1513..c47d8c458 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -605,6 +605,71 @@ var _ = Describe("Apply resources should", func() { Expect(dep.Labels[common.CloudControllerManagerProviderLabel]).To(Equal("AWS")) }) + It("reports updated=true when only a non-final resource changed", func() { + operatorConfig := getConfigForPlatform(&configv1.PlatformStatus{Type: configv1.AWSPlatformType}) + awsResources, err := cloud.GetResources(operatorConfig) + Expect(err).To(Succeed()) + Expect(len(awsResources)).To(BeNumerically(">=", 2)) + + resources = append(resources, awsResources...) + + updated, err := reconciler.applyResources(context.TODO(), resources) + Expect(err).ShouldNot(HaveOccurred()) + Expect(updated).To(BeTrue()) + + // Drain recorder events from initial creation + for len(recorder.Events) > 0 { + <-recorder.Events + } + + // Modify only the deployment so it gets updated, while later resources remain unchanged + for i, res := range resources { + if dep, ok := res.(*appsv1.Deployment); ok { + dep.Spec.Replicas = ptr.To[int32](99) + resources[i] = dep + break + } + } + + updated, err = reconciler.applyResources(context.TODO(), resources) + Expect(err).ShouldNot(HaveOccurred()) + Expect(updated).To(BeTrue(), "should report updated even when only a non-final resource changed") + }) + + It("should set Progressing=True on first sync and not signal progressing when resources are stable", func() { + reconciler.ManagedNamespace = DefaultManagedNamespace + + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(context.TODO(), co)).To(Succeed()) + + operatorConfig := getConfigForPlatform(&configv1.PlatformStatus{Type: configv1.AWSPlatformType}) + awsResources, err := cloud.GetResources(operatorConfig) + Expect(err).To(Succeed()) + resources = append(resources, awsResources...) + + // First sync: resources do not yet exist, so applyResources reports updated=true. + progressing, err := reconciler.sync(context.TODO(), operatorConfig, nil) + Expect(err).To(Succeed()) + Expect(progressing).To(BeTrue(), "sync should report progressing when resources are newly applied") + + Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + Equal(configv1.ConditionTrue), "Progressing should be True after resources are first applied", + ) + + // Second sync: resources exist and are unchanged, so applyResources reports updated=false. + progressing, err = reconciler.sync(context.TODO(), operatorConfig, nil) + Expect(err).To(Succeed()) + Expect(progressing).To(BeFalse(), "sync should not report progressing when resources are already up to date") + + // Progressing remains True because sync() does not clear it; only setStatusAvailable() does. + Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + Equal(configv1.ConditionTrue), "Progressing should remain True until setStatusAvailable is called", + ) + }) + AfterEach(func() { co := &configv1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)